In [None]:
# Install packages required for plotly rendering in notebooks
!pip install -q --upgrade jupyter "ipywidgets>=7.6.0" "nbformat>=4.2.0"

In [None]:
from pathlib import Path
import pandas as pd

# Location of the training split, relative to the notebook's working directory.
p = Path("datasets").joinpath("train.parquet")

# Read the whole parquet file into a DataFrame shared by the later cells.
df = pd.read_parquet(path=p)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
import plotly.express as px

# Pick one sequence by id and order its rows by time step, so each line trace
# is drawn in temporal order (the later analysis cells sort the same way).
seq_id_to_plot = 0
df_seq = df[df['seq_ix'] == seq_id_to_plot].sort_values('step_in_seq')

# Feature columns are named by their index as strings: '0' .. '31'
feature_cols = [f'{i}' for i in range(32)]

# Generate a separate interactive line chart for each feature
for feature in feature_cols:
    fig = px.line(df_seq, x='step_in_seq', y=feature, title=f'{feature} for Sequence {seq_id_to_plot}')
    fig.show()


In [None]:
!pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Sequence whose feature-to-feature correlations are inspected
seq_id_to_plot = 1
feature_cols = [str(i) for i in range(32)]
df_seq = df.loc[df['seq_ix'] == seq_id_to_plot]

# Pairwise correlation between all feature columns of that sequence
corr_matrix = df_seq.loc[:, feature_cols].corr()

# Draw the matrix as a heatmap on an explicitly created axes
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    xticklabels=feature_cols,
    yticklabels=feature_cols,
    ax=ax_corr,
)
ax_corr.set_title(f'Correlation Matrix for Features of Sequence {seq_id_to_plot}')
plt.show()


### Moving averages (SMA and EMA) per feature

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Select a sequence to plot. Rows are sorted by time step because the rolling
# (SMA) and exponentially weighted (EMA) means below depend on row order.
seq_id_to_plot = 0
feature_cols = [f'{i}' for i in range(32)]
df_seq = df[df['seq_ix'] == seq_id_to_plot].sort_values('step_in_seq')
steps = df_seq['step_in_seq']

window = 10  # window size for the simple moving average (SMA)
alpha = 0.2  # smoothing factor for the exponential moving average (EMA)

# Create subplot grid: 8 rows x 4 columns, one panel per feature
fig = make_subplots(rows=8, cols=4, subplot_titles=[f'Feature {f}' for f in feature_cols], shared_xaxes=True)

for idx, feature in enumerate(feature_cols):
    values = df_seq[feature].values
    series = pd.Series(values)
    # min_periods=1 keeps the first (window - 1) points instead of emitting NaN
    sma = series.rolling(window=window, min_periods=1).mean().values
    ema = series.ewm(alpha=alpha, adjust=False).mean().values
    row = idx // 4 + 1
    col = idx % 4 + 1
    # Legend entries are emitted only for the first panel to avoid 32 duplicates
    fig.add_trace(go.Scatter(x=steps, y=values, mode='lines', name='Value', line=dict(color='gray', width=1), opacity=0.6, showlegend=(idx==0)), row=row, col=col)
    fig.add_trace(go.Scatter(x=steps, y=sma, mode='lines', name='SMA', line=dict(color='blue', width=2), showlegend=(idx==0)), row=row, col=col)
    fig.add_trace(go.Scatter(x=steps, y=ema, mode='lines', name='EMA', line=dict(color='red', width=2), showlegend=(idx==0)), row=row, col=col)

fig.update_layout(height=2000, width=1600, title_text=f'Sequence {seq_id_to_plot}: Value, SMA, and EMA for All Features', legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1))
fig.update_xaxes(title_text='step_in_seq', row=8)
fig.update_yaxes(title_text='Value', col=1)
fig.show()

### FFT spectrum and x(t) vs. x(t+1) - x(t)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go  # Added Plotly
from plotly.offline import plot      # Added Plotly offline plot
from scipy.stats import pearsonr
import warnings

# Suppress warnings from plotting.
# NOTE: this filter is global and stays active for every cell run afterwards.
warnings.filterwarnings("ignore")

print("Loading data...")
# Reuse the DataFrame loaded by an earlier cell. Only a missing `df`
# (NameError on a fresh kernel) triggers the synthetic fallback below.
try:
    df
except NameError as e:
    print(f"Error loading data: {e}")
    print("Using a dummy dataset for demonstration.")
    # Create a dummy dataset if loading fails: noisy sine waves per feature
    num_sequences = 10
    num_steps = 1000
    num_features = 5
    data = []
    for seq_id in range(num_sequences):
        for step in range(num_steps):
            row = {'seq_ix': seq_id, 'step_in_seq': step, 'need_prediction': step >= 100}
            row.update({f'feature_{i}': np.sin(step / (50 + i*10)) + np.random.randn()*0.1 for i in range(num_features)})
            data.append(row)
    df = pd.DataFrame(data)

# Every column that is not bookkeeping is treated as a feature
feature_cols = [c for c in df.columns if c not in ("seq_ix", "step_in_seq", "need_prediction")]
num_features = len(feature_cols)

# --- Isolate one sequence for analysis ---
# Use the 4th distinct sequence id when available, otherwise the last one,
# so the cell also works on datasets with fewer than four sequences.
seq_ids = df['seq_ix'].unique()
sample_seq_ix = seq_ids[min(3, len(seq_ids) - 1)]
one_seq_df = df[df['seq_ix'] == sample_seq_ix].sort_values('step_in_seq')
# Rows are time steps, columns are features
states = one_seq_df[feature_cols].to_numpy(dtype=np.float32)
print(f"Analyzing sample sequence {sample_seq_ix} with shape {states.shape}")

# === 1. Compute FFT for each column ===
print("\n--- 1. FFT Analysis ---")
N_T = states.shape[0]  # number of time steps in the sampled sequence
frequencies = np.fft.fftfreq(N_T, d=1)  # sample spacing of one step
num_features_to_plot = min(5, num_features)

# Only the first half of the FFT bins is plotted. The magnitude spectrum of
# each feature is computed once and shared by both charts below.
half_bins = N_T // 2
freq_axis = frequencies[:half_bins]
magnitudes = [
    np.abs(np.fft.fft(states[:, col_idx]))[:half_bins]
    for col_idx in range(num_features_to_plot)
]

# --- Matplotlib FFT Plot ---
fft_fig, fft_ax = plt.subplots(figsize=(12, 7))
for col_idx, magnitude in enumerate(magnitudes):
    fft_ax.plot(freq_axis, magnitude, label=f'Feature {col_idx}', alpha=0.7)

fft_ax.set_title('FFT Power Spectrum (First 5 Features) - Matplotlib', fontsize=16)
fft_ax.set_xlabel('Frequency', fontsize=12)
fft_ax.set_ylabel('Magnitude', fontsize=12)
fft_ax.legend()
fft_ax.grid(True)
fft_fig.savefig('fft_spectrum.png')
print("Saved Matplotlib FFT plot to fft_spectrum.png")

# --- Plotly FFT Plot ---
fig = go.Figure(
    data=[
        go.Scatter(x=freq_axis, y=magnitude, mode='lines', name=f'Feature {col_idx}')
        for col_idx, magnitude in enumerate(magnitudes)
    ]
)
fig.update_layout(
    title='Interactive FFT Power Spectrum (First 5 Features) - Plotly',
    xaxis_title='Frequency',
    yaxis_title='Magnitude',
    hovermode="x unified",
    legend_title="Features",
)
# Write a standalone HTML file without opening a browser tab
plot(fig, filename='fft_spectrum.html', auto_open=False)
print("Saved Plotly FFT plot to fft_spectrum.html")


# === 2. Calculate correlations of x(t) with x(t+1) - x(t) ===
print("\n--- 2. x(t) vs. Delta Correlation Analysis ---")

x_t = states[:-1]         # state at step t
x_t_plus_1 = states[1:]   # state at step t + 1
# One-step change. Previously this was assigned x(t+1) itself, so the numbers
# reported below were lag-1 autocorrelations rather than state-vs-delta
# correlations.
delta = x_t_plus_1 - x_t

# Pearson correlation between the state and its next change, per feature
correlations = []
for i in range(num_features):
    corr, _ = pearsonr(x_t[:, i], delta[:, i])
    correlations.append(corr)

correlations = np.array(correlations)

# nan-aware reductions: the correlation is undefined (NaN) for a constant column
print(f"Mean correlation: {np.nanmean(correlations):.4f}")
print(f"Min correlation:  {np.nanmin(correlations):.4f}")
print(f"Max correlation:  {np.nanmax(correlations):.4f}")

mean_r_squared = np.nanmean(correlations**2)
print(f"\nMean R-squared (as a linear predictor): {mean_r_squared:.6f}")


# === 3. Plot x(t) with x(t+1) - x(t) ===
print("\n--- 3. Scatter Plot Analysis ---")
# No plt.clf() here: plt.figure() below already starts a fresh figure, and
# clearing the current one would only blank the FFT figure created earlier.

feature_to_plot = 0
x_data = x_t[:, feature_to_plot]
y_data = delta[:, feature_to_plot]

plt.figure(figsize=(10, 8))
plt.scatter(x_data, y_data, alpha=0.1, s=10)
plt.title(f'State vs. Next Change (Feature {feature_to_plot}, Seq {sample_seq_ix})', fontsize=16)
plt.xlabel(f'x(t) [State of Feature {feature_to_plot}]', fontsize=12)
# The plotted quantity is paired one step ahead (x_t vs. states[1:]), so the
# label must say t+1, not t+100.
plt.ylabel(f'x(t+1) - x(t) [Next Delta of Feature {feature_to_plot}]', fontsize=12)
plt.grid(True)
# Zero reference lines on both axes
plt.axhline(0, color='red', linestyle='--', linewidth=1)
plt.axvline(0, color='red', linestyle='--', linewidth=1)
plt.savefig('xt_vs_delta_scatter.png')
print("Saved scatter plot to xt_vs_delta_scatter.png")
print("\nAnalysis complete.")

In [None]:
# Count rows that need a prediction vs. rows that do not.
# Vectorised comparison replaces the per-row Python loop: it is much faster on
# large frames and does not rely on the index being 0..len(df)-1
# (label-based `df[col][i]` raises KeyError on any other index).
needs_pred_mask = df['need_prediction'] == 1
sum1 = int(needs_pred_mask.sum())   # rows flagged for prediction
sum2 = int(len(df) - sum1)          # all remaining rows
n_seq = df['seq_ix'].nunique()      # number of distinct sequences
print(f'need pred: {sum1} \n do not need pred: {sum2}.')
print(f'avg need pred: {sum1/n_seq} \n do not need pred: {sum2/n_seq}.')

In [None]:
import plotly.graph_objs as go
import numpy as np
import pandas as pd

# --- Parameters: which sequence / feature to inspect and how many lags ---
seq_id_to_plot = 515
feature_to_plot = '14'  # any feature column name, e.g. '0', '1', ..., '31'
max_lag = 100

# Time-ordered values of the chosen feature within the chosen sequence
df_seq = df.loc[df['seq_ix'] == seq_id_to_plot].sort_values('step_in_seq')
values = df_seq[feature_to_plot].values

# Correlation between the series and a copy of itself shifted by `lag` steps;
# NaN is recorded when the series is not longer than the lag.
lags = np.arange(1, max_lag + 1)
correlations = [
    np.corrcoef(values[lag:], values[:-lag])[0, 1] if len(values) > lag else np.nan
    for lag in lags
]

# Interactive lag-vs-correlation chart
fig = go.Figure(
    data=[
        go.Scatter(x=lags, y=correlations, mode='lines+markers', name=f'Feature {feature_to_plot}')
    ]
)
fig.update_layout(
    title=f'Autocorrelation of Feature {feature_to_plot} in Sequence {seq_id_to_plot}',
    xaxis_title='Lag',
    yaxis_title='Correlation',
    height=400,
    width=700,
)
fig.show()

### 1. Feature Distribution and Outlier Analysis

This analysis visualizes the distribution of each of the 32 features across the entire dataset. 

- **Histograms**: Show the frequency of values in different bins, helping to identify the shape of the distribution (e.g., Gaussian, skewed, bimodal).
- **Box Plots**: Display the five-number summary of a set of data: minimum, first quartile, median, third quartile, and maximum. They are useful for quickly identifying the data's spread and detecting outliers.

The following cell will generate these plots for all features. You can hover over the interactive plots to see detailed values.

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

feature_cols = [str(i) for i in range(32)]
panel_titles = [f'Feature {f}' for f in feature_cols]

# Histograms: one panel per feature on an 8-row x 4-column grid
fig = make_subplots(
    rows=8, cols=4,
    subplot_titles=panel_titles,
    specs=[[{'type': 'histogram'} for _ in range(4)] for _ in range(8)],
)

for i, feature in enumerate(feature_cols):
    grid_row, grid_col = divmod(i, 4)
    fig.add_trace(
        go.Histogram(x=df[feature], name=f'Hist {feature}', nbinsx=100),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    height=2000,
    width=1600,
    title_text='Feature Distributions (Histograms)',
    showlegend=False,
)
fig.show()

# Box plots go into their own figure: 4 rows x 8 columns
fig_box = make_subplots(rows=4, cols=8, subplot_titles=panel_titles)

for i, feature in enumerate(feature_cols):
    grid_row, grid_col = divmod(i, 8)
    fig_box.add_trace(
        go.Box(y=df[feature], name=f'Box {feature}'),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig_box.update_layout(
    height=1000,
    width=1600,
    title_text='Feature Spread and Outliers (Box Plots)',
    showlegend=False,
)
fig_box.show()

### 2. Cross-Correlation Analysis

Cross-correlation measures the similarity between two time series as a function of the displacement of one relative to the other. It helps to identify if one feature is a leading or lagging indicator of another.

The plot below shows the correlation between `feature_a` and `feature_b` at different time lags.
- A peak at a **positive lag** means `feature_a` leads `feature_b`.
- A peak at a **negative lag** means `feature_b` leads `feature_a`.
- A peak at **lag 0** indicates a simultaneous correlation.

You can modify the `feature_a`, `feature_b`, `seq_id_to_analyze`, and `max_lag` parameters in the next cell to explore different relationships.

In [None]:
import numpy as np
import plotly.graph_objects as go

# --- Parameters ---
feature_a = '0'
feature_b = '1'
seq_id_to_analyze = 0
max_lag = 50

# --- Data Extraction ---
df_seq = df[df['seq_ix'] == seq_id_to_analyze].sort_values('step_in_seq')
values_a = df_seq[feature_a].values
values_b = df_seq[feature_b].values

# --- Cross-Correlation Calculation ---
# Standardize both series (z-score) so the mean product is a correlation.
a_z = (values_a - np.mean(values_a)) / np.std(values_a)
b_z = (values_b - np.mean(values_b)) / np.std(values_b)


def _lagged_corr(a, b, lag):
    """Mean product of a[t] and b[t + lag] over the overlapping samples.

    A peak at a positive lag therefore means `a` leads `b`; a peak at a
    negative lag means `b` leads `a`. Only genuinely overlapping samples are
    used (no circular wrap-around as with np.roll). Returns NaN when the two
    series do not overlap at the requested lag.
    """
    n = len(a)
    if abs(lag) >= n:
        return np.nan
    if lag >= 0:
        left, right = a[:n - lag], b[lag:]
    else:
        left, right = a[-lag:], b[:n + lag]
    return float(np.mean(left * right))


lags = np.arange(-max_lag, max_lag + 1)
corrs = [_lagged_corr(a_z, b_z, int(lag)) for lag in lags]

# --- Plotting ---
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=lags,
    y=corrs,
    mode='lines+markers',
    name=f'Cross-corr F{feature_a} vs F{feature_b}'
))
fig.update_layout(
    title=f'Cross-Correlation between Feature {feature_a} and Feature {feature_b} (Seq {seq_id_to_analyze})',
    xaxis_title='Lag (Displacement of Feature B relative to A)',
    yaxis_title='Correlation',
    height=400,
    width=800
)
fig.show()

### 3. Volatility and Change Analysis

This section examines the stability of the features by looking at two key metrics:

- **First-Order Difference (Delta)**: Calculated as `x(t) - x(t-1)`, this shows the step-by-step change in a feature's value. A histogram of these deltas can reveal if the changes are typically small and centered around zero, or if there are frequent large jumps.
- **Rolling Standard Deviation**: This measures the volatility of a feature over a sliding window of time. Peaks in the rolling standard deviation indicate periods of high fluctuation.

You can adjust the `feature_to_analyze`, `seq_id_to_analyze`, and `rolling_window_size` in the next cell.

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Parameters ---
feature_to_analyze = '0'
seq_id_to_analyze = 0
rolling_window_size = 50

# --- Data Extraction ---
# Independent, time-ordered copy of the chosen sequence
df_seq = (
    df.loc[df['seq_ix'] == seq_id_to_analyze]
    .sort_values('step_in_seq')
    .copy()
)
values = df_seq[feature_to_analyze]

# --- Calculations ---
# delta: step-to-step change; volatility: rolling standard deviation
df_seq = df_seq.assign(
    delta=values.diff(),
    volatility=values.rolling(window=rolling_window_size).std(),
)

# --- Plotting ---
panel_titles = (
    f'Feature {feature_to_analyze} Values',
    'First-Order Difference (Delta)',
    f'Rolling {rolling_window_size}-Step Volatility',
)
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, subplot_titles=panel_titles)

# (column to plot, trace name, extra Scatter keyword arguments), top to bottom
panels = [
    (feature_to_analyze, 'Value', {}),
    ('delta', 'Delta', {'line': dict(color='orange')}),
    ('volatility', 'Volatility', {'line': dict(color='red')}),
]
for panel_row, (column, trace_name, extra) in enumerate(panels, start=1):
    fig.add_trace(
        go.Scatter(x=df_seq['step_in_seq'], y=df_seq[column], mode='lines', name=trace_name, **extra),
        row=panel_row,
        col=1,
    )

fig.update_layout(
    height=800,
    width=1200,
    title_text=f'Volatility Analysis for Feature {feature_to_analyze} in Sequence {seq_id_to_analyze}',
    showlegend=False,
)
fig.show()

# Histogram of Deltas (the first delta is NaN and is dropped)
fig_hist = go.Figure(data=[go.Histogram(x=df_seq['delta'].dropna(), nbinsx=100)])
fig_hist.update_layout(
    title=f'Distribution of Deltas for Feature {feature_to_analyze} (Seq {seq_id_to_analyze})',
    xaxis_title='Delta (x(t) - x(t-1))',
    yaxis_title='Frequency',
    height=400,
    width=800,
)
fig_hist.show()

### 4. Sequence-Level Clustering

This analysis aims to discover if there are distinct "types" of sequences in the dataset. The process is as follows:

1.  **Feature Engineering**: For each sequence, a set of summary statistics (mean, std, min, max, and median) is calculated for each of the 32 features. This creates a single "summary" vector that represents the overall behavior of each sequence.
2.  **Dimensionality Reduction**: The high-dimensional summary vectors are scaled and then reduced to 2 dimensions using Principal Component Analysis (PCA). This allows us to visualize the sequences in a 2D scatter plot.
3.  **Clustering**: The K-Means algorithm is applied to the PCA-transformed data to group the sequences into a predefined number of clusters.

The resulting plot shows each sequence as a point, colored by its assigned cluster. This can reveal natural groupings and help in understanding the different dynamic regimes present in the data.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px

# --- Parameters ---
n_clusters = 5  # You can experiment with this value

# --- Feature Engineering ---
feature_cols = [f'{i}' for i in range(32)]
# Calculate aggregate statistics for each sequence
agg_funcs = ['mean', 'std', 'min', 'max', 'median']
seq_summary = df.groupby('seq_ix')[feature_cols].agg(agg_funcs)

# Flatten the multi-level column names, e.g. ('0', 'mean') -> '0_mean'
seq_summary.columns = ['_'.join(col).strip() for col in seq_summary.columns.values]
# Drop sequences with any NaN statistic (e.g. the sample std of a
# single-row sequence is NaN); StandardScaler/PCA cannot handle NaNs.
seq_summary = seq_summary.dropna()

# --- Scaling and PCA ---
scaler = StandardScaler()
summary_scaled = scaler.fit_transform(seq_summary)

pca = PCA(n_components=2)
summary_pca = pca.fit_transform(summary_scaled)

# --- K-Means Clustering ---
# Clusters are fitted on the 2-D PCA projection, not on the full summary vectors
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(summary_pca)

# --- Visualization ---
pca_df = pd.DataFrame(summary_pca, columns=['PC1', 'PC2'])
# Cluster ids are stored as strings so plotly treats them as categories
# (discrete colours + legend) instead of a continuous colour scale.
pca_df['cluster'] = clusters.astype(str)
pca_df['seq_ix'] = seq_summary.index

fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='cluster',
    hover_data=['seq_ix'],
    title=f'Sequence-Level Clustering (K={n_clusters})',
    # `labels` is keyed by column name; the previous key 'color' matched no
    # column, so the legend title was never renamed.
    labels={'cluster': 'Cluster ID'},
    category_orders={'cluster': [str(k) for k in range(n_clusters)]}
)
fig.update_layout(
    height=600,
    width=1000
)
fig.show()

### 5. Principal Component Analysis (PCA)

PCA is a technique used to emphasize variation and bring out strong patterns in a dataset. It's often used for dimensionality reduction.

1.  **Explained Variance**: This plot shows the percentage of the total variance in the dataset that is captured by each principal component. It helps to determine how many components are needed to represent a significant portion of the data's structure.
2.  **PCA Scatter Plot**: This plot visualizes a random sample of the dataset (up to 50,000 rows) in a 2D space defined by the first two principal components. We can color the points by a specific feature or by the `need_prediction` flag to see if these components separate the data in a meaningful way.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
import numpy as np

# --- Data Preparation ---
feature_cols = [str(i) for i in range(32)]
# Work on a fixed-seed random sample (at most 50,000 rows) so the scatter
# plot stays readable
sample_size = min(50000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

X = df_sample[feature_cols]
y = df_sample['need_prediction']

# --- Scaling and PCA ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# No n_components given: all principal components are kept
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# --- Explained Variance Plot ---
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

fig_var = go.Figure(
    data=[
        go.Bar(
            x=np.arange(1, len(explained_variance) + 1),
            y=explained_variance,
            name='Individual Explained Variance',
        ),
        go.Scatter(
            x=np.arange(1, len(cumulative_variance) + 1),
            y=cumulative_variance,
            name='Cumulative Explained Variance',
        ),
    ]
)
fig_var.update_layout(
    title='Explained Variance by Principal Components',
    xaxis_title='Principal Component',
    yaxis_title='Explained Variance Ratio',
    height=500,
    width=1000,
)
fig_var.show()

# --- PCA Scatter Plot ---
component_names = [f'PC{k + 1}' for k in range(X_pca.shape[1])]
pca_df = pd.DataFrame(X_pca, columns=component_names)
# Attached positionally (plain array), since pca_df has a fresh 0..n-1 index
pca_df['need_prediction'] = y.to_numpy()

fig_scatter = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='need_prediction',
    title='PCA of Features (colored by need_prediction)',
    opacity=0.5,
)
fig_scatter.update_layout(height=600, width=1000)

fig_scatter.show()

### 6. Comparing Data Segments (`need_prediction` == True vs. False)

This analysis investigates whether the distribution of the features changes between the time steps where we have historical data (`need_prediction = False`) and the time steps where we need to make predictions (`need_prediction = True`).

A significant difference in the distributions could imply that the underlying process is non-stationary, and a model trained on the first part of the sequence might not perform well on the second part.

**Violin plots** are used for this comparison. They are similar to box plots but also show the probability density of the data at different values. This provides a more detailed view of the distribution's shape.

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

feature_cols = [str(i) for i in range(32)]

# Split the rows once, outside the loop, instead of re-filtering per feature
rows_without_pred = df[df['need_prediction'] == False]
rows_with_pred = df[df['need_prediction'] == True]

# (rows, trace name, legend/scale group, violin side, line colour)
# "False" is drawn as the negative half, "True" as the positive half.
halves = [
    (rows_without_pred, 'False', 'group1', 'negative', 'blue'),
    (rows_with_pred, 'True', 'group2', 'positive', 'orange'),
]

# One panel per feature on an 8-row x 4-column grid
fig = make_subplots(
    rows=8, cols=4,
    subplot_titles=[f'Feature {f}' for f in feature_cols],
)

for i, feature in enumerate(feature_cols):
    grid_row, grid_col = divmod(i, 4)
    for rows, label, group, side, colour in halves:
        fig.add_trace(
            go.Violin(
                y=rows[feature],
                name=label,
                legendgroup=group,
                scalegroup=group,
                side=side,
                box_visible=True,
                meanline_visible=True,
                line_color=colour,
                showlegend=(i == 0),  # one legend entry per half, from the first panel
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

fig.update_traces(meanline_visible=True)
fig.update_layout(
    height=2000,
    width=1600,
    title_text='Feature Distributions by need_prediction Flag',
    violingap=0,
    violingroupgap=0,
    violinmode='overlay',
    legend_title="need_prediction",
)
fig.show()

In [None]:
# Number of distinct sequence ids present in the dataset
seq_id_column = df['seq_ix']
total_sequences = seq_id_column.nunique()
print(f"Total number of sequences: {total_sequences}")