## Importing packages

In [1]:
import pandas as pd 
import numpy as np
import plotly.express as px
import pyarrow.parquet as pq
import plotly.graph_objects as go
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

#### Functions used

In [2]:
def load_parquet_sampled(file_path, step=100, chunk_size=100_000):
    """
    Read a large Parquet file batch by batch and keep every ``step``-th row.

    The file is streamed with ``ParquetFile.iter_batches`` so that only one
    batch at a time is converted to pandas.

    Parameters
    ----------
    file_path : path of the Parquet file to read.
    step : keep one row out of every ``step`` rows, counted over the whole
        file (``step=1`` keeps everything). Must be >= 1.
    chunk_size : maximum number of rows per batch requested from pyarrow.

    Returns
    -------
    pandas.DataFrame with the sampled rows and a fresh 0..n-1 index.

    Raises
    ------
    ValueError if ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    print(f"Opening {file_path} for chunked reading...")

    # Open the parquet file without loading the data into RAM
    parquet_file = pq.ParquetFile(file_path)

    filtered_chunks = []
    rows_seen = 0  # rows consumed from previous batches

    for batch in parquet_file.iter_batches(batch_size=chunk_size):
        # Batches are not guaranteed to have a length that is a multiple of
        # `step`, so carry the sampling phase across batch boundaries instead
        # of restarting at row 0 of every batch.
        first_kept = (-rows_seen) % step
        rows_seen += batch.num_rows

        if first_kept >= batch.num_rows:
            continue  # no sampled row falls inside this batch

        # Convert ONLY this batch to pandas, then keep the sampled rows
        filtered_chunks.append(batch.to_pandas().iloc[first_kept::step])

    # Stitch the filtered chunks back into one final DataFrame
    print("Concatenating filtered chunks...")
    if not filtered_chunks:
        # pd.concat([]) raises; return an empty frame with the file's columns
        return parquet_file.schema_arrow.empty_table().to_pandas()

    return pd.concat(filtered_chunks, ignore_index=True)

def label_degradation_transitions(df, column='POS_FBK', threshold=0.1):
    """
    Flag the two rows around every downward step of the per-train envelope.

    For each 'Train_ID' the running minimum (monotonic envelope) of ``column``
    is computed. Whenever that envelope falls by more than ``threshold`` from
    one row to the next, both the row where the drop is registered and the
    row immediately before it (within the same train) get ``FDI = 1``; every
    other row gets ``FDI = 0``.

    Parameters
    ----------
    df : DataFrame containing a 'Train_ID' column and ``column``.
    column : name of the signal column the envelope is built from.
    threshold : minimum envelope decrease (strictly exceeded) that counts as
        a degradation step.

    Returns
    -------
    A new DataFrame equal to ``df`` plus an integer 'FDI' column. The input
    frame is left untouched (no helper columns are written into it).
    """
    train_ids = df['Train_ID']

    # 1. Monotonic (running-minimum) envelope per train
    envelope = df.groupby('Train_ID')[column].cummin()

    # 2. Row-to-row change of the envelope, restarted for every train
    #    (the first row of each train is NaN and therefore never a drop)
    envelope_change = envelope.groupby(train_ids).diff()

    # 3. Row where the drop is registered
    drop_row = envelope_change < -threshold

    # 4. Row immediately before the drop; fill_value=False keeps the mask
    #    boolean and prevents the last row of a train from picking up NaN
    row_before_drop = drop_row.groupby(train_ids).shift(-1, fill_value=False)

    # 5. A row is flagged if it is either side of a drop
    return df.assign(FDI=(row_before_drop | drop_row).astype(int))

def plot_line_scatter_FDI(df_1, df_2, x_1, x_2, y):
    """
    Draw a line plot of ``y`` per train and overlay a second set of points.

    Parameters
    ----------
    df_1 : DataFrame for the base line plot; must contain 'Train_ID',
        'Cycle' and the ``y`` column.
    df_2 : DataFrame with the points to highlight (the callers pass the rows
        with FDI == 1); must contain 'Train_ID', 'Cycle' and ``y``.
    x_1 : x values (or column name) for the base line plot.
    x_2 : x values for the highlighted points.
    y : name of the column plotted on the y axis in both layers.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # 1. Base plot: one line per train
    fig = px.line(
        df_1,
        x=x_1,
        y=y,
        color='Train_ID',
        hover_data=['Train_ID', 'Cycle'],
        title="Door Position with Degradation Events (FDI=1)"
    )

    # 2. Overlay the highlighted rows as markers. Markers rather than a line:
    #    the events are isolated points, and joining them would suggest a
    #    continuous curve that does not exist.
    fig.add_trace(
        go.Scatter(
            x=x_2,
            y=df_2[y],
            mode='markers',
            marker=dict(color='red', size=8, symbol='circle-dot'),
            name='Degradation (FDI=1)',

            # Same hover fields as the base plot; the value label follows the
            # plotted column instead of always reading "POS_FBK"
            customdata=df_2[['Train_ID', 'Cycle']],
            hovertemplate=(
                "Train_ID: %{customdata[0]}<br>"
                "Cycle: %{customdata[1]}<br>"
                f"{y}: %{{y}}<extra></extra>"
            )
        )
    )

    # 3. Show the combined plot
    fig.show()
    
def _extract_degradation_features(df):
    """Build one feature row per train from the 'Cycle' and 'FDI' columns."""
    total_cycles = df.groupby('Train_ID', observed=True)['Cycle'].max()

    # Cycle values of the flagged rows, grouped by train
    flagged_cycles = df[df['FDI'] == 1].groupby('Train_ID', observed=True)['Cycle']

    # Trains without any flagged row are absent from the flagged groups:
    # they get 0 shocks and their first-shock cycle falls back to the
    # train's last cycle.
    num_shocks = (flagged_cycles.size() / 2).reindex(total_cycles.index, fill_value=0.0)
    first_shock = flagged_cycles.min().reindex(total_cycles.index)
    first_shock = first_shock.fillna(total_cycles).astype(total_cycles.dtype)

    return pd.DataFrame({
        'Train_ID': total_cycles.index.to_numpy(),
        'Total_Cycles': total_cycles.to_numpy(),
        'Num_Shocks': num_shocks.to_numpy(),
        'First_Shock_Cycle': first_shock.to_numpy(),
    })


def cluster_trajectories_gmm(df, n_clusters=3):
    """
    Summarise every train by three FDI-based features and cluster the trains
    with a Gaussian Mixture Model.

    Parameters
    ----------
    df : DataFrame with 'Train_ID', 'Cycle' and 'FDI' columns.
    n_clusters : number of mixture components.

    Returns
    -------
    DataFrame with one row per train and the columns 'Train_ID',
    'Total_Cycles' (largest 'Cycle' value), 'Num_Shocks' (flagged rows / 2),
    'First_Shock_Cycle' (smallest 'Cycle' among flagged rows, or
    'Total_Cycles' when the train has none) and 'GMM_Cluster'
    (a label of the form 'Cluster <k>').

    NOTE(review): 'Num_Shocks' divides the number of FDI == 1 rows by two,
    i.e. it assumes each event contributes exactly two flagged rows. Events
    on adjacent rows would share a row and yield fractional counts - confirm
    this is acceptable.
    """
    # 1. Feature extraction per train (vectorised, no per-train sub-frames)
    features_df = _extract_degradation_features(df)

    # 2. Data scaling
    # GMM is sensitive to scale: Total_Cycles would dominate Num_Shocks otherwise.
    features_to_cluster = ['Total_Cycles', 'Num_Shocks', 'First_Shock_Cycle']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features_df[features_to_cluster]) ### NEED TO CHECK IF THIS IS AN APPROPRIATE SCALER

    # 3. Fit the mixture model and assign every train to a component
    # (the default of 3 components is said to match the 3 operating conditions
    # of the challenge dataset)
    gmm = GaussianMixture(n_components=n_clusters, random_state=42, covariance_type='full') ### CHECK THESE CHOICES
    cluster_ids = gmm.fit_predict(X_scaled)

    # String labels so Plotly treats the clusters as discrete categories
    features_df['GMM_Cluster'] = 'Cluster ' + pd.Series(cluster_ids, index=features_df.index).astype(str)

    return features_df

def find_optimal_gmm_clusters(features_df, max_clusters=10):
    """
    Fit a GMM for 1..max_clusters components and plot the AIC/BIC scores.

    Parameters
    ----------
    features_df : per-train feature table containing 'Total_Cycles',
        'Num_Shocks' and 'First_Shock_Cycle' (e.g. the frame returned by
        cluster_trajectories_gmm) - not the raw signal data.
    max_clusters : largest number of components to try. It is capped at the
        number of rows in ``features_df`` because GaussianMixture cannot be
        fitted with more components than samples.

    Raises
    ------
    KeyError if one of the feature columns is missing.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    features_to_cluster = ['Total_Cycles', 'Num_Shocks', 'First_Shock_Cycle']
    missing = [name for name in features_to_cluster if name not in features_df.columns]
    if missing:
        raise KeyError(
            f"features_df is missing feature column(s) {missing}; pass the per-train "
            "table returned by cluster_trajectories_gmm, not the raw signal data"
        )

    # 1. Scale the data (same preprocessing as the clustering step)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features_df[features_to_cluster])

    # Never ask for more components than there are samples, otherwise the
    # sweep would abort half-way with a ValueError and show nothing.
    largest_n = min(max_clusters, X_scaled.shape[0])
    n_components_range = range(1, largest_n + 1)
    aic_scores = []
    bic_scores = []

    # 2. Train a new GMM for every number of clusters and record the scores
    for n in n_components_range:
        gmm = GaussianMixture(n_components=n, random_state=42, covariance_type='full')
        gmm.fit(X_scaled)

        aic_scores.append(gmm.aic(X_scaled))
        bic_scores.append(gmm.bic(X_scaled))

    # 3. Plot the BIC and AIC scores using Plotly
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=list(n_components_range), y=bic_scores,
                             mode='lines+markers', name='BIC Score',
                             line=dict(color='blue', width=2)))

    fig.add_trace(go.Scatter(x=list(n_components_range), y=aic_scores,
                             mode='lines+markers', name='AIC Score',
                             line=dict(color='red', width=2)))

    fig.update_layout(
        title='GMM Model Selection (Lower Score is Better)',
        xaxis_title='Number of Clusters',
        yaxis_title='Information Criterion Score',
        xaxis=dict(tickmode='linear', tick0=1, dtick=1) # Force x-axis to show integers
    )

    fig.show()



### Importing dataset

In [3]:
# Load the opening dataset; step=1 keeps every row (no downsampling)
df_opening_all = load_parquet_sampled('../opening_dataset.parquet', step=1)

Opening ../opening_dataset.parquet for chunked reading...
Concatenating filtered chunks...


In [4]:
# Quick sanity check: number of rows/columns, then the column names
print(df_opening_all.shape)
df_opening_all.columns

(57789000, 18)


Index(['Time', 'POS_REF', 'POS_FBK', 'VEL_REF', 'VEL_FBK', 'FBK_DIGHALL',
       'FBK_DIGENC1', 'DRV_PROT_VBUS', 'MOT_PROT_TEMP', 'FBK_CUR_A',
       'FBK_CUR_B', 'FBK_CUR_C', 'DRV_PROT_TEMP', 'FBK_VOL_A', 'FBK_VOL_B',
       'FBK_VOL_C', 'Train_ID', 'Cycle'],
      dtype='object')

In [5]:
# Preview the first rows of the loaded signal data
df_opening_all.head()

Unnamed: 0,Time,POS_REF,POS_FBK,VEL_REF,VEL_FBK,FBK_DIGHALL,FBK_DIGENC1,DRV_PROT_VBUS,MOT_PROT_TEMP,FBK_CUR_A,FBK_CUR_B,FBK_CUR_C,DRV_PROT_TEMP,FBK_VOL_A,FBK_VOL_B,FBK_VOL_C,Train_ID,Cycle
0,-0.0005,190.0,190.0,0.0,-0.0,5.0,0.0,25.76,0.0,0.04,-2.08,1.95,28.56,12.88,12.37,13.38,Train_1,1
1,0.0,190.0,190.0,0.0,-0.0,5.0,0.0,25.73,0.0,-0.01,-2.02,2.03,28.56,12.87,12.19,13.54,Train_1,1
2,0.0005,190.0,190.0,0.0,-0.0,5.0,0.0,25.74,0.0,0.01,-2.04,2.02,28.5,12.87,12.23,13.51,Train_1,1
3,0.001,190.0,190.0,0.0,-0.0,5.0,0.0,25.74,0.0,-0.01,-2.02,2.02,28.56,12.87,12.17,13.57,Train_1,1
4,0.0015,190.0,190.0,0.0,-0.0,5.0,0.0,25.74,0.0,0.01,-2.03,2.01,28.5,12.87,12.21,13.53,Train_1,1


In [6]:
# Creating the FDI: flag the rows around each per-train envelope drop of
# POS_FBK larger than 1, then count flagged / unflagged rows per train
df_opening_all = label_degradation_transitions(df_opening_all, column='POS_FBK', threshold=1)
df_opening_all[['FDI', 'Train_ID']].value_counts()


FDI  Train_ID
0    Train_15    2417348
     Train_4     2216344
     Train_25    2177968
     Train_9     2019564
     Train_20    1909764
                  ...   
1    Train_8          22
     Train_3          20
     Train_24         18
     Train_6          16
     Train_30          2
Name: count, Length: 96, dtype: int64

In [7]:
# Train_ID --> Train_Number column, Train_1 --> 1
df_opening_all['Train_Number'] = df_opening_all['Train_ID'].str.split('_').str[-1].astype(int)

# Maximum position reached within each cycle of each train.
# Grouping on the actual (train, cycle) keys instead of "every 600 consecutive
# rows" keeps the result correct even if a cycle has a different number of
# samples or the rows are not stored contiguously.
df_opening_all['Max_POS_per_Cycle'] = (
    df_opening_all
    .groupby(['Train_Number', 'Cycle'], sort=False)['POS_FBK']
    .transform('max')
)

In [None]:

# 1. Separate the data of one train into two clean variables:
#    the downsampled full signal (every 300th row) and the FDI=1 events.
#    Both must come from the same train; previously the signal variable was
#    overwritten with the FDI-only rows, so the base line never showed the signal.
train_mask = df_opening_all['Train_Number'] == 2
df_full_signal = df_opening_all[train_mask][::300]
df_fdi_events = df_opening_all[train_mask & (df_opening_all['FDI'] == 1)]

# 2. Create the base plot (the full downsampled signal) with the FDI events on top
plot_line_scatter_FDI(df_full_signal, df_fdi_events, df_full_signal.index, df_fdi_events.index,'POS_FBK')


In [9]:
# NOTE(review): `stop` is not defined anywhere, so this cell raises NameError -
# presumably a deliberate barrier to halt "Run All" before the cells below; confirm.
stop

NameError: name 'stop' is not defined

In [None]:
def _extract_degradation_features(df):
    """Build one feature row per train from the 'Cycle' and 'FDI' columns."""
    total_cycles = df.groupby('Train_ID', observed=True)['Cycle'].max()

    # Cycle values of the flagged rows, grouped by train
    flagged_cycles = df[df['FDI'] == 1].groupby('Train_ID', observed=True)['Cycle']

    # Trains without any flagged row are absent from the flagged groups:
    # they get 0 shocks and their first-shock cycle falls back to the
    # train's last cycle.
    num_shocks = (flagged_cycles.size() / 2).reindex(total_cycles.index, fill_value=0.0)
    first_shock = flagged_cycles.min().reindex(total_cycles.index)
    first_shock = first_shock.fillna(total_cycles).astype(total_cycles.dtype)

    return pd.DataFrame({
        'Train_ID': total_cycles.index.to_numpy(),
        'Total_Cycles': total_cycles.to_numpy(),
        'Num_Shocks': num_shocks.to_numpy(),
        'First_Shock_Cycle': first_shock.to_numpy(),
    })


def cluster_trajectories_gmm(df, n_clusters=3):
    """
    Summarise every train by three FDI-based features and cluster the trains
    with a Gaussian Mixture Model.

    Parameters
    ----------
    df : DataFrame with 'Train_ID', 'Cycle' and 'FDI' columns.
    n_clusters : number of mixture components.

    Returns
    -------
    DataFrame with one row per train and the columns 'Train_ID',
    'Total_Cycles' (largest 'Cycle' value), 'Num_Shocks' (flagged rows / 2),
    'First_Shock_Cycle' (smallest 'Cycle' among flagged rows, or
    'Total_Cycles' when the train has none) and 'GMM_Cluster'
    (a label of the form 'Cluster <k>').

    NOTE(review): 'Num_Shocks' divides the number of FDI == 1 rows by two,
    i.e. it assumes each event contributes exactly two flagged rows. Events
    on adjacent rows would share a row and yield fractional counts - confirm
    this is acceptable.
    """
    # 1. Feature extraction per train (vectorised, no per-train sub-frames)
    features_df = _extract_degradation_features(df)

    # 2. Data scaling
    # GMM is sensitive to scale: Total_Cycles would dominate Num_Shocks otherwise.
    features_to_cluster = ['Total_Cycles', 'Num_Shocks', 'First_Shock_Cycle']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features_df[features_to_cluster]) ### NEED TO CHECK IF THIS IS AN APPROPRIATE SCALER

    # 3. Fit the mixture model and assign every train to a component
    # (the default of 3 components is said to match the 3 operating conditions
    # of the challenge dataset)
    gmm = GaussianMixture(n_components=n_clusters, random_state=42, covariance_type='full') ### CHECK THESE CHOICES
    cluster_ids = gmm.fit_predict(X_scaled)

    # String labels so Plotly treats the clusters as discrete categories
    features_df['GMM_Cluster'] = 'Cluster ' + pd.Series(cluster_ids, index=features_df.index).astype(str)

    return features_df

def find_optimal_gmm_clusters(features_df, max_clusters=10):
    """
    Fit a GMM for 1..max_clusters components and plot the AIC/BIC scores.

    Parameters
    ----------
    features_df : per-train feature table containing 'Total_Cycles',
        'Num_Shocks' and 'First_Shock_Cycle' (e.g. the frame returned by
        cluster_trajectories_gmm) - not the raw signal data.
    max_clusters : largest number of components to try. It is capped at the
        number of rows in ``features_df`` because GaussianMixture cannot be
        fitted with more components than samples.

    Raises
    ------
    KeyError if one of the feature columns is missing.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    features_to_cluster = ['Total_Cycles', 'Num_Shocks', 'First_Shock_Cycle']
    missing = [name for name in features_to_cluster if name not in features_df.columns]
    if missing:
        raise KeyError(
            f"features_df is missing feature column(s) {missing}; pass the per-train "
            "table returned by cluster_trajectories_gmm, not the raw signal data"
        )

    # 1. Scale the data (same preprocessing as the clustering step)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features_df[features_to_cluster])

    # Never ask for more components than there are samples, otherwise the
    # sweep would abort half-way with a ValueError and show nothing.
    largest_n = min(max_clusters, X_scaled.shape[0])
    n_components_range = range(1, largest_n + 1)
    aic_scores = []
    bic_scores = []

    # 2. Train a new GMM for every number of clusters and record the scores
    for n in n_components_range:
        gmm = GaussianMixture(n_components=n, random_state=42, covariance_type='full')
        gmm.fit(X_scaled)

        aic_scores.append(gmm.aic(X_scaled))
        bic_scores.append(gmm.bic(X_scaled))

    # 3. Plot the BIC and AIC scores using Plotly
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=list(n_components_range), y=bic_scores,
                             mode='lines+markers', name='BIC Score',
                             line=dict(color='blue', width=2)))

    fig.add_trace(go.Scatter(x=list(n_components_range), y=aic_scores,
                             mode='lines+markers', name='AIC Score',
                             line=dict(color='red', width=2)))

    fig.update_layout(
        title='GMM Model Selection (Lower Score is Better)',
        xaxis_title='Number of Clusters',
        yaxis_title='Information Criterion Score',
        xaxis=dict(tickmode='linear', tick0=1, dtick=1) # Force x-axis to show integers
    )

    fig.show()



In [None]:
# find_optimal_gmm_clusters needs the per-train feature table (Total_Cycles,
# Num_Shocks, First_Shock_Cycle); the raw signal frame has none of these columns.
# cluster_trajectories_gmm returns that table (its GMM_Cluster column is ignored here).
train_features = cluster_trajectories_gmm(df_opening_all, n_clusters=3)
find_optimal_gmm_clusters(train_features, max_clusters=10)

In [None]:
clustered_features = cluster_trajectories_gmm(df_opening_all, n_clusters=3)

# 1. Visualize the Clusters in a 3D Scatter Plot
fig_scatter = px.scatter_3d(
    clustered_features, 
    x='Total_Cycles', 
    y='Num_Shocks', 
    z='First_Shock_Cycle',
    color='GMM_Cluster',
    hover_data=['Train_ID'],
    title='GMM Clustering of Degradation Profiles'
)
fig_scatter.show()

# 2. Map the clusters back to the original time-series data to plot the curves.
# A Train_ID -> cluster lookup is used instead of merge(): it does not copy the
# whole frame, keeps the index, and re-running this cell simply overwrites the
# column rather than producing GMM_Cluster_x / GMM_Cluster_y.
cluster_by_train = clustered_features.set_index('Train_ID')['GMM_Cluster']
df_opening_all['GMM_Cluster'] = df_opening_all['Train_ID'].map(cluster_by_train)

# NOTE(review): this passes every row of df_opening_all to Plotly; with the full
# dataset loaded (step=1) that is tens of millions of points - consider
# downsampling or aggregating per cycle before plotting.
fig_lines = px.line(
    df_opening_all, 
    x='Cycle', 
    y='POS_FBK', 
    color='GMM_Cluster',
    line_group='Train_ID', # Ensures each train gets its own line, but colored by cluster
    title='Degradation Trajectories Colored by GMM Cluster'
)
fig_lines.show()