In [9]:
# Main import block 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import random

In [21]:
# Load the raw train/test motion recordings that the later cells build on.
try:
    train_df = pd.read_csv('train_motion_data.csv')
    test_df = pd.read_csv('test_motion_data.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas an exception stops the run and shows a traceback
    # that still names the file that could not be found.
    raise FileNotFoundError(
        "Error: Make sure 'train_motion_data.csv' and 'test_motion_data.csv' are in the same directory."
    ) from err

print("Train and test data loaded successfully.")
print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}\n")

print(f"Original shape: {train_df.shape}")

Train and test data loaded successfully.
Training data shape: (3644, 8)
Testing data shape: (3084, 8)

Original shape: (3644, 8)


In [None]:
# New approach 

def create_time_series(df, window_size):
    """
    Creates time-series features based on a rolling window.

    Adds the vector magnitudes ``Acc_Mag`` and ``Gyro_Mag``, orders the rows by
    ``Timestamp`` and then, for every raw axis and both magnitudes, appends the
    rolling mean, standard deviation, maximum, minimum and the 75th / 25th
    percentiles computed over the trailing ``window_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``AccX``, ``AccY``, ``AccZ``, ``GyroX``,
        ``GyroY``, ``GyroZ`` and ``Timestamp``. The input is not modified.
    window_size : int
        Number of consecutive rows in each rolling window.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``Timestamp`` (original index labels are
        kept) with the extra feature columns. The first ``window_size - 1``
        rows have no complete window, so their statistics are back-filled
        from the first complete window. If the frame has fewer rows than
        ``window_size`` no window is ever complete and the rolling columns
        remain NaN.
    """
    df = df.copy()

    # Creates magnitude vector for both acc and gyro, to account for absolute changes
    df['Acc_Mag'] = np.sqrt(df['AccX']**2 + df['AccY']**2 + df['AccZ']**2)
    df['Gyro_Mag'] = np.sqrt(df['GyroX']**2 + df['GyroY']**2 + df['GyroZ']**2)

    feature_cols = ['AccX', 'AccY', 'AccZ', 'GyroX', 'GyroY', 'GyroZ', 'Acc_Mag', 'Gyro_Mag']

    # sort_values already returns a new frame, so no second copy is needed.
    df = df.sort_values(by='Timestamp')

    print(f"\n--- Engineering features with window size {window_size} ---")

    for col in feature_cols:
        # Build the window object once per column and reuse it for every statistic.
        window = df[col].rolling(window=window_size)
        df[f'{col}_mean_{window_size}'] = window.mean()
        df[f'{col}_std_{window_size}'] = window.std()
        df[f'{col}_max_{window_size}'] = window.max()
        df[f'{col}_min_{window_size}'] = window.min()
        # NOTE: unlike the statistics above, the quantile columns carry no
        # window-size suffix; the names are kept so existing columns stay stable.
        df[f'{col}_q75'] = window.quantile(0.75)
        df[f'{col}_q25'] = window.quantile(0.25)

    # DataFrame.bfill() replaces the deprecated fillna(method='bfill').
    return df.bfill()



In [23]:

# Rolling-window length (in rows) shared by the train and test feature engineering.
WINDOW_SIZE = 150

# Engineer the rolling features for the training frame first, then the test frame.
df_train, df_test = (
    create_time_series(frame, WINDOW_SIZE) for frame in (train_df, test_df)
)

# Every column except the label and the timestamp is used as a model input.
target_col = 'Class'
feature_cols = [name for name in df_train.columns if name not in ('Class', 'Timestamp')]

X_train, X_test = df_train[feature_cols], df_test[feature_cols]

# The label encoder is fitted on the training labels only and reused for the test labels.
le = LabelEncoder().fit(df_train[target_col])
y_train = le.transform(df_train[target_col])
y_test = le.transform(df_test[target_col])

# Standardize with statistics estimated from the training features only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



--- Engineering features with window size 150 ---

--- Engineering features with window size 150 ---


  df = df.fillna(method='bfill')
  df = df.fillna(method='bfill')


In [24]:

import itertools
import time

from sklearn.calibration import Parallel, delayed


# --- 1. Feature engineering and scaling ---

print("Engineering features (Rolling Windows)...")
train_df_eng, test_df_eng = (
    create_time_series(frame, WINDOW_SIZE) for frame in (train_df, test_df)
)

# Inputs are all engineered columns except the label and the timestamp;
# the target is the raw 'Class' column (string labels, not encoded here).
feature_cols = [name for name in train_df_eng.columns if name not in ('Class', 'Timestamp')]
X_train, y_train = train_df_eng[feature_cols], train_df_eng['Class']
X_test, y_test = test_df_eng[feature_cols], test_df_eng['Class']

# Standardize with training-set statistics only. Random forests split on
# per-feature thresholds, so scaling is not expected to change their
# predictions; the scaled arrays mainly matter for scale-sensitive models.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# --- 2. Define the Custom Grid Search Parameters ---

print("\nStarting Random Forest Tuning (Multithreaded)")

# Candidate values for each random forest hyper-parameter.
param_grid = dict(
    n_estimators=[200, 300, 723],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 20],
    min_samples_leaf=[1, 4, 734],
    criterion=['gini', 'entropy'],
)

# Expand the grid into one dict per combination (cartesian product of the value lists).
keys = tuple(param_grid)
values = tuple(param_grid.values())
param_combinations = [dict(zip(keys, combo)) for combo in itertools.product(*values)]

print(f"Testing {len(param_combinations)} candidate models...")
print("-" * 80)

# --- 3. Define the Worker Function ---

def evaluate_candidate(params, X_train, y_train, X_test, y_test):
    """
    Fit one random forest for a hyper-parameter combination and score it.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``RandomForestClassifier`` in addition
        to the fixed ``random_state=42`` and ``n_jobs=1``.
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Features and labels the fitted model is scored on.

    Returns
    -------
    dict
        A new dict holding every entry of ``params`` plus ``'test_accuracy'``;
        the ``params`` argument itself is left unchanged.
    """
    # Each forest stays on a single core (n_jobs=1) because the caller is
    # expected to spread the candidates themselves across workers.
    forest = RandomForestClassifier(random_state=42, n_jobs=1, **params).fit(X_train, y_train)

    score = accuracy_score(y_test, forest.predict(X_test))

    return {**params, 'test_accuracy': score}

# --- 4. Run Parallel Execution ---
# Relies on names defined earlier in this cell: param_combinations,
# evaluate_candidate, the scaled feature arrays and the labels.

# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments.
start_time = time.perf_counter()

# n_jobs=-1 uses all available cores; verbose=10 logs progress as tasks finish.
# NOTE(review): every candidate is scored on the test set, so the winning
# accuracy below is an optimistic estimate; a validation split or
# cross-validation on the training data would avoid selecting on test data.
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(evaluate_candidate)(params, X_train_scaled, y_train, X_test_scaled, y_test)
    for params in param_combinations
)

end_time = time.perf_counter()
print(f"\nGrid Search completed in {end_time - start_time:.2f} seconds.")

# --- 5. Analyze and Report Results ---

# Convert to DataFrame for easy sorting
results_df = pd.DataFrame(results)

# Pick the row with the highest test accuracy, then read the winning entry
# from the raw results list rather than from the DataFrame row: the
# 'max_depth' column mixes ints with None, so pandas stores it as float and
# would report 10 as 10.0 and an unlimited depth (None) as nan.
best_idx = results_df.sort_values(by='test_accuracy', ascending=False).index[0]
best_model = results[best_idx]

print("\n" + "="*40)
print("       WINNING MODEL FOUND")
print("="*40)
print(f"Best Test Accuracy: {best_model['test_accuracy']:.4f}")
print("-" * 40)
print("Optimal Parameters:")
print(f"- n_estimators:      {best_model['n_estimators']}")
print(f"- max_depth:         {best_model['max_depth']}")
print(f"- min_samples_split: {best_model['min_samples_split']}")
print(f"- min_samples_leaf:  {best_model['min_samples_leaf']}")
print(f"- criterion:         {best_model['criterion']}")
print("="*40)

Engineering features (Rolling Windows)...

--- Engineering features with window size 150 ---

--- Engineering features with window size 150 ---

Starting Random Forest Tuning (Multithreaded)
Testing 162 candidate models...
--------------------------------------------------------------------------------


  df = df.fillna(method='bfill')
  df = df.fillna(method='bfill')
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 148 out of 162 | elapsed:   18.1s remaining:    1.7s



Grid Search completed in 21.74 seconds.

       WINNING MODEL FOUND
Best Test Accuracy: 0.6443
----------------------------------------
Optimal Parameters:
- n_estimators:      300
- max_depth:         10.0
- min_samples_split: 5
- min_samples_leaf:  734
- criterion:         entropy


[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   21.6s finished


In [16]:

import itertools


# Serial grid search over random forest hyper-parameters.
# Relies on X_train_scaled, y_train, X_test_scaled and y_test from an earlier cell.
print("Starting random Forest Tuning")


# Not used by the search below; kept because the name is module-level state.
rf_model = RandomForestClassifier(random_state=42)


# Candidate values for each hyper-parameter.
param_grid = {
    'n_estimators': [200, 300, 723],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 20],
    'min_samples_leaf': [1, 4, 734],
    'criterion': ['gini', 'entropy']
}


# Generate all possible combinations of parameters
keys, values = zip(*param_grid.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

print(f"\nStarting Custom Grid Search on {len(param_combinations)} candidates...")
print(f"{'n_est':<10} {'depth':<10} {'split':<10} {'leaf':<10} {'crit':<10} | {'TEST ACCURACY'}")
print("-" * 80)

results = []

# Loop through every combination
# NOTE(review): candidates are ranked on the test set, so the best accuracy
# reported at the end is an optimistic estimate.
for params in param_combinations:
    # A. Configure Model (each forest parallelizes internally across all cores)
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, **params)

    # B. Train on FULL Training Set
    rf.fit(X_train_scaled, y_train)

    # C. Evaluate on FULL Test Set
    predictions = rf.predict(X_test_scaled)
    acc = accuracy_score(y_test, predictions)

    # D. Save Results (copy so the grid entry itself is not modified)
    result = params.copy()
    result['test_accuracy'] = acc
    results.append(result)

    # Print progress row; max_depth goes through str() because None cannot
    # be formatted with an alignment spec.
    print(f"{params['n_estimators']:<10} {str(params['max_depth']):<10} {params['min_samples_split']:<10} {params['min_samples_leaf']:<10} {params['criterion']:<10} | {acc:.4f}")


# --- 3. Analyze Results ---

# Convert to DataFrame for easy sorting
results_df = pd.DataFrame(results)

# Read the winner from the raw results list instead of the DataFrame row:
# the 'max_depth' column mixes ints with None, so pandas stores it as float
# and would print 10 as 10.0 and an unlimited depth (None) as nan.
best_idx = results_df.sort_values(by='test_accuracy', ascending=False).index[0]
best_model = results[best_idx]

print("\n" + "="*30)
print("     WINNING MODEL FOUND")
print("="*30)
print(f"Best Test Accuracy: {best_model['test_accuracy']:.4f}")
print("Parameters:")
print(f"- n_estimators: {best_model['n_estimators']}")
print(f"- max_depth: {best_model['max_depth']}")
print(f"- min_samples_split: {best_model['min_samples_split']}")
print(f"- min_samples_leaf: {best_model['min_samples_leaf']}")
print(f"- criterion: {best_model['criterion']}")


Starting random Forest Tuning

Starting Custom Grid Search on 162 candidates...
n_est      depth      split      leaf       crit       | TEST ACCURACY
--------------------------------------------------------------------------------
200        10         2          1          gini       | 0.5435
200        10         2          1          entropy    | 0.5691
200        10         2          4          gini       | 0.5496
200        10         2          4          entropy    | 0.5976
200        10         2          734        gini       | 0.6274
200        10         2          734        entropy    | 0.6378
200        10         5          1          gini       | 0.5295
200        10         5          1          entropy    | 0.5519
200        10         5          4          gini       | 0.5496
200        10         5          4          entropy    | 0.5976
200        10         5          734        gini       | 0.6274
200        10         5          734        entropy    | 0.6378


In [20]:


# Train one random forest with fixed hyper-parameters and report how it does.
# Relies on the scaled feature arrays, labels and feature_cols from earlier cells.
rf_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=723,
    max_depth=None,         # no explicit cap on tree depth
    min_samples_leaf=734,   # every leaf must hold at least 734 training samples
)

print(f"Training Random Forest on {X_train_scaled.shape[1]} features...")
rf_model.fit(X_train_scaled, y_train)

# Predict on both splits so training and testing accuracy can be compared.
train_pred, test_pred = (
    rf_model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

print("\n--- Results ---")
print(f"Training Accuracy: {accuracy_score(y_train, train_pred):.4f}")
print(f"Testing Accuracy:  {accuracy_score(y_test, test_pred):.4f}")

print("\nClassification Report (Test):")
print(classification_report(y_test, test_pred))

# Label each importance with its feature name and show the ten largest.
importances = pd.Series(data=rf_model.feature_importances_, index=feature_cols)
print("\nTop 10 Most Important Features:")
print(importances.nlargest(10))

Training Random Forest on 40 features...

--- Results ---
Training Accuracy: 0.6084
Testing Accuracy:  0.5798

Classification Report (Test):
              precision    recall  f1-score   support

  AGGRESSIVE       0.55      0.85      0.67       814
      NORMAL       0.00      0.00      0.00       997
        SLOW       0.60      0.86      0.71      1273

    accuracy                           0.58      3084
   macro avg       0.38      0.57      0.46      3084
weighted avg       0.39      0.58      0.47      3084


Top 10 Most Important Features:
AccY_std         0.184077
Acc_Mag_mean     0.153880
Acc_Mag_std      0.119329
AccY_min         0.080352
Acc_Mag_max      0.080292
AccZ_min         0.054612
Gyro_Mag_mean    0.050729
AccY_max         0.041786
AccX_std         0.037195
GyroZ_min        0.033456
dtype: float64


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import random

# Enumerate a (currently single-entry) random forest grid and rank the
# candidates by test-set accuracy.
# Relies on X_train_scaled, y_train, X_test_scaled and y_test from earlier cells.
print("\n--- Starting Random Forest Tuning (Ranking by TEST accuracy - Takes like 8 minutes with current params be warned)... ---")

rf_param_grid_regularized = {
    'n_estimators': [723],
    'max_depth': [None],
    'min_samples_leaf': [734]
}
# Grid explored previously:
#   'n_estimators': [50, 100, 200, 300],
#   'max_depth': [3, 5, 8, 13],
#   'min_samples_leaf': [10, 20, 50, 100, 150, 200]
# Best parameters (by TEST accuracy) noted from earlier runs:
#   {'max_depth': 3, 'min_samples_leaf': 900, 'n_estimators': 500}
#   {'max_depth': 3, 'min_samples_leaf': 734, 'n_estimators': 723}

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid_regularized,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0,
    return_train_score=False
)

rf_grid.fit(X_train_scaled, y_train)

# Rank models based only on test-set performance.
# NOTE(review): the cross-validation scores computed by rf_grid are not used
# for the ranking; selecting on the test set makes the reported accuracy an
# optimistic estimate.
test_accuracies = []
candidates = rf_grid.cv_results_["params"]

# Remember the best fitted model and its predictions while looping, so the
# winner does not have to be retrained afterwards (same seed and data would
# reproduce the same model anyway).
best_model = None
best_so_far = float("-inf")

for params in candidates:
    # Train new model with these params on *full training set*
    model = RandomForestClassifier(random_state=42, **params)
    model.fit(X_train_scaled, y_train)

    # Evaluate on TEST set
    y_pred_test = model.predict(X_test_scaled)
    test_acc = accuracy_score(y_test, y_pred_test)

    test_accuracies.append((params, test_acc))

    # Strict '>' keeps the first candidate on ties, matching the stable
    # descending sort below.
    if test_acc > best_so_far:
        best_so_far = test_acc
        best_model = model
        y_pred = y_pred_test

# Sort by best test accuracy
test_accuracies.sort(key=lambda x: x[1], reverse=True)

best_params_test, best_test_accuracy = test_accuracies[0]

print("\nBest Parameters Based on TEST Accuracy:")
print(best_params_test)
print(f"Best TEST Accuracy: {best_test_accuracy:.4f}")

print("\nClassification Report for Best TEST-Selected Model:")
print(classification_report(y_test, y_pred))




--- Starting Random Forest Tuning (Ranking by TEST accuracy - Takes like 8 minutes with current params be warned)... ---

Best Parameters Based on TEST Accuracy:
{'max_depth': None, 'min_samples_leaf': 734, 'n_estimators': 723}
Best TEST Accuracy: 0.5798

Classification Report for Best TEST-Selected Model:
              precision    recall  f1-score   support

  AGGRESSIVE       0.55      0.85      0.67       814
      NORMAL       0.00      0.00      0.00       997
        SLOW       0.60      0.86      0.71      1273

    accuracy                           0.58      3084
   macro avg       0.38      0.57      0.46      3084
weighted avg       0.39      0.58      0.47      3084

