In [1]:
# Import necessary libraries
import pickle
import time

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import LeaveOneGroupOut, train_test_split, cross_val_score, cross_validate, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [2]:
# Load the dataset.
# The file is opened in a `with` block so the handle is closed as soon as the
# object has been read, instead of staying open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load features.pkl when it comes from a trusted source.
with open("../exercises/features.pkl", "rb") as features_file:
    df1 = pickle.load(features_file)

In [3]:
# Extract features and target from the full dataset
X = df1.drop(columns=['activity', 'participantId'])
y = df1['activity']

# Split the dataset into train (70%) and test (30%) sets, stratifying on the
# (activity, participantId) pair. random_state pins the shuffle so the
# partition -- and every score computed from it -- is the same on each run.
train, test = train_test_split(
    df1,
    test_size=0.3,
    stratify=df1[['activity', 'participantId']],
    random_state=1,
)

# participantId of every training row; used as the grouping key for
# group-wise cross-validation.
groups = train['participantId']

In [4]:
# Pull the target column out of each partition
y_train, y_test = train['activity'], test['activity']

# Everything except the target and the participant identifier is a feature
X_train = train.drop(['activity', 'participantId'], axis=1)
X_test = test.drop(['activity', 'participantId'], axis=1)

In [5]:
# Replace the label Series with its underlying array of values; the
# cross-validation loop indexes it with integer position arrays.
y_train = y_train.values

# Scale features with RobustScaler. The scaler is fitted on the training
# partition only and then applied unchanged to both partitions, so no
# statistics from the test partition influence the scaling.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Leave-One-Group-Out cross-validation over the training partition.
# `groups` holds the participantId of each training row. The splits are
# materialised into a list so they can be iterated more than once and so
# tqdm can report a total.
logo = LeaveOneGroupOut()
logo_splits = [fold for fold in logo.split(X_train, y_train, groups)]

In [None]:
# Define classifiers to compare.
# The tree-based models are randomised, so they get a fixed random_state to
# make the reported scores repeatable from run to run.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=1),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=1)
}

# Per-classifier score lists; each list receives one entry per CV split
results = {name: {'accuracy': [], 'precision': [], 'recall': [], 'f1': []} for name in classifiers}

for train_index, test_index in tqdm(logo_splits, desc="Cross-validation splits"):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]

    for name, clf in tqdm(classifiers.items(), desc="Classifiers", leave=False):
        # fit() discards any previous fit, so reusing the same estimator
        # object across splits does not carry state from one split to the next.
        clf.fit(X_train_cv, y_train_cv)
        y_pred = clf.predict(X_test_cv)

        # zero_division=0: a class the model never predicts contributes a
        # precision of 0 rather than 1. Using 1 would reward the model for
        # missing a class entirely and inflate the weighted averages.
        split_scores = results[name]
        split_scores['accuracy'].append(accuracy_score(y_test_cv, y_pred))
        split_scores['precision'].append(precision_score(y_test_cv, y_pred, average='weighted', zero_division=0))
        split_scores['recall'].append(recall_score(y_test_cv, y_pred, average='weighted', zero_division=0))
        split_scores['f1'].append(f1_score(y_test_cv, y_pred, average='weighted', zero_division=0))

# Print the mean of each metric across all splits
for name, metrics in results.items():
    print(f"{name} results:")
    print(f"Accuracy: {sum(metrics['accuracy'])/len(metrics['accuracy'])}")
    print(f"Precision: {sum(metrics['precision'])/len(metrics['precision'])}")
    print(f"Recall: {sum(metrics['recall'])/len(metrics['recall'])}")
    print(f"F1-Score: {sum(metrics['f1'])/len(metrics['f1'])}")
    print("\n")

Cross-validation splits:   0%|          | 0/21 [00:00<?, ?it/s]
Classifiers:   0%|          | 0/5 [00:00<?, ?it/s][A
Classifiers:  20%|██        | 1/5 [00:06<00:27,  6.90s/it][A
Classifiers:  40%|████      | 2/5 [00:49<01:22, 27.64s/it][A
Classifiers:  60%|██████    | 3/5 [01:03<00:43, 21.58s/it][A
Classifiers:  80%|████████  | 4/5 [01:25<00:21, 21.81s/it][A
Classifiers: 100%|██████████| 5/5 [01:26<00:00, 14.45s/it][A
Cross-validation splits:   5%|▍         | 1/21 [01:27<29:00, 87.00s/it]
Classifiers:   0%|          | 0/5 [00:00<?, ?it/s][A
Classifiers:  20%|██        | 1/5 [00:08<00:32,  8.23s/it][A
Classifiers:  40%|████      | 2/5 [00:49<01:23, 27.80s/it][A
Classifiers:  60%|██████    | 3/5 [01:04<00:43, 22.00s/it][A
Classifiers:  80%|████████  | 4/5 [01:07<00:14, 14.23s/it][A
Classifiers: 100%|██████████| 5/5 [01:08<00:00,  9.68s/it][A
Cross-validation splits:  10%|▉         | 2/21 [02:35<24:09, 76.27s/it]
Classifiers:   0%|          | 0/5 [00:00<?, ?it/s][A
Classifier

In [None]:
# Perform 10-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=1)

for name, clf in tqdm(classifiers.items(), desc='Classifiers'):
    # One cross_validate pass fits each fold once and scores it with all four
    # metrics. Calling cross_val_score once per metric would refit every model
    # four times and, for randomised models, report metrics that come from
    # different fits.
    fold_scores = cross_validate(
        clf, X_train, y_train, cv=kf,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = fold_scores['test_accuracy']
    precision = fold_scores['test_precision_weighted']
    recall = fold_scores['test_recall_weighted']
    f1 = fold_scores['test_f1_weighted']

    print(f"{name} 10-Fold CV results:")
    print(f"Accuracy: {accuracy.mean()}")
    print(f"Precision: {precision.mean()}")
    print(f"Recall: {recall.mean()}")
    print(f"F1-Score: {f1.mean()}")
    print("\n")

In [None]:
# Feature Subset Selection
# Both models are seeded so the importance ranking, and therefore the chosen
# feature subset, is the same on every run.
best_clf = RandomForestClassifier(random_state=1)
comparison_clf = DecisionTreeClassifier(random_state=1)
best_clf.fit(X_train, y_train)
feature_importances = best_clf.feature_importances_

# Select top 10 features: column indices ordered by descending importance.
# NOTE(review): the ranking is computed on all of X_train before the
# cross-validation below, so every validation fold has influenced the
# selection -- the CV scores are likely optimistic. Confirm this is acceptable.
top_n_features = sorted(range(len(feature_importances)), key=lambda i: feature_importances[i], reverse=True)[:10]
X_train_subset = X_train[:, top_n_features]
X_test_subset = X_test[:, top_n_features]

# Metrics collected in a single cross_validate pass per model, so all four
# numbers describe the same fitted folds.
subset_scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# Re-run the best model with the subset of features
rf_scores = cross_validate(best_clf, X_train_subset, y_train, cv=kf, scoring=subset_scoring)
accuracy = rf_scores['test_accuracy']
precision = rf_scores['test_precision_weighted']
recall = rf_scores['test_recall_weighted']
f1 = rf_scores['test_f1_weighted']

# Print results for the best classifier with top 10 features
print("Random Forest with Top 10 Features 10-Fold CV results:")
print(f"Accuracy: {accuracy.mean()}")
print(f"Precision: {precision.mean()}")
print(f"Recall: {recall.mean()}")
print(f"F1-Score: {f1.mean()}")
print("\n")

# Re-run the comparison model with the subset of features
dt_scores = cross_validate(comparison_clf, X_train_subset, y_train, cv=kf, scoring=subset_scoring)
accuracy_dt = dt_scores['test_accuracy']
precision_dt = dt_scores['test_precision_weighted']
recall_dt = dt_scores['test_recall_weighted']
f1_dt = dt_scores['test_f1_weighted']

# Print results for the comparison classifier with top 10 features.
# The heading names the Decision Tree; it previously repeated the Random
# Forest heading, which made the two result blocks indistinguishable.
print("Decision Tree with Top 10 Features 10-Fold CV results:")
print(f"Accuracy: {accuracy_dt.mean()}")
print(f"Precision: {precision_dt.mean()}")
print(f"Recall: {recall_dt.mean()}")
print(f"F1-Score: {f1_dt.mean()}")
print("\n")

In [None]:
# Recursive Feature Elimination (RFE)
# RFE ranks features through the fitted estimator's coef_ or
# feature_importances_. Those attributes only exist after fit(), so probing
# the objects in `classifiers` with hasattr() gives an answer that depends on
# whether an earlier cell happened to fit them. Decide by estimator type
# instead, which does not depend on execution history.
rfe_capable_types = (LogisticRegression, RandomForestClassifier, DecisionTreeClassifier)

for name, clf in tqdm(classifiers.items(), desc='RFE Classifiers'):
    # An SVC exposes coef_ only with a linear kernel
    supports_rfe = isinstance(clf, rfe_capable_types) or (
        isinstance(clf, SVC) and clf.kernel == 'linear'
    )
    if not supports_rfe:
        continue

    # NOTE(review): RFE is fitted on all of X_train before cross-validating,
    # so the validation folds have influenced the selection and the scores
    # are likely optimistic. Confirm this is acceptable.
    rfe = RFE(estimator=clf, n_features_to_select=10)
    rfe.fit(X_train, y_train)

    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)

    # Single pass over the folds for all four metrics
    rfe_scores = cross_validate(
        clf, X_train_rfe, y_train, cv=kf,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    )
    accuracy = rfe_scores['test_accuracy']
    precision = rfe_scores['test_precision_weighted']
    recall = rfe_scores['test_recall_weighted']
    f1 = rfe_scores['test_f1_weighted']

    print(f"{name} with RFE 10-Fold CV results:")
    print(f"Accuracy: {accuracy.mean()}")
    print(f"Precision: {precision.mean()}")
    print(f"Recall: {recall.mean()}")
    print(f"F1-Score: {f1.mean()}")
    print("\n")