In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from scipy.stats import randint, uniform  # Utilities for defining hyperparameter search spaces


# --- Load the feature table and build the train/test matrices ---
df = pd.read_csv(r'/content/final_min_features_filtered.csv')

# 'label' is the prediction target. It is dropped from the feature matrix
# together with 'file', 'run' and 'onset_s'; every remaining column is a feature.
y = df['label']
X = df.drop(columns=['file', 'run', 'label', 'onset_s'])

# Map each distinct label to an integer code. The fitted encoder is kept in
# `le` so that codes can be translated back with le.inverse_transform().
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# Hold out 30% of the rows for testing. Stratifying on the encoded labels keeps
# the class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=42,
)

# Standardise every feature to zero mean and unit variance (scale-sensitive
# models such as SVM and MLP need this). The scaler learns its statistics from
# the training rows only and the same transform is then applied to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data Preprocessing Complete.\n")


# Define models and hyperparameter spaces
#
# models_to_tune maps a display name to a pair:
#   (unfitted estimator, param_distributions for RandomizedSearchCV)
# Insertion order is the order in which the models are tuned and reported.
#
# Reading the scipy.stats distributions used below:
#   uniform(loc, scale) samples floats from [loc, loc + scale]
#   randint(low, high)  samples integers from low to high - 1
# A plain list is sampled uniformly from its elements.
models_to_tune = {
    'SVM': (
        # probability=True enables predict_proba() at the cost of an extra
        # internal cross-validation during fit. It is not used in the code
        # shown here; kept in case later cells rely on it.
        SVC(probability=True, random_state=42),
        {
            # C: regularisation strength (trade-off between a smooth decision
            # boundary and fitting the training points); sampled in [0.1, 10.1].
            'C': uniform(0.1, 10),
            # gamma: kernel coefficient; sampled in [0.001, 0.101].
            'gamma': uniform(0.001, 0.1),
        },
    ),
    'Decision Tree': (
        DecisionTreeClassifier(random_state=42),
        # Maximum tree depth, integers 5..49.
        {'max_depth': randint(5, 50)},
    ),
    'Random Forest': (
        # n_jobs=-1: build the trees of the forest using all available cores.
        RandomForestClassifier(random_state=42, n_jobs=-1),
        {
            'n_estimators': randint(100, 200),  # number of trees, 100..199
            'max_depth': randint(10, 50),       # per-tree depth limit, 10..49
        },
    ),
    'AdaBoost': (
        # Boosting: weak learners are fitted one after another, each focusing
        # on the samples the previous ones got wrong, then combined.
        AdaBoostClassifier(random_state=42),
        {
            'n_estimators': randint(50, 200),      # 50..199 boosting rounds
            # Shrinks each learner's contribution; sampled in [0.01, 1.01].
            # Smaller values generally need more estimators.
            'learning_rate': uniform(0.01, 1.0),
        },
    ),
    'CatBoost': (
        # silent=True only suppresses CatBoost's per-iteration training log.
        CatBoostClassifier(random_state=42, silent=True),
        {
            'iterations': randint(100, 200),       # 100..199 boosting rounds
            'learning_rate': uniform(0.01, 0.3),   # sampled in [0.01, 0.31]
        },
    ),
    'XGBoost': (
        # eval_metric='mlogloss': multi-class logarithmic loss (lower is better;
        # confident wrong predictions are penalised most).
        # `use_label_encoder` is intentionally not passed: the installed XGBoost
        # reports it as an unused parameter, and the labels are already
        # integer-encoded before training.
        XGBClassifier(eval_metric='mlogloss', random_state=42),
        {
            'n_estimators': randint(100, 200),  # 100..199 boosting rounds
            'max_depth': randint(3, 10),        # tree depth, 3..9
        },
    ),
    'Gaussian NB': (
        # Probabilistic classifier based on Bayes' theorem with Gaussian
        # per-feature likelihoods.
        GaussianNB(),
        # var_smoothing adds a small fraction of the largest feature variance to
        # every variance, guarding against division by (near-)zero variance.
        {'var_smoothing': uniform(1e-10, 1e-7)},
    ),
    'MLP Classifier': (
        MLPClassifier(random_state=42, max_iter=500),
        {
            # A single hidden layer with either 50 or 100 neurons.
            'hidden_layer_sizes': [(50,), (100,)],
            # alpha: L2 penalty that keeps weights small to limit overfitting;
            # sampled in [0.0001, 0.0101].
            'alpha': uniform(0.0001, 0.01),
        },
    ),
}

# Hyperparameter tuning with RandomizedSearchCV
#
# Settings shared by every search: 5 randomly sampled candidates per model,
# each evaluated with 2-fold cross-validation on the training set, seeded
# sampling for repeatability, and all CPU cores. No `scoring` is passed, so
# candidates are ranked by each estimator's default score.
search_settings = dict(n_iter=5, cv=2, random_state=42, n_jobs=-1)

best_models = {}
for name, spec in models_to_tune.items():
    model, params = spec
    print(f"Rapidly tuning {name}...")

    random_search = RandomizedSearchCV(
        model, param_distributions=params, **search_settings
    )
    random_search.fit(X_train_scaled, y_train)

    # Keep the best candidate found for this model.
    best_models[name] = random_search.best_estimator_
    print(f"{name} tuning complete.")


#  Evaluate tuned models on train & test sets
print("\nStep 3: Evaluating All Tuned Models...")


def _score_predictions(y_true, y_pred):
    """Return accuracy and macro-averaged F1 for one set of predictions.

    Macro averaging weights every class equally, so a model that only ever
    predicts the majority class is not rewarded for it.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1-score': f1_score(y_true, y_pred, average='macro'),
    }


# Keys are "<model name>_Train" / "<model name>_Test"; values are metric dicts.
results = {}
for name, model in best_models.items():
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    results[f"{name}_Train"] = _score_predictions(y_train, y_train_pred)
    results[f"{name}_Test"] = _score_predictions(y_test, y_test_pred)


# Organize results in a clean table
# One row per "<model name>_<dataset>" key, one column per metric.
results_df = pd.DataFrame.from_dict(results, orient='index')

# Turn "SVM_Train" into ("SVM", "Train"). The key is split once, on its LAST
# underscore only, so a model name that itself contains an underscore still
# ends up intact in the 'Model' level.
results_df.index = pd.MultiIndex.from_tuples(
    [tuple(key.rsplit('_', 1)) for key in results_df.index],
    names=['Model', 'Dataset'],
)

print("\nRAPID PERFORMANCE REPORT")
print(results_df.round(3))


Data Preprocessing Complete.

Rapidly tuning SVM...
SVM tuning complete.
Rapidly tuning Decision Tree...
Decision Tree tuning complete.
Rapidly tuning Random Forest...
Random Forest tuning complete.
Rapidly tuning AdaBoost...
AdaBoost tuning complete.
Rapidly tuning CatBoost...




CatBoost tuning complete.
Rapidly tuning XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost tuning complete.
Rapidly tuning Gaussian NB...
Gaussian NB tuning complete.
Rapidly tuning MLP Classifier...
MLP Classifier tuning complete.

Step 3: Evaluating All Tuned Models...

RAPID PERFORMANCE REPORT
                        Accuracy  F1-score
Model          Dataset                    
SVM            Train       0.964     0.951
               Test        0.544     0.340
Decision Tree  Train       0.849     0.812
               Test        0.468     0.305
Random Forest  Train       1.000     1.000
               Test        0.553     0.326
AdaBoost       Train       0.539     0.253
               Test        0.523     0.223
CatBoost       Train       0.886     0.874
               Test        0.556     0.345
XGBoost        Train       1.000     1.000
               Test        0.566     0.386
Gaussian NB    Train       0.238     0.194
               Test        0.231     0.196
MLP Classifier Train       0.862     0.816
               Test        0.547     0.423
