In [1]:
!pip install catboost



Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import randint, uniform
import warnings

# --- Importing Classifiers ---
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# --- Configuration ---
# Silence FutureWarning / UserWarning so the tuning log stays readable.
# NOTE: these filters are process-wide, so they also hide warnings emitted by
# every library used further down in this notebook.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- 1. Data Loading and Preprocessing ---
print("--- Step 1: Loading and Preprocessing Data ---")
try:
    # low_memory=False makes pandas infer each column's dtype from the whole file
    # instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
    # Non-numeric values are still handled explicitly by the cleaning step below.
    df = pd.read_csv("features_filtered.csv", low_memory=False)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel to
    # shut down rather than cleanly stopping the cell, so the rest of the cell may
    # still run without `df`. Raising aborts the cell and keeps the kernel alive.
    raise FileNotFoundError(
        "Error: 'features_filtered.csv' not found. Please ensure the file is in the correct directory."
    ) from err

# Separate features (X) from the target variable (y).
# 'file', 'run' and 'onset_s' are dropped together with the target column 'label'
# because they are metadata, not predictive features.
X = df.drop(columns=['file', 'run', 'label', 'onset_s'])
y = df['label']

# --- Data Cleaning Step ---
# Some columns may contain values that cannot be parsed as numbers (e.g. truncated
# numbers). Every feature column is coerced to a numeric dtype; values that fail
# to parse become NaN and are then replaced by the column's median.
#
# NOTE: the medians are computed on the full dataset, i.e. before the train/test
# split performed further down, so a small amount of test-set information leaks
# into the imputed values.
print("Cleaning non-numeric data from feature set...")
for col in X.columns:
    # errors='coerce' turns every unparseable value into NaN.
    numeric_col = pd.to_numeric(X[col], errors='coerce')
    if numeric_col.isnull().any():
        # Series.median() skips NaN by default. If the whole column is NaN the
        # median is NaN as well and the column is left unfilled.
        numeric_col = numeric_col.fillna(numeric_col.median())
    # Assign the result back explicitly. The previous
    # `X[col].fillna(..., inplace=True)` operated on a temporary Series, which
    # is not guaranteed to update X (and never does under pandas Copy-on-Write).
    X[col] = numeric_col
print("Data cleaning complete.")


# Map the categorical 'label' values to integer class codes. The fitted encoder
# is kept in `le` so the codes can be translated back to the original labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 70% / 30% train/test split. Stratifying on the encoded labels keeps the class
# proportions the same in both partitions, which matters for imbalanced data.
split_options = dict(test_size=0.3, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# Standardize the features (zero mean, unit variance). The scaler is fitted on the
# training partition only and then applied to both partitions, so no test-set
# statistics enter the scaling. Scaling matters for SVM and MLP in particular.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data Loading and Preprocessing Complete.\n")


# --- 2. Model and Hyperparameter Space Definition ---
print("--- Step 2: Defining Models and Hyperparameter Search Spaces ---")

# Search-space conventions (scipy.stats):
#   uniform(loc, scale) samples floats from [loc, loc + scale]
#   randint(low, high)  samples integers from [low, high)
# A plain list means "pick one of these values".

# 2.1 Support Vector Machine (SVM)
svm_params = {
    'C': uniform(0.1, 10),
    'gamma': uniform(0.001, 0.1),
    'kernel': ['rbf'],
}

# 2.2 Decision Tree
dt_params = {
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'criterion': ['gini', 'entropy'],
}

# 2.3 Random Forest
rf_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
}

# 2.4 AdaBoost
ada_params = {
    'n_estimators': randint(50, 500),
    'learning_rate': uniform(0.01, 1.0),
}

# 2.5 CatBoost
cat_params = {
    'iterations': randint(100, 500),
    'learning_rate': uniform(0.01, 0.3),
    'depth': randint(4, 10),
    'l2_leaf_reg': uniform(1, 10),
}

# 2.6 XGBoost
xgb_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
}

# 2.7 Gaussian Naive Bayes
# Only 'var_smoothing' (a numerical-stability term) is tuned for this model.
gnb_params = {
    'var_smoothing': uniform(1e-10, 1e-7),
}

# 2.8 MLP Classifier (Neural Network)
mlp_params = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['tanh', 'relu'],
    'alpha': uniform(0.0001, 0.01),
    'learning_rate_init': uniform(0.001, 0.1),
}

# Display name -> (untrained estimator, hyperparameter distributions).
# Insertion order is the order in which the models are tuned and reported.
models_to_tune = {
    'SVM': (SVC(probability=True, random_state=42), svm_params),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), dt_params),
    'Random Forest': (RandomForestClassifier(random_state=42, n_jobs=-1), rf_params),
    'AdaBoost': (AdaBoostClassifier(random_state=42), ada_params),
    'CatBoost': (CatBoostClassifier(random_state=42, silent=True), cat_params),
    'XGBoost': (XGBClassifier(eval_metric='mlogloss', random_state=42), xgb_params),
    'Gaussian NB': (GaussianNB(), gnb_params),
    # early_stopping=True: training stops once the validation score has not
    # improved for n_iter_no_change consecutive iterations.
    'MLP Classifier': (
        MLPClassifier(random_state=42, max_iter=500, early_stopping=True, n_iter_no_change=10),
        mlp_params,
    ),
}

print("Model definitions are ready.\n")


# --- 3. Hyperparameter Tuning using RandomizedSearchCV ---
print("--- Step 3: Performing Hyperparameter Tuning ---")

# Settings shared by every randomized search:
#   n_iter       - number of sampled parameter settings (runtime vs. quality trade-off)
#   cv           - number of cross-validation folds (3 is faster, 5 is more robust)
#   random_state - makes the sampled settings reproducible
#   n_jobs=-1    - use all available CPU cores
# No `scoring` is passed, so each estimator's default scorer is used.
search_settings = dict(n_iter=20, cv=3, random_state=42, n_jobs=-1)

# Display name -> best estimator found by the search for that model.
best_models = {}
for name, (model, params) in models_to_tune.items():
    print(f"Tuning {name}...")
    random_search = RandomizedSearchCV(model, param_distributions=params, **search_settings)
    random_search.fit(X_train_scaled, y_train)
    best_models[name] = random_search.best_estimator_
    print(f"✓ {name} tuning complete. Best score: {random_search.best_score_:.4f}\n")


# --- 4. Model Evaluation ---
print("--- Step 4: Evaluating All Tuned Models ---")


def _macro_metrics(y_true, y_pred):
    """Return accuracy plus macro-averaged precision, recall and F1 as a dict.

    zero_division=0 scores a class with no predicted (or no true) samples as 0
    instead of emitting a warning.
    """
    macro = dict(average='macro', zero_division=0)
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, **macro),
        'Recall': recall_score(y_true, y_pred, **macro),
        'F1-score': f1_score(y_true, y_pred, **macro),
    }


# Keys are "<model name>_Train" / "<model name>_Test"; values are metric dicts.
results = {}
for name, model in best_models.items():
    # Training-set predictions expose overfitting; test-set predictions measure
    # generalization to unseen data.
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    results[f"{name}_Train"] = _macro_metrics(y_train, y_train_pred)
    results[f"{name}_Test"] = _macro_metrics(y_test, y_test_pred)

print("Evaluation complete.\n")

# --- 5. Final Performance Report ---
# One row per "<model>_<Train|Test>" key, one column per metric.
results_df = pd.DataFrame.from_dict(results, orient='index')

# Split each key into (Model, Dataset) for a two-level index. rsplit with
# maxsplit=1 splits on the LAST underscore only, so a model name that itself
# contains an underscore is kept intact and the Train/Test suffix is never lost.
results_df.index = pd.MultiIndex.from_tuples(
    [tuple(key.rsplit('_', 1)) for key in results_df.index],
    names=['Model', 'Dataset']
)

print("--- COMPREHENSIVE PERFORMANCE REPORT ---")
print("The table below compares the performance of each model on both the training and test datasets.")
print("Observations to make:")
print("1. High Train score and low Test score indicates overfitting.")
print("2. Similar scores on both sets suggest a well-generalized model.")
print("------------------------------------------\n")
# Display the final results, rounded to 3 decimal places for readability.
print(results_df.round(3))



--- Step 1: Loading and Preprocessing Data ---
Cleaning non-numeric data from feature set...
Data cleaning complete.
Data Loading and Preprocessing Complete.

--- Step 2: Defining Models and Hyperparameter Search Spaces ---
Model definitions are ready.

--- Step 3: Performing Hyperparameter Tuning ---
Tuning SVM...
✓ SVM tuning complete. Best score: 0.5850

Tuning Decision Tree...
✓ Decision Tree tuning complete. Best score: 0.5126

Tuning Random Forest...
✓ Random Forest tuning complete. Best score: 0.5493

Tuning AdaBoost...
✓ AdaBoost tuning complete. Best score: 0.5310

Tuning CatBoost...
