In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import randint
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

df = pd.read_csv('20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv')

features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10']
target = 'CallerSex'

df = df.dropna(subset=[target])
X = df[features].fillna(df[features].mean())
y = df[target]

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20)
}

rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train) 
best_rf = random_search.best_estimator_

def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    results = {
        'Model': model_name,
        'Train Accuracy': accuracy_score(y_train, y_pred_train),
        'Test Accuracy': accuracy_score(y_test, y_pred_test),
        'Test Precision': precision_score(y_test, y_pred_test, average='weighted', zero_division=0),
        'Test Recall': recall_score(y_test, y_pred_test, average='weighted', zero_division=0),
        'Test F1-Score': f1_score(y_test, y_pred_test, average='weighted', zero_division=0)
    }
    return results

classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest (Tuned)': best_rf,
    'Support Vector Machine': SVC(random_state=42),
    'Naive Bayes': GaussianNB(),
    'MLP': MLPClassifier(random_state=42, max_iter=500),
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}


results_list = []
for name, clf in classifiers.items():
    results = evaluate_model(clf, X_train, y_train, X_test, y_test, name)
    results_list.append(results)

results_df = pd.DataFrame(results_list)
results_df.set_index('Model', inplace=True)

best_model = results_df['Test Accuracy'].idxmax()
best_score = results_df['Test Accuracy'].max()

print(f"Best Classifier: {best_model} (Test Accuracy: {best_score:.4f})\n")
print("Tabulated Results for Classification Models:")
print(results_df)

Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /dev/shm/joblib_memmapping_folder_39414_56c17be61cc64a10a240cfc0052a434e_52e8430af5a44cd382de2ff010556694 for automatic cleanup: unknown resource type folder[0m
Traceback (most recent call last):
  File [35m"/home/easwer/.local/share/mise/installs/python/3.13.7/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /dev/shm/joblib_memmapping_folder_39414_96677c7dd15d40e987d8c03269816eaa_a69a97bb05d94d5aaa2f55c4417b1882 for automa

Best Classifier: Random Forest (Tuned) (Test Accuracy: 0.9468)

Tabulated Results for Classification Models:
                        Train Accuracy  Test Accuracy  Test Precision  \
Model                                                                   
Decision Tree                 1.000000       0.914894        0.914894   
Random Forest (Tuned)         0.946667       0.946809        0.896446   
Support Vector Machine        0.970667       0.946809        0.896446   
Naive Bayes                   0.949333       0.914894        0.929358   
MLP                           0.946667       0.946809        0.896446   
XGBoost                       1.000000       0.946809        0.932239   
CatBoost                      1.000000       0.925532        0.918026   

                        Test Recall  Test F1-Score  
Model                                               
Decision Tree              0.914894       0.914894  
Random Forest (Tuned)      0.946809       0.920939  
Support Vector Machin