In [1]:
!pip install imbalanced-learn
!pip install --upgrade numexpr

import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

# Load the dataset
file_path = 'End_dataframe.csv' 
data = pd.read_csv(file_path)

# Features and target
X = data[['Gender', 'Prior_Donation', 'Lcheek_max', 'Rcheek_max', 'nose_max', 'chin_max', 'below_nose_max', 'HRV_minmax']]
y = data['VVR_Encoded']

# Split data into training and test sets
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Reassign the Gender column to the resampled dataset
X_train_resampled['Gender'] = smote.fit_resample(X_train[['Gender']], y_train)[0]['Gender'].values

# Define the Random Forest model and parameter grid for Grid Search
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.001, 0.01, 0.1, 1]
}

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_resampled.drop(columns='Gender'), y_train_resampled)  # Drop Gender for training

# Best model from Grid Search
best_rf = grid_search.best_estimator_

# Cross-validation scores (Overall)
cv_scores = cross_val_score(best_rf, X_train_resampled.drop(columns='Gender'), y_train_resampled, cv=5, scoring='accuracy')
cv_mean = np.mean(cv_scores)
cv_variance = np.var(cv_scores)

# Print overall cross-validation scores
print("\nOverall Cross-Validation Scores (Training Set):")
print("Scores:", cv_scores)
print("Mean accuracy:", cv_mean)
print("Variance:", cv_variance)

# Cross-validation scores by Gender
gender_labels = {1: 'Male', 2: 'Female'}

print("\nCross-Validation Scores by Gender:")

for gender, gender_name in gender_labels.items():
    gender_mask = X_train_resampled['Gender'] == gender
    X_gender = X_train_resampled[gender_mask].drop(columns='Gender')  # Drop Gender column for training
    y_gender = y_train_resampled[gender_mask]

    cv_scores_gender = cross_val_score(best_rf, X_gender, y_gender, cv=5, scoring='accuracy')
    print(f"\nGender: {gender_name}")
    print("Scores:", cv_scores_gender)
    print("Mean accuracy:", np.mean(cv_scores_gender))
    print("Variance:", np.var(cv_scores_gender))




Overall Cross-Validation Scores (Training Set):
Scores: [0.69863014 0.68493151 0.69178082 0.77241379 0.8       ]
Mean accuracy: 0.7295512517713746
Variance: 0.0022347731848416986

Cross-Validation Scores by Gender:

Gender: Male
Scores: [0.69491525 0.69491525 0.71186441 0.86206897 0.70689655]
Mean accuracy: 0.7341320864991234
Variance: 0.004136277758554077

Gender: Female
Scores: [0.71264368 0.75862069 0.73563218 0.72413793 0.8045977 ]
Mean accuracy: 0.7471264367816092
Variance: 0.0010569427929713305
