In [None]:
!pip install imbalanced-learn

import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import FunctionTransformer

# Read the prepared dataset from disk
file_path = 'End_dataframe.csv'
data = pd.read_csv(file_path)

# Predictor columns. 'Gender' is carried along so results can be grouped by
# it later; it is dropped again before the model is fitted or queried.
feature_columns = [
    'Gender',
    'Prior_Donation',
    'Lcheek_max',
    'Rcheek_max',
    'nose_max',
    'chin_max',
    'below_nose_max',
    'HRV_minmax',
]
target_column = 'VVR_Encoded'

# Features and target
X = data[feature_columns]
y = data[target_column]

# Split data into training and test sets (80/20, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 'Gender' holds category codes (gender_labels maps 1 -> Male, 2 -> Female),
# so it must not be interpolated like a continuous measurement: a value
# between 1 and 2 is not a valid code. SMOTENC assigns synthetic rows the
# most frequent category among their nearest neighbours instead.
# NOTE(review): 'Prior_Donation' looks categorical as well -- if it is, add
# its column position to this list.
categorical_idx = [int(X_train.columns.get_loc('Gender'))]
smote_params = {'categorical_features': categorical_idx, 'random_state': 42}

# Balanced copy of the training set, kept for the training-set reports
# further down. The tuned pipeline below resamples X_train with the same
# settings and seed when it is refitted, so this should reproduce the rows
# the final forest was trained on.
smote = SMOTENC(**smote_params)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


def _drop_gender(frame):
    """Return *frame* without 'Gender', which is kept for reporting only."""
    return frame.drop(columns='Gender')


# Define the Random Forest model and parameter grid for Grid Search
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'ccp_alpha': [0.001, 0.01, 0.1, 1],
}

# Oversampling lives INSIDE the pipeline so that, during cross-validation,
# only the training part of each fold is resampled. Resampling the whole
# training set up front lets synthetic points derived from validation rows
# leak into the training folds and inflates the cross-validation score.
# Samplers are skipped at predict time, so validation folds stay untouched.
# NOTE(review): each CV training fold needs enough minority-class rows for
# the sampler's default k_neighbors=5; lower it if the minority class is tiny.
model = Pipeline(steps=[
    ('smote', SMOTENC(**smote_params)),
    ('drop_gender', FunctionTransformer(_drop_gender)),
    ('rf', rf),
])

# GridSearchCV addresses pipeline steps as '<step>__<parameter>'
pipeline_grid = {f'rf__{name}': values for name, values in param_grid.items()}

# Grid Search with cross-validation
# NOTE(review): validation folds now keep the original class balance, so
# plain accuracy may favour the majority class; 'balanced_accuracy' or an
# MCC scorer may be a better selection criterion.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=pipeline_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Print the best parameters (without the pipeline step prefix) and the best score
print("\nOptimal Parameters from Grid Search:")
print({name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()})
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.2f}")

# Best model from Grid Search: the fitted forest inside the refitted
# pipeline. It was trained on the resampled features without 'Gender', so it
# is queried with `<frame>.drop(columns='Gender')`.
best_rf = grid_search.best_estimator_.named_steps['rf']

# Score both splits with the tuned forest. 'Gender' is removed first because
# the model was fitted on the features without it.
test_features = X_test.drop(columns='Gender')
train_features = X_train_resampled.drop(columns='Gender')
y_pred_test = best_rf.predict(test_features)
y_pred_train = best_rf.predict(train_features)


def _print_split_report(header, matrix_header, report_header, actual, predicted):
    """Print confusion matrix and classification report; return the MCC."""
    print(header)
    print(matrix_header)
    print(confusion_matrix(actual, predicted))
    print(report_header)
    print(classification_report(actual, predicted))
    return matthews_corrcoef(actual, predicted)


# Held-out test set
mcc_test = _print_split_report(
    "\nTest Set Performance:",
    "\nConfusion Matrix (Test Set):",
    "\nClassification Report (Test Set):",
    y_test,
    y_pred_test,
)
print(f"MCC (Test): {mcc_test:.2f}")

# Oversampled training set (shows how closely the model fits its own data)
mcc_train = _print_split_report(
    "\nTraining Set Performance:",
    "\nConfusion Matrix (Training Set):",
    "\nClassification Report (Training Set):",
    y_train_resampled,
    y_pred_train,
)
print(f"MCC (Train): {mcc_train:.2f}")

# Gender-wise analysis: the same metrics as above, computed separately for
# each value of the 'Gender' column.
class_labels = {0: 'Low_VVR', 1: 'High_VVR'}
gender_labels = {1: 'Male', 2: 'Female'}


def _print_gender_breakdown(gender_groups, truth_col, pred_col='y_pred'):
    """Print confusion matrix, classification report and MCC per gender group.

    Args:
        gender_groups: result of ``frame.groupby('Gender')``.
        truth_col: name of the column holding the true class labels.
        pred_col: name of the column holding the predicted class labels.
    """
    class_ids = list(class_labels)
    class_names = [class_labels[class_id] for class_id in class_ids]
    for gender, group in gender_groups:
        # Fall back to the raw value for codes missing from gender_labels
        gender_name = gender_labels.get(gender, gender)
        true_labels = group[truth_col]
        predicted_labels = group[pred_col]
        print(f"\nGender: {gender_name}")
        print(confusion_matrix(true_labels, predicted_labels, labels=class_ids))
        # `labels` must be passed explicitly: without it, a group containing
        # only one class makes classification_report raise because the number
        # of classes found no longer matches `target_names`.
        print(classification_report(
            true_labels,
            predicted_labels,
            labels=class_ids,
            target_names=class_names,
            zero_division=0,
        ))
        print(f"MCC: {matthews_corrcoef(true_labels, predicted_labels):.2f}")


# Gender-wise analysis for test set
# `assign` builds a new frame instead of writing into the one returned by
# train_test_split, which avoids pandas' chained-assignment warning while
# still leaving the 'y_test' / 'y_pred' columns available on X_test.
X_test = X_test.assign(y_test=y_test, y_pred=y_pred_test)

print("\nTest Set Performance by Gender:")
test_gender_groups = X_test.groupby('Gender')
_print_gender_breakdown(test_gender_groups, 'y_test')

# Gender-wise analysis for training set
print("\nTraining Set Performance by Gender:")
X_train_resampled = X_train_resampled.assign(
    y_train=y_train_resampled,  # true labels
    y_pred=y_pred_train,  # predictions
)

train_gender_groups = X_train_resampled.groupby('Gender')
_print_gender_breakdown(train_gender_groups, 'y_train')