In [1]:
!pip install imbalanced-learn
!pip install xgboost
!pip install --upgrade numexpr

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Load the dataset
file_path = 'End_dataframe.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Features and target
X = data[['Gender', 'Prior_Donation', 'Lcheek_max', 'Rcheek_max', 'nose_max', 'chin_max', 'below_nose_max', 'HRV_minmax']]
y = data['VVR_Encoded']  # Assuming 'VVR_Encoded' is the target variable

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define the XGBoost model and parameter grid for Grid Search
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
param_grid = {
    'n_estimators': [400, 600, 800],  
    'max_depth': [6, 9],  
    'learning_rate': [0.01, 0.1],  
    'subsample': [0.8, 1.0],  
    'colsample_bytree': [0.8, 1.0],  
    'reg_lambda': [1, 10],
}

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=0)
grid_search.fit(X_train_resampled.drop(columns='Gender'), y_train_resampled)

# Best model from Grid Search
best_xgb = grid_search.best_estimator_

# Make predictions on the test set and training set
y_pred_test = best_xgb.predict(X_test.drop(columns='Gender'))
y_pred_train = best_xgb.predict(X_train_resampled.drop(columns='Gender'))

# Evaluate the model on the test set
print("Best Parameters from Grid Search:")
print(grid_search.best_params_)

# Balanced accuracy and per-class balanced accuracy
def per_class_balanced_accuracy(cm):
    per_class = cm.diagonal() / cm.sum(axis=1)
    return per_class

# Test set performance
test_cm = confusion_matrix(y_test, y_pred_test)
test_balanced_acc = balanced_accuracy_score(y_test, y_pred_test)
test_per_class_balanced_acc = per_class_balanced_accuracy(test_cm)

print("\nTest Set Performance:")
print(f"Balanced Accuracy (Test Set): {test_balanced_acc:.2f}")
for i, acc in enumerate(test_per_class_balanced_acc):
    print(f"Class {i} Balanced Accuracy: {acc:.2f}")
print("\nConfusion Matrix (Test Set):")
print(test_cm)
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Training set performance
train_cm = confusion_matrix(y_train_resampled, y_pred_train)
train_balanced_acc = balanced_accuracy_score(y_train_resampled, y_pred_train)
train_per_class_balanced_acc = per_class_balanced_accuracy(train_cm)

print("\nTraining Set Performance:")
print(f"Balanced Accuracy (Training Set): {train_balanced_acc:.2f}")
for i, acc in enumerate(train_per_class_balanced_acc):
    print(f"Class {i} Balanced Accuracy: {acc:.2f}")
print("\nConfusion Matrix (Training Set):")
print(train_cm)
print("\nClassification Report (Training Set):")
print(classification_report(y_train_resampled, y_pred_train))


Best Parameters from Grid Search:
{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 800, 'reg_lambda': 1, 'subsample': 0.8}

Test Set Performance:
Balanced Accuracy (Test Set): 0.56
Class 0 Balanced Accuracy: 0.75
Class 1 Balanced Accuracy: 0.38

Confusion Matrix (Test Set):
[[69 23]
 [23 14]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.75      0.75      0.75        92
           1       0.38      0.38      0.38        37

    accuracy                           0.64       129
   macro avg       0.56      0.56      0.56       129
weighted avg       0.64      0.64      0.64       129


Training Set Performance:
Balanced Accuracy (Training Set): 1.00
Class 0 Balanced Accuracy: 1.00
Class 1 Balanced Accuracy: 1.00

Confusion Matrix (Training Set):
[[364   0]
 [  0 364]]

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       1.00      1.0