In [1]:
!pip install imbalanced-learn
!pip install xgboost
!pip install --upgrade numexpr

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import numpy as np

# Load the dataset
file_path = 'End_dataframe.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Features and target
X = data[['Gender', 'Prior_Donation', 'Lcheek_max', 'Rcheek_max', 'nose_max', 'chin_max', 'below_nose_max', 'HRV_minmax']]
y = data['VVR_Encoded']  # Assuming 'VVR_Encoded' is the target variable

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define the XGBoost model and parameter grid for Grid Search
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
param_grid = {
    'n_estimators': [400, 600, 800],  
    'max_depth': [6, 9],  
    'learning_rate': [0.01, 0.1],  
    'subsample': [0.8, 1.0],  
    'colsample_bytree': [0.8, 1.0],  
    'reg_lambda': [1, 10],
}

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=0)
grid_search.fit(X_train_resampled, y_train_resampled)

# Best model from Grid Search
best_xgb = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_xgb.predict(X_test)

# Evaluate the model
print("Best Parameters from Grid Search:")
print(grid_search.best_params_)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Overall cross-validation scores (on training set)
cv_scores = cross_val_score(best_xgb, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
cv_mean = np.mean(cv_scores)
cv_variance = np.var(cv_scores)

print("\nOverall Cross-Validation Scores (Training Set):")
print("Scores:", cv_scores)
print("Mean accuracy:", cv_mean)
print("Variance:", cv_variance)

# Cross-validation scores by Gender
gender_labels = {1: 'Male', 2: 'Female'}
print("\nCross-Validation Scores by Gender:")

for gender, gender_name in gender_labels.items():
    # Filter data by gender
    gender_mask = X_train_resampled['Gender'] == gender
    X_gender = X_train_resampled[gender_mask].drop(columns='Gender')  # Drop Gender column for training
    y_gender = y_train_resampled[gender_mask]

    # Perform cross-validation
    cv_scores_gender = cross_val_score(best_xgb, X_gender, y_gender, cv=5, scoring='accuracy')
    print(f"\nGender: {gender_name}")
    print("Scores:", cv_scores_gender)
    print("Mean accuracy:", np.mean(cv_scores_gender))
    print("Variance:", np.var(cv_scores_gender))




Collecting xgboost
  Using cached xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Using cached nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Using cached xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
Using cached nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.3
Best Parameters from Grid Search:
{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 600, 'reg_lambda': 1, 'subsample': 1.0}

Confusion Matrix:
[[64 28]
 [23 14]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.70      0.72        92
           1       0.33      0.38      0.35        37

    accuracy                           0.60       129
   macro avg       0.53      0.54      0.53       129
weig