In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# Load the raw dataset from disk.
data = pd.read_csv('depression_vitals_dataset.csv')

# Feature engineering: every column except 'target' is a feature,
# and 'target' is the label to predict.
X = data.drop('target', axis=1)
y = data['target']

# Expand the feature set with all degree-2 terms (the original columns,
# their squares and their pairwise products). include_bias=False leaves out
# the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Hold out 20% of the rows for the final evaluation. stratify=y keeps the
# class proportions identical in the train and test splits; without it an
# imbalanced target can end up with a test set whose class mix differs
# noticeably from the training data, which distorts the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class(es) of the training split only; the test
# split is deliberately left untouched so it reflects the real class mix.
# Caveat: cross-validating directly on this resampled set lets synthetic
# points derived from a validation fold appear in the training folds, so
# CV scores computed on it are optimistic.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [None]:
# Modelling pipeline: oversample -> scale -> XGBoost.
# SMOTE is a step of an imbalanced-learn pipeline rather than a one-off
# preprocessing pass, so during cross-validation it is re-fitted on the
# training portion of every fold and never sees the validation portion.
# Resampling the whole training set up front would let synthetic samples
# interpolated from validation rows leak into the training folds and inflate
# the cross-validation scores. Samplers are skipped at predict time, so
# predictions on held-out data are made on the data as-is.
pipeline_xgb = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# Hyperparameter search space; the 'clf__' prefix routes each entry to the
# XGBoost step of the pipeline.
param_grid_xgb = {
    'clf__n_estimators': [100, 200, 300, 400],
    'clf__max_depth': [3, 4, 5, 6, 7],
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.15],
    'clf__subsample': [0.8, 0.9, 1.0],
    'clf__colsample_bytree': [0.8, 0.9, 1.0],
    'clf__min_child_weight': [1, 3, 5],
    'clf__gamma': [0, 0.1, 0.2]
}

# Randomized search: 50 sampled configurations, 5-fold CV each. Four metrics
# are tracked, and the final model is refit on the full training split using
# the configuration with the best mean F1.
rand_search_xgb = RandomizedSearchCV(
    pipeline_xgb,
    param_grid_xgb,
    cv=5,
    scoring=['accuracy', 'f1', 'precision', 'recall'],
    refit='f1',
    n_iter=50,
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Fit on the *un-resampled* training split; the pipeline's SMOTE step does
# the balancing inside each fold.
rand_search_xgb.fit(X_train, y_train)

In [None]:
# Report the selected hyperparameters and the mean cross-validation score of
# that configuration for every tracked metric.
print("Best parameters:", rand_search_xgb.best_params_)
print("Best cross-validation scores:")
for metric in ['accuracy', 'f1', 'precision', 'recall']:
    print(f"{metric}: {rand_search_xgb.cv_results_[f'mean_test_{metric}'][rand_search_xgb.best_index_]:.3f}")

# Evaluate the refit best pipeline on the held-out test split.
y_pred = rand_search_xgb.predict(X_test)
print("\nTest Set Performance:")
print(classification_report(y_test, y_pred))

# Feature importances of the fitted XGBoost step, labelled with the real
# polynomial feature names (e.g. "a", "a^2", "a b") instead of positional
# placeholders, so the chart can actually be interpreted.
feature_importance = rand_search_xgb.best_estimator_.named_steps['clf'].feature_importances_
feature_names = list(poly.get_feature_names_out(X.columns))

# Sort so the most influential features are at the top of the chart.
importance_by_feature = pd.Series(feature_importance, index=feature_names).sort_values(ascending=False)

# The degree-2 expansion grows the feature count quadratically; scale the
# figure height with the number of bars so the labels do not overlap.
fig_height = max(6, 0.3 * len(importance_by_feature))

plt.figure(figsize=(12, fig_height))
sns.barplot(x=importance_by_feature.values, y=importance_by_feature.index)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()