In [1]:
import src.data.make_dataset as md
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import src.models.models as model
import os
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

In [2]:
data = md.data_cleaning("cardio_train.csv")

In [3]:
# Copy of helper function
def get_feature_set(base_df, set_type="standard"):
    df = base_df.copy()
    df["BMI"] = df["weight"] / ((df["height"] / 100) ** 2)
    df["map"] = (df["ap_hi"] + 2 * df["ap_lo"]) / 3
    
    if set_type == "minimal":
        return df.drop(["weight", "height", "ap_hi", "ap_lo"], axis=1)
        
    def get_bmi_cat(bmi):
        if bmi < 18.5: return 0
        elif bmi < 25: return 1
        elif bmi < 30: return 2
        elif bmi < 35: return 3
        elif bmi < 40: return 4
        else: return 5
    df["bmi_cat"] = df["BMI"].apply(get_bmi_cat)
    
    def get_bp_cat(row):
        sys = row["ap_hi"]
        dia = row["ap_lo"]
        if sys > 180 or dia > 120: return 4
        elif sys >= 140 or dia >= 90: return 3
        elif sys >= 130 or dia >= 80: return 2
        elif sys >= 120 and dia < 80: return 1
        else: return 0
    df["bp_cat"] = df.apply(get_bp_cat, axis=1)
    df["lifestyle_risk"] = df["smoke"] + df["alco"]
    
    risk_bmi = (df["BMI"] >= 30).astype(int)
    risk_bp = ((df["ap_hi"] >= 130) | (df["ap_lo"] >= 85)).astype(int)
    risk_gluc = (df["gluc"] > 1).astype(int)
    risk_chol = (df["cholesterol"] > 1).astype(int)
    df["metabolic_score"] = risk_bmi + risk_bp + risk_gluc + risk_chol
    
    if set_type == "standard":
        return df.drop(["weight", "height", "ap_hi", "ap_lo"], axis=1)
        
    if set_type == "extended":
        df['age_sq'] = df['age'] ** 2
        df['bmi_sq'] = df['BMI'] ** 2
        df['map_sq'] = df['map'] ** 2
        df['age_chol'] = df['age'] * df['cholesterol']
        df['map_bmi'] = df['map'] * df['BMI']
        return df.drop(["weight", "height", "ap_hi", "ap_lo"], axis=1)
    return df

In [4]:
# --- 4. THE GRAND FINALE (XGBoost + Extended + Intense Tuning) ---

print("Starting FINAL OPTIMIZATION... Hold on tight! ðŸš€")

# 1. Prepare Data
df_final = get_feature_set(data, set_type="extended")
df_final = df_final.replace([np.inf, -np.inf], np.nan).dropna()

X = df_final.drop("cardio", axis=1)
y = df_final["cardio"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 2. Define Parameter Grid
param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [4, 5, 6, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2, 0.5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

xgb = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=60,  # 60 Random combinations
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("\n--- BEST PARAMETERS FOUND ---")
print(random_search.best_params_)

# 3. Final Evaluation
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
final_acc = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, best_model.predict(X_train))

print(f"\nFINAL TEST ACCURACY: {final_acc*100:.2f}%")
print(f"TRAIN ACCURACY:      {train_acc*100:.2f}%")
print(f"Gap:                 {train_acc*100 - final_acc*100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Starting FINAL OPTIMIZATION... Hold on tight! ðŸš€
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



--- BEST PARAMETERS FOUND ---
{'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.8}

FINAL TEST ACCURACY: 73.17%
TRAIN ACCURACY:      73.72%
Gap:                 0.55%

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.76      0.74      6990
           1       0.75      0.70      0.72      6978

    accuracy                           0.73     13968
   macro avg       0.73      0.73      0.73     13968
weighted avg       0.73      0.73      0.73     13968

