# Model training and evaluation
---
---

## Define target y (risk_category) and features X.

In [None]:
import pandas as pd


df = pd.read_csv("../data/clustered_data.csv")

y = df["Cluster"]
X = df.drop(columns=["Cluster", "risk_category"])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.628456,0.876830,-0.039615,0.684233,0.669818,0.184159,0.480665,1.430041
1,-0.853714,-1.210994,-0.555560,0.027134,-0.826862,-0.862642,-0.366041,-0.191010
2,1.221324,2.036733,-0.727541,0.070940,2.586890,-1.356134,0.618715,-0.105691
3,-0.853714,-1.078434,-0.555560,-0.629966,-0.595757,-0.638328,-0.930511,-1.044194
4,-1.150148,0.512290,-2.791321,0.684233,0.218613,1.604818,5.576238,-0.020373
...,...,...,...,...,...,...,...,...
753,1.814192,-0.680753,0.304349,2.107949,0.350673,0.079479,-0.918240,2.539181
754,-0.557280,0.015188,-0.211596,-0.191900,0.718240,0.662697,-0.399786,-0.532284
755,0.332022,-0.017952,-0.039615,-0.629966,-0.397667,-0.922460,-0.691225,-0.276328
756,-0.853714,0.147749,-1.071505,-0.892806,0.359477,-0.339242,-0.372176,1.174085


## Split data (train/test with train_test_split).

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=474
)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
387,0.035588,1.208231,0.476330,0.355683,0.260432,0.079479,1.020593,-0.191010
34,2.110626,0.545430,0.304349,0.443297,0.040332,0.124342,-0.154364,0.150264
135,0.332022,-0.548193,-0.039615,0.027134,1.946399,0.677651,-0.955053,-0.446965
396,1.517758,-1.641815,0.476330,-0.410933,-1.088781,-0.114927,-0.583852,0.406219
124,-0.853714,-0.150512,1.336239,-0.520449,-0.034502,0.318748,-0.206516,0.576856
...,...,...,...,...,...,...,...,...
172,0.628456,-1.210994,0.476330,-0.104286,-1.038158,-0.174744,-0.270940,0.747493
687,-1.150148,-0.747033,-1.346675,-1.374679,-0.771837,-1.101911,-0.666682,-0.958876
243,1.517758,-0.515053,-1.759431,-0.257610,-0.318431,-0.174744,-0.277075,0.747493
154,-0.557280,-1.111574,0.132367,-1.068032,-1.046962,-0.503739,-0.740309,-0.958876


In [None]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print("Before:", y_train.value_counts())
print("After:", y_train_res.value_counts())


Before: Cluster
1    347
0    259
Name: count, dtype: int64
After: Cluster
0    347
1    347
Name: count, dtype: int64


## Train models: Random Forest, SVM, Logistic Regression, Decision Tree, XGBoost.

In [25]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train_res, y_train_res)           
    y_pred = model.predict(X_test)                
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

print(results)


{'Random Forest': 0.9276315789473685, 'SVM': 0.9539473684210527, 'Gradient Boosting': 0.9276315789473685, 'Decision Tree': 0.8618421052631579, 'Logistic Regression': 1.0, 'XGBoost': 0.9276315789473685}
