# 🧠 Model Training 

## 📦 Modules included 

In [1]:
# Data handling and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## 📈 Statistical Tools and Feature Scaling

In [2]:
from scipy import stats
from sklearn.preprocessing import StandardScaler

## 🔀 Train-Test Split and Cross-Validation

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score

## 📊 Model Evaluation Metrics

In [4]:
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_curve, auc
)
from sklearn import metrics


## 🤖 Machine Learning Models

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

## 📂 Loading The Dataset 

In [6]:
# Read the preprocessed dataset produced by the earlier pipeline stage
processed_csv = "../data/processed/processed_data.csv"
data = pd.read_csv(processed_csv)

# Preview the first rows as a quick sanity check on the load
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,-0.418315,2,0.443881,-0.847599,-1.016518,-0.143537,1,1,0,0,1,0
1,1,0.320429,1,-1.022764,0.759808,0.815668,0.940917,3,1,0,0,1,1
2,2,-0.270566,1,0.07722,-0.707824,0.204939,-1.22799,3,1,0,0,0,1
3,3,-0.713813,2,0.566101,0.550146,1.426396,2.025371,1,1,0,0,1,1
4,4,-0.861561,1,-1.022764,-1.266923,-1.627246,-2.312444,1,1,0,0,0,0


In [8]:
# Target vector: the 'cardio' column
y = data['cardio']

# Feature matrix: every column except the row identifier ('id') and the target
X = data.drop(['id', 'cardio'], axis=1)

In [10]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Write each split to the processed-data folder (row index is not written)
for split_obj, csv_path in (
    (X_train, "../data/processed/X_train.csv"),
    (X_test, "../data/processed/X_test.csv"),
    (y_train, "../data/processed/y_train.csv"),
    (y_test, "../data/processed/y_test.csv"),
):
    split_obj.to_csv(csv_path, index=False)

print("✅ Train-test splits saved to processed data folder!")

✅ Train-test splits saved to processed data folder!


## Logistic Regression 

In [11]:
# Build and fit the logistic-regression model (fit() returns the estimator itself)
logreg_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Training split: predictions and accuracy on data the model has already seen
y_pred_train_logreg = logreg_model.predict(X_train)
train_accuracy_logreg = accuracy_score(y_train, y_pred_train_logreg)

# Test split: predictions and accuracy on the held-out rows
y_pred_test_logreg = logreg_model.predict(X_test)
test_accuracy_logreg = accuracy_score(y_test, y_pred_test_logreg)

# Per-class precision / recall / F1 for the held-out rows
class_report_logreg = classification_report(y_test, y_pred_test_logreg)

# Show the summary, one line per metric
print(
    f"Logistic Regression Training Accuracy: {train_accuracy_logreg:.4f}",
    f"Logistic Regression Test Accuracy: {test_accuracy_logreg:.4f}",
    f"Logistic Regression Classification Report:\n{class_report_logreg}",
    sep="\n",
)

Logistic Regression Training Accuracy: 0.7295
Logistic Regression Test Accuracy: 0.7202
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.78      0.74      6988
           1       0.74      0.66      0.70      6696

    accuracy                           0.72     13684
   macro avg       0.72      0.72      0.72     13684
weighted avg       0.72      0.72      0.72     13684



## Random Forest 

In [12]:
# Random forest with library defaults; only the seed is fixed
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict both splits (training first, then held-out)
y_pred_train_rf, y_pred_test_rf = rf_model.predict(X_train), rf_model.predict(X_test)

# Accuracy on each split; a large gap between the two points to overfitting
train_accuracy_rf = accuracy_score(y_train, y_pred_train_rf)
test_accuracy_rf = accuracy_score(y_test, y_pred_test_rf)

# Detailed per-class metrics on the held-out split
class_report_rf = classification_report(y_test, y_pred_test_rf)

# Emit the three summary lines in one write
print("\n".join([
    f"Random Forest Training Accuracy: {train_accuracy_rf:.4f}",
    f"Random Forest Test Accuracy: {test_accuracy_rf:.4f}",
    f"Random Forest Classification Report:\n{class_report_rf}",
]))

Random Forest Training Accuracy: 0.9797
Random Forest Test Accuracy: 0.7056
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.72      0.71      6988
           1       0.70      0.69      0.70      6696

    accuracy                           0.71     13684
   macro avg       0.71      0.71      0.71     13684
weighted avg       0.71      0.71      0.71     13684



## KNeighborsClassifier (KNN)

In [13]:
# k-nearest-neighbours classifier with scikit-learn's default hyperparameters
knn_model = KNeighborsClassifier().fit(X_train, y_train)

# --- training split ---
y_pred_train_knn = knn_model.predict(X_train)
train_accuracy_knn = accuracy_score(y_true=y_train, y_pred=y_pred_train_knn)

# --- held-out split ---
y_pred_test_knn = knn_model.predict(X_test)
test_accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_test_knn)

# Precision / recall / F1 per class, computed on the held-out split
class_report_knn = classification_report(y_test, y_pred_test_knn)

# Summary
print(
    f"K-Nearest Neighbors Training Accuracy: {train_accuracy_knn:.4f}",
    f"K-Nearest Neighbors Test Accuracy: {test_accuracy_knn:.4f}",
    f"K-Nearest Neighbors Classification Report:\n{class_report_knn}",
    sep="\n",
)

K-Nearest Neighbors Training Accuracy: 0.7834
K-Nearest Neighbors Test Accuracy: 0.6950
K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.71      0.70      6988
           1       0.69      0.68      0.69      6696

    accuracy                           0.70     13684
   macro avg       0.69      0.69      0.69     13684
weighted avg       0.69      0.70      0.69     13684



## K-Means

In [18]:
from sklearn.cluster import KMeans

# K-Means is unsupervised: it only produces cluster ids. To score it like a
# classifier, every cluster is given the most frequent *training* label among
# its members, and that label becomes the prediction for the whole cluster.
N_CLUSTERS = 2

# Fit KMeans on training features (usually after scaling!)
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
train_clusters = kmeans.fit_predict(X_train)
test_clusters = kmeans.predict(X_test)

# Majority training label per cluster.
# Series.mode() always returns a Series (ties sorted ascending), so .iloc[0]
# yields a scalar regardless of library version -- no need to branch on
# whether the mode came back as an array or a scalar.
# NOTE: this raises IndexError if a cluster received no training rows.
cluster_to_label = {
    cluster_id: y_train[train_clusters == cluster_id].mode().iloc[0]
    for cluster_id in range(N_CLUSTERS)
}
for cluster_id, majority_label in cluster_to_label.items():
    print(f"Cluster {cluster_id} -> majority label {majority_label}")

# Original per-cluster names, kept in case other cells still reference them
cluster_0_mode = cluster_to_label[0]
cluster_1_mode = cluster_to_label[1]

# Map clusters to actual labels: position i of the lookup table holds the
# label of cluster i, so indexing it with the cluster ids translates them all
label_lookup = np.array([cluster_to_label[c] for c in range(N_CLUSTERS)])
train_predictions = label_lookup[train_clusters]
test_predictions = label_lookup[test_clusters]

# Compute training and test accuracies
train_accuracy_kmeans = accuracy_score(y_train, train_predictions)
test_accuracy_kmeans = accuracy_score(y_test, test_predictions)

# Get classification report for detailed metrics
class_report_kmeans = classification_report(y_test, test_predictions)

# Display results
print(f"K-Means Training Accuracy: {train_accuracy_kmeans:.4f}")
print(f"K-Means Test Accuracy: {test_accuracy_kmeans:.4f}")
print(f"K-Means Classification Report:\n{class_report_kmeans}")

Mode result for cluster 0: ModeResult(mode=np.int64(1), count=np.int64(15134))
Mode result for cluster 1: ModeResult(mode=np.int64(0), count=np.int64(23396))
K-Means Training Accuracy: 0.7040
K-Means Test Accuracy: 0.7028
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.85      0.74      6988
           1       0.78      0.55      0.65      6696

    accuracy                           0.70     13684
   macro avg       0.72      0.70      0.69     13684
weighted avg       0.72      0.70      0.70     13684



## SVM

In [20]:
# Support-vector classifier with a linear kernel; seed fixed at 42
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Predictions for the training rows and for the held-out rows
y_pred_train_svm = svm_model.predict(X_train)
y_pred_test_svm = svm_model.predict(X_test)

# Fraction of correctly classified rows on each split
train_accuracy_svm, test_accuracy_svm = (
    accuracy_score(y_train, y_pred_train_svm),
    accuracy_score(y_test, y_pred_test_svm),
)

# Per-class precision / recall / F1 on the held-out rows
class_report_svm = classification_report(y_test, y_pred_test_svm)

# Print the summary lines
for summary_line in (
    f"SVM Training Accuracy: {train_accuracy_svm:.4f}",
    f"SVM Test Accuracy: {test_accuracy_svm:.4f}",
    f"SVM Classification Report:\n{class_report_svm}",
):
    print(summary_line)

SVM Training Accuracy: 0.7269
SVM Test Accuracy: 0.7206
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.81      0.75      6988
           1       0.76      0.63      0.69      6696

    accuracy                           0.72     13684
   macro avg       0.73      0.72      0.72     13684
weighted avg       0.73      0.72      0.72     13684



# Saving the Models 

In [26]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

# (display name, fitted estimator, file name) for every model trained above.
# NOTE: only the fitted KMeans estimator is pickled; the cluster -> label
# mapping computed in the K-Means cell is not persisted by this cell.
models_to_save = [
    ("SVM", svm_model, "svm_model_linear.pkl"),
    ("KMeans", kmeans, "kmeans_model.pkl"),
    ("KNN", knn_model, "knn_model.pkl"),
    ("Random Forest", rf_model, "random_forest_model.pkl"),
    ("Logistic Regression", logreg_model, "logistic_regression_model.pkl"),
]

# Serialize each estimator and confirm where it was written
for label, estimator, filename in models_to_save:
    joblib.dump(estimator, f"{MODELS_DIR}/{filename}")
    print(f"✅ {label} model saved to models/{filename}")

✅ SVM model saved to models/svm_model_linear.pkl
✅ KMeans model saved to models/kmeans_model.pkl
✅ KNN model saved to models/knn_model.pkl


✅ Random Forest model saved to models/random_forest_model.pkl
✅ Logistic Regression model saved to models/logistic_regression_model.pkl


# ✅ Code to Load All Saved Models

In [30]:
import os

# Where each persisted estimator lives, keyed by a short model name
model_paths = dict(
    svm="../models/svm_model_linear.pkl",
    kmeans="../models/kmeans_model.pkl",
    knn="../models/knn_model.pkl",
    random_forest="../models/random_forest_model.pkl",
    logistic_regression="../models/logistic_regression_model.pkl",
)

# Deserialize every file that exists; report (but do not fail on) missing ones
models = {}

for name, path in model_paths.items():
    if not os.path.exists(path):
        print(f"❌ File not found for: {name} model at {path}")
        continue
    models[name] = joblib.load(path)
    print(f"✅ Loaded: {name} model from {path}")

✅ Loaded: svm model from ../models/svm_model_linear.pkl
✅ Loaded: kmeans model from ../models/kmeans_model.pkl
✅ Loaded: knn model from ../models/knn_model.pkl
✅ Loaded: random_forest model from ../models/random_forest_model.pkl
✅ Loaded: logistic_regression model from ../models/logistic_regression_model.pkl
