In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score

In [2]:
df = pd.read_csv("novagen_dataset.csv")

In [6]:
df.head()
df.isnull().sum()

Age                      0
BMI                      0
Blood_Pressure           0
Cholesterol              0
Glucose_Level            0
Heart_Rate               0
Sleep_Hours              0
Exercise_Hours           0
Water_Intake             0
Stress_Level             0
Target                   0
Smoking                  0
Alcohol                  0
Diet                     0
MentalHealth             0
PhysicalActivity         0
MedicalHistory           0
Allergies                0
Diet_Type__Vegan         0
Diet_Type__Vegetarian    0
Blood_Group_AB           0
Blood_Group_B            0
Blood_Group_O            0
dtype: int64

In [10]:
X = df.drop(columns=["Target"])
y = df["Target"]

In [13]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


X_train.head()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,...,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type__Vegan,Diet_Type__Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
180,13.0,25.0,97.0,197.0,99.0,73.0,7.0,2.0,5.0,3.0,...,1,2,1,0,1,False,False,False,False,False
1115,77.0,26.0,130.0,203.0,97.0,72.0,8.0,2.0,3.0,1.0,...,0,2,1,1,1,False,True,True,False,False
4398,3.0,27.0,149.0,198.0,103.0,73.0,8.0,2.0,4.0,5.0,...,2,0,1,2,1,True,False,True,False,False
8265,1.0,23.0,147.0,198.0,104.0,73.0,3.0,4.0,2.0,9.0,...,2,0,1,1,1,True,False,True,False,False
9469,9.0,24.0,147.0,199.0,98.0,72.0,7.0,0.0,4.0,2.0,...,0,0,2,2,2,False,True,False,False,False


In [16]:
# STANDARDIZATION

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# LOGISTIC REGRESSION

In [54]:
## Training model using scaled dataset using Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 1000)  # default max_iter = 100
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 82.25130890052355 %
Recall of Model: 83.76237623762376 %


# KNN Classifier

In [44]:
# Library for KNN
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors = 7)

# Training Model
knn_model.fit(X_train, y_train)

# Testing Model
y_pred = knn_model.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 88.06282722513089 %
Recall of Model: 88.31683168316832 %


# SUPPORT VECTOR MACHINE (Classifier)

In [53]:
# Training and Predicting using SVM Classifier(SVC) Algorithm

from sklearn.svm import SVC

svc = SVC() # default kernal = RBF(Gaussian) kernel

svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 93.35078534031413 %
Recall of Model: 94.25742574257426 %


# DECISION TREE (Classifier)

In [47]:
# DECISION TREE (Classifier) Algorithm Implementation

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=12, min_samples_split=15)

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 90.89005235602095 %
Recall of Model: 90.99009900990099 %


# GridSearchCV

In [36]:
from sklearn.model_selection import GridSearchCV

classifierCV = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={"n_neighbors": [3, 5, 7, 9]},
    # scoring = "recall",  # usd in classification data for type2 error (FP)
    cv=5
)

classifierCV.fit(X_train, y_train)

y_pred_CV = classifierCV.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 88.06282722513089 %
Precision of Model: 88.31683168316832 %


# RANDOM FOREST (DTC)

In [49]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(
    n_estimators=201,
    oob_score=True
)

rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 94.18848167539267 %
Recall of Model: 94.35643564356435 %


# Bagging (Classifier)

In [66]:
# Bagging Regressor

from sklearn.ensemble import BaggingClassifier

# For Linear Regression
from sklearn.linear_model import LogisticRegression
base_model = LogisticRegression()


bagging = BaggingClassifier(
    base_model,
    n_estimators = 201,
    oob_score = True,
    random_state=42
    
)

bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 82.19895287958116 %
Recall of Model: 83.76237623762376 %


# VOTING

In [67]:
from sklearn.ensemble import VotingClassifier

# Voting Classifier

vc = VotingClassifier(
    estimators=[
        ("lr", lr),
        ("svc", svc),
        ("dt", dt)
    ]
)

# Model fitting
vc.fit(X_train, y_train)

# Testing(Prediction)
y_pred = vc.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 92.40837696335078 %
Recall of Model: 93.46534653465348 %


# Stacking (Classifier)

In [61]:
from sklearn.ensemble import StackingClassifier


meta_model =  LogisticRegression()


sc = StackingClassifier(
    estimators=[
        ("lr", lr),
        ("svc", svc),
        ("dt", dt)
    ],
    final_estimator=meta_model,
    cv=5 # cross validation(k-fold)
)

# Model fitting
sc.fit(X_train, y_train)

# Testing(Prediction)
y_pred = sc.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 94.34554973821989 %
Recall of Model: 94.45544554455445 %


# Ada Boost (Classifier)

In [63]:
base_model_dts = DecisionTreeClassifier(max_depth=12, min_samples_split=15)


# AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier(
    estimator= base_model_dts, # default too
    n_estimators=300,
    random_state=40
)

# Model Training
abc.fit(X_train, y_train)

# Prediction(Testing)
y_pred = abc.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 95.07853403141361 %
Recall of Model: 94.85148514851485 %


# Gradient Boosting

In [64]:
# Applying Gradient Boosting Classifier

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(
    n_estimators=200, # No. of models
    learning_rate=0.1,
    random_state=42,
    
    # add DT Hyperparameters
    max_depth = 3
)

gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 92.98429319371728 %
Recall of Model: 93.26732673267327 %


- Among all supervises algorithms with their pruning, parameter turing, bagging, boosting
- Ada Boost (Classifier) 
  with base model DecisionTreeClassifier(max_depth=12, min_samples_split=15) Giving best recall score for predictions.
- so our best model is AdaBoost Classifier

# Best Model: AdaBoost Classifier 

In [68]:
base_model_dts = DecisionTreeClassifier(max_depth=12, min_samples_split=15)

best_model = AdaBoostClassifier(
    estimator= base_model_dts, # default too
    n_estimators=300,
    random_state=40
)

# Model Training
best_model.fit(X_train, y_train)

# Prediction(Testing)
y_pred = best_model.predict(X_test)

# Calculation of accuracy of MODEL
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Model: {accuracy*100} %")

# Calculation of Recall of MODEL
recall = recall_score(y_test, y_pred)
print(f"Recall of Model: {recall*100} %")

Accuracy of Model: 95.07853403141361 %
Recall of Model: 94.85148514851485 %
