# Import Required Libraries 

In [351]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

# Load Data

In [352]:
DF = pd.read_csv("./data/cardio_train.csv")

In [353]:
DF

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


## Add BMI column

In [354]:
DF["BMI"] = DF["weight"]/((DF["height"]/100)**2)

In [355]:
DF.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997,27.556513
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003,6.091511
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0,3.471784
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,23.875115
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,26.374068
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0,30.222222
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0,298.666667


In [356]:
y = DF[["cardio"]]

In [357]:
y

Unnamed: 0,cardio
0,0
1,1
2,1
3,1
4,0
...,...
69995,0
69996,1
69997,1
69998,1


In [358]:
X = DF[[col for col in DF.columns if col not in ["cardio", "id"]]]

In [359]:
X

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,BMI
0,18393,2,168,62.0,110,80,1,1,0,0,1,21.967120
1,20228,1,156,85.0,140,90,3,1,0,0,1,34.927679
2,18857,1,165,64.0,130,70,3,1,0,0,0,23.507805
3,17623,2,169,82.0,150,100,1,1,0,0,1,28.710479
4,17474,1,156,56.0,100,60,1,1,0,0,0,23.011177
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,26.927438
69996,22601,1,158,126.0,140,90,2,2,0,0,1,50.472681
69997,19066,2,183,105.0,180,90,3,1,0,1,0,31.353579
69998,22431,1,163,72.0,135,80,1,2,0,0,0,27.099251


## Scale Data

In [360]:
scaler = StandardScaler().fit(X)

scaled = scaler.transform(X)

X = pd.DataFrame(X, columns=X.columns)
X2 = DF[[col for col in DF.columns if col != "BMI"]]

In [361]:
y = y.values.reshape((y.shape[0],))

# Multi-layer Perceptron Classifer

# Model A Configuration

- Hidden Layer Sizes = (32, 64)
- Alpha = 0.0001
- Max Iteration = 1000
- Tolerance = 0.00001

In [362]:
model_A = MLPClassifier(hidden_layer_sizes=(32, 64), alpha=0.0001, max_iter=1000, tol=0.000001)

## Train Model A

In [363]:
model_A = model_A.fit(X, y)

## Grade Model A Performance

In [364]:
model_A_results = cross_validate(model_A, X, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [365]:
model_A_results = pd.DataFrame(model_A_results)
model_A_results

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,13.134919,0.015504,0.510643,0.511393
1,34.850565,0.021515,0.638571,0.638089
2,18.450644,0.020016,0.500786,0.500036
3,41.310773,0.027034,0.502214,0.501714
4,12.992151,0.032535,0.670357,0.675643


In [366]:
print(f"Average train accuracy: {model_A_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {model_A_results['test_accuracy'].mean() * 100}%")

Average train accuracy: 56.537499999999994%
Average test accuracy: 56.45142857142857%


## Train Model A without BMI column

In [367]:
model_A_without_BMI = MLPClassifier(hidden_layer_sizes=(32, 64), alpha=0.0001, max_iter=1000, tol=0.000001)

In [368]:
model_A_without_BMI = model_A_without_BMI.fit(X2, y)

## Grade Model A Performance without BMI Column

In [369]:
model_A_without_BMI_results = cross_validate(model_A_without_BMI, X2, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [370]:
model_A_without_BMI_results = pd.DataFrame(model_A_without_BMI_results)
model_A_without_BMI_results

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,77.328713,0.02681,0.584071,0.983571
1,44.880671,0.023637,0.892,0.707696
2,26.60544,0.017415,0.518,0.519429
3,70.083117,0.022323,0.9835,0.956911
4,41.877659,0.028703,0.710857,0.782143


In [371]:
print(f"Average train accuracy: {model_A_without_BMI_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {model_A_without_BMI_results['test_accuracy'].mean() * 100}%")

Average train accuracy: 78.99499999999999%
Average test accuracy: 73.76857142857143%


# Model B Configuration

- Hidden Layer Sizes = (24, 48)
- Alpha = 0.0001
- Max Iteration = 1000
- Tolerance = 0.00001

In [372]:
model_B = MLPClassifier(hidden_layer_sizes=(24, 48), alpha=0.0001, max_iter=500, tol=0.000001)

## Train Model B

In [373]:
model_B = model_B.fit(X, y)

## Grade Model B Performance 

In [374]:
model_B_results = cross_validate(model_B, X, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [375]:
model_B_results = pd.DataFrame(model_B_results)
model_B_results

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,14.128458,0.014856,0.699714,0.704893
1,27.499239,0.026452,0.544786,0.542071
2,11.986875,0.016552,0.54,0.538464
3,12.505774,0.02913,0.552857,0.554518
4,16.101966,0.016623,0.5155,0.514429


In [376]:
print(f"Average train accuracy: {model_B_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {model_B_results['test_accuracy'].mean() * 100}%")

Average train accuracy: 57.087500000000006%
Average test accuracy: 57.05714285714285%


## Train Model B without BMI column

In [377]:
model_B_without_BMI = MLPClassifier(hidden_layer_sizes=(24, 48), alpha=0.0001, max_iter=500, tol=0.000001)

In [378]:
model_B_without_BMI = model_B_without_BMI.fit(X2, y)

## Grade Model B Performance without BMI Column

In [379]:
model_B_without_BMI_results = cross_validate(model_B_without_BMI, X2, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [380]:
model_B_without_BMI_results = pd.DataFrame(model_B_without_BMI_results)
model_B_without_BMI_results

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,36.779572,0.014457,0.890786,0.585411
1,30.292303,0.01791,0.509143,0.509804
2,37.900203,0.016496,0.680071,0.678929
3,30.250856,0.013781,0.554,0.756554
4,22.372695,0.014303,0.500286,0.599482


In [381]:
print(f"Average train accuracy: {model_B_without_BMI_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {model_B_without_BMI_results['test_accuracy'].mean() * 100}%")

Average train accuracy: 62.603571428571435%
Average test accuracy: 62.6857142857143%


# Model C Configuration

- Hidden Layer Sizes = (50,)
- Alpha = 0.0001
- Max Iteration = 1000
- Tolerance = 0.00001

In [397]:
model_C = MLPClassifier(hidden_layer_sizes=(50,), alpha=0.0001, max_iter=500, tol=0.000001)

## Train Model C  

In [None]:
model_C = model_C.fit(X, y)

## Grade Model C Performance 

In [384]:
model_C_results = cross_validate(model_C, X, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [385]:
model_C_results = pd.DataFrame(model_C_results)
model_C_results

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,13.078813,0.010997,0.684857,0.695268
1,8.317873,0.016081,0.643286,0.641125
2,14.110397,0.011403,0.508357,0.508875
3,17.973523,0.01642,0.608357,0.608393
4,24.952249,0.018535,0.559143,0.562643


In [386]:
print(f"Average train accuracy: {model_C_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {model_C_results['test_accuracy'].mean() * 100}%")

Average train accuracy: 60.326071428571424%
Average test accuracy: 60.08%


## Train Model C without BMI column

In [396]:
model_C_without_BMI = MLPClassifier(hidden_layer_sizes=(50), alpha=0.0001, max_iter=500, tol=0.000001)

In [388]:
model_C_without_BMI = model_C_without_BMI.fit(X2, y)

## Grade Model C Performance without BMI Column

In [395]:
model_C_without_BMI_results = cross_validate(model_C_without_BMI, X2, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [390]:
model_C_without_BMI_results = pd.DataFrame(model_C_without_BMI_results)
model_C_without_BMI_results

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,7.483905,0.014368,0.530286,0.532196
1,14.206656,0.00901,0.575643,0.571679
2,14.813291,0.012847,0.663643,0.663571
3,11.573443,0.012598,0.547643,0.550071
4,35.168974,0.015866,0.648214,0.652482


In [391]:
print(f"Average train accuracy: {model_C_without_BMI_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {model_C_without_BMI_results['test_accuracy'].mean() * 100}%")

Average train accuracy: 59.400000000000006%
Average test accuracy: 59.30857142857143%


# Logistic Regression Classifier

In [392]:
regression_model = LogisticRegression(max_iter=1000)

In [393]:
regression_model.fit(X, y)

In [394]:
print(f'Accuracy: {logisticRegr.score(X, y) * 100}%')

Accuracy: 70.91714285714286%
