# Import Required Libraries 

In [288]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

# Load Data

In [289]:
DF = pd.read_csv("./data/cardio_train.csv")

In [290]:
DF

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


## Add BMI column

In [291]:
DF["BMI"] = DF["weight"]/((DF["height"]/100)**2)

In [292]:
DF.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997,27.556513
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003,6.091511
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0,3.471784
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,23.875115
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,26.374068
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0,30.222222
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0,298.666667


In [293]:
y = DF[["cardio"]]

In [294]:
y

Unnamed: 0,cardio
0,0
1,1
2,1
3,1
4,0
...,...
69995,0
69996,1
69997,1
69998,1


In [295]:
X = DF[[col for col in DF.columns if col not in ["cardio", "id"]]]

In [296]:
X

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,BMI
0,18393,2,168,62.0,110,80,1,1,0,0,1,21.967120
1,20228,1,156,85.0,140,90,3,1,0,0,1,34.927679
2,18857,1,165,64.0,130,70,3,1,0,0,0,23.507805
3,17623,2,169,82.0,150,100,1,1,0,0,1,28.710479
4,17474,1,156,56.0,100,60,1,1,0,0,0,23.011177
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,26.927438
69996,22601,1,158,126.0,140,90,2,2,0,0,1,50.472681
69997,19066,2,183,105.0,180,90,3,1,0,1,0,31.353579
69998,22431,1,163,72.0,135,80,1,2,0,0,0,27.099251


## Scale Data

In [297]:
scaler = StandardScaler().fit(X)

scaled = scaler.transform(X)

X = pd.DataFrame(X, columns=X.columns)
X2 = DF[[col for col in DF.columns if col != "BMI"]]

In [298]:
y = y.values.reshape((y.shape[0],))

# Multi-layer Perceptron Classifer

# Model A

- Hidden Layer Sizes = (32, 64)
- Alpha = 0.0001
- Max Iteration = 1000
- Tolerance = 0.00001

In [299]:
classifier = MLPClassifier(hidden_layer_sizes=(32, 64), alpha=0.0001, max_iter=1000, tol=0.000001)

## Train Model A

In [300]:
classifier = classifier.fit(X, y)

## Grade Model A Performance

In [301]:
cv_results = cross_validate(classifier, X, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [302]:
cv_results = pd.DataFrame(cv_results)
cv_results

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,5.45397,0.009973,0.698214,0.703143
1,8.077502,0.011033,0.563571,0.559571
2,8.9602,0.013083,0.708643,0.706161
3,8.37891,0.011825,0.499786,0.499982
4,9.635608,0.017439,0.506857,0.506839


In [303]:
print(f"Average train accuracy: {cv_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {cv_results['test_accuracy'].mean() * 100}%")

Average train accuracy: 59.51392857142858%
Average test accuracy: 59.54142857142857%


# Train Model A with BMI column

In [304]:
classifier = classifier.fit(X2, y)

## Grade Model A Performance with BMI Column

In [None]:
cv_results = cross_validate(classifier, X2, y, cv=5, scoring=["accuracy"], return_train_score=True)

In [None]:
cv_results = pd.DataFrame(cv_results)
cv_results

In [None]:
print(f"Average train accuracy: {cv_results['train_accuracy'].mean() * 100}%")
print(f"Average test accuracy: {cv_results['test_accuracy'].mean() * 100}%")

# Model B

# Model C

# Logistic Regression Classifier

In [None]:
regression_model = LogisticRegression(max_iter=1000)

In [None]:
regression_model.fit(X, y)

In [None]:
print(f'Accuracy: {logisticRegr.score(X, y) * 100}%')