In [None]:
import random
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from scipy import stats
import datetime
import sys
from io import StringIO
import statsmodels.api as sm
import os
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

In [None]:
#This code preprocesses the data to add BMI and make one hot versions of some variables
os.chdir("C:/Users/austi/Downloads")
cardiodf = pd.read_csv('archive/cardio_train.csv', delimiter=';')
explncontdf = pd.DataFrame(cardiodf[['age','height','weight','ap_hi','ap_lo','cholesterol','gluc']])
explncontdf['BMI'] = round(explncontdf['weight']/((explncontdf['height']/100)**2),2)
expln1hotdf = pd.DataFrame(cardiodf[['smoke','alco','active']])
expln1hotdf['gender_1'] = (cardiodf['gender']-1)
expln1hotdf['gender_2'] = (cardiodf['gender']%2)
target = pd.DataFrame(cardiodf['cardio'])

#file_name = 'Cardio_BMI_OneHot.xlsx'
#X.to_excel("Cardio_BMI_OneHot.xlsx")

In [None]:
#The cells above should go first

In [None]:
#This area is for everything else

In [1]:
#The cells below should go at the end

In [5]:
#This code sets the default classfier use for data split testing in the next cell
trees = RandomForestClassifier(n_estimators=25, random_state=42)
fold = KFold(n_splits=5, random_state=42, shuffle=True)

#This code runs a test for the data without polynomial features
X = pd.concat([explncontdf,expln1hotdf],axis=1)
#This creates a version of the dataframe that includes the target
X9 = pd.concat([X,target],axis=1)

predict = cross_val_predict(trees, X, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict)
f1 = f1_score(target, predict, average='weighted')

print(confusion)
print(f1)

#this code tests the baseline against versions with polynomial features added
poly = PolynomialFeatures(1)
#poly2 = PolynomialFeatures(2)
#poly3 = PolynomialFeatures(3)
polydf = pd.DataFrame(poly.fit_transform(explncontdf))

X2 = pd.concat([polydf,expln1hotdf],axis=1)
predict2 = cross_val_predict(trees, X2, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')

print(confusion)
print(f1)

[[24760 10261]
 [10284 24695]]
0.7064999104524738
[[24767 10254]
 [10283 24696]]
0.7066141624322843


In [4]:
#This code searches for the best hyperparameters for the final version of the decision tree model
#param_list = {'n_estimators' : [5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]}
#param_list = {'n_estimators' : [100,200,300,400,500,600,700,800,900,1000]}
param_list = {'n_estimators' : [850, 900, 950]}
search = GridSearchCV(trees, param_list, cv=5, n_jobs=-1)
search.fit(X,target)
print(search.best_params_)

  self.best_estimator_.fit(X, y, **fit_params)


{'n_estimators': 950}


In [5]:
#This code tests the final random forest model without and with polynomial features
trees2 = RandomForestClassifier(n_estimators=950, random_state=42)
predict = cross_val_predict(trees2, X, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict)
f1 = f1_score(target, predict, average='weighted')
print(confusion)
print(f1)

predict2 = cross_val_predict(trees2, X2, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')

print(confusion)
print(f1)

[[25166  9855]
 [10202 24777]]
0.7134635351672255
[[25093  9928]
 [10170 24809]]
0.7128816871192972


In [12]:
#In this section, data is split between various one hot variables, then the two datasets are each fit to a random forest learning model
#We see the influnce some particular factors have on the accuracy of the model
X2smoke = X2[X2['smoke']==1]
target2 = cardiodf[cardiodf['smoke']==1]
target2 = target2['cardio']
X2nope = X2[X2['smoke']==0]
target3 = cardiodf[cardiodf['smoke']==0]
target3 = target3['cardio']
print("Smoke: yes vs. no")
predict2 = cross_val_predict(trees, X2smoke, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2nope, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

X2alco = X2[X2['alco']==1]
target2 = cardiodf[cardiodf['alco']==1]
target2 = target2['cardio']
X2nalco = X2[X2['alco']==0]
target3 = cardiodf[cardiodf['alco']==0]
target3 = target3['cardio']
print("Alcohol: yes vs. no")
predict2 = cross_val_predict(trees, X2alco, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2nalco, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

X1gluc = X[X['gluc']==1]
target2 = cardiodf[cardiodf['gluc']==1]
target2 = target2['cardio']
X2gluc = X[X['gluc']==2]
target3 = cardiodf[cardiodf['gluc']==2]
target3 = target3['cardio']
X3gluc = X[X['gluc']==3]
target4 = cardiodf[cardiodf['gluc']==3]
target4 = target4['cardio']
print("Glucose: ok, high, vs very high")
predict2 = cross_val_predict(trees, X1gluc, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2gluc, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

predict4 = cross_val_predict(trees, X3gluc, target4, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target4, predict4)
f1 = f1_score(target4, predict4, average='weighted')
print(confusion)
print(f1)

X1chol = X[X['cholesterol']==1]
target2 = cardiodf[cardiodf['cholesterol']==1]
target2 = target2['cardio']
X2chol = X[X['cholesterol']==2]
target3 = cardiodf[cardiodf['cholesterol']==2]
target3 = target3['cardio']
X3chol = X[X['cholesterol']==3]
target4 = cardiodf[cardiodf['cholesterol']==3]
target4 = target4['cardio']
print("Cholesterol: ok, high, vs very high")
predict2 = cross_val_predict(trees, X1chol, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2chol, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

predict4 = cross_val_predict(trees, X3chol, target4, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target4, predict4)
f1 = f1_score(target4, predict4, average='weighted')
print(confusion)
print(f1)

X2act = X2[X2['active']==1]
target2 = cardiodf[cardiodf['active']==1]
target2 = target2['cardio']
X2nact = X2[X2['active']==0]
target3 = cardiodf[cardiodf['active']==0]
target3 = target3['cardio']
print("Active: yes vs. no")
nextbest = trees.fit(X2act.values, target2)
predict2 = cross_val_predict(trees, X2act, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

nextbest = trees.fit(X2nact.values, target3)
predict3 = cross_val_predict(trees, X2nact, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

X2gen1 = X2[X2['gender_1']==1]
target2 = X9[X9['gender_1']==1]
target2 = target2['cardio']
X2gen2 = X2[X2['gender_1']==0]
target3 = X9[X9['gender_1']==0]
target3 = target3['cardio']
print("Gender 1 vs. Gender 2")
predict2 = cross_val_predict(trees, X2gen1, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2gen2, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

print("Double Trouble??")
X2both = X2[X2['smoke']==1]
X2both = X2both[X2both['alco']==1]
target2 = cardiodf[cardiodf['smoke']==1]
target2 = target2[target2['alco']==1]
target2 = target2['cardio']
predict2 = cross_val_predict(trees, X2both, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)


Smoke: yes vs. no
[[2518  722]
 [ 852 2077]]
0.7444669607486671
[[22263  9518]
 [ 9506 22544]]
0.701962749374053
Alcohol: yes vs. no
[[1487  454]
 [ 493 1330]]
0.7482970276030072
[[23326  9754]
 [ 9887 23269]]
0.70346890000531
Glucose: ok, high, vs very high
[[22647  8247]
 [ 8987 19598]]
0.7100653973730725
[[1240  872]
 [ 697 2381]]
0.6953330332856714
[[ 950 1065]
 [ 693 2623]]
0.6622202332145022
Cholesterol: ok, high, vs very high
[[22229  7101]
 [ 8719 14336]]
0.6965670862602673
[[2236 1563]
 [1348 4402]]
0.6935099563652587
[[ 348 1544]
 [ 436 5738]]
0.7138121387390192
Active: yes vs. no
[[20755  7888]
 [ 8302 19316]]
0.7121798556813932
[[4080 2298]
 [2028 5333]]
0.6845608552232465
Gender 1 vs. Gender 2
[[8497 3610]
 [3725 8638]]
0.7002533158380602
[[16300  6614]
 [ 6652 15964]]
0.7086298767183843
Double Trouble??
[[816 214]
 [241 583]]
0.7541283130298531


In [3]:
from sklearn.svm import SVC

#This code searches for the best parameter for the penalty on the SVC
#param_list = {'C' : [0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]}
#param_list = {'C' : [800000, 900000, 1000000, 1100000, 1200000]}
#search = GridSearchCV(model, param_list, cv=5, n_jobs=-1)
#search.fit(X2,target)
#print(search.best_params_)

model = SVC(kernel='rbf', C=1000000, random_state=42)
target = cardiodf['cardio']

#This code tests the final version of the model
model.fit(X,target)
predict2 = cross_val_predict(model, X, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')
print(confusion)
print(f1)


In [21]:
#This code compares the final model with and without polynomial features included (X without, X2 with)
model = SVC(kernel='rbf', C=1000000, random_state=42)
model.fit(X,target)
predict2 = cross_val_predict(model, X, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')
print(confusion)
print(f1)

model.fit(X2,target)
predict2 = cross_val_predict(model, X2.values, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')
print(confusion)
print(f1)

[[26428  8593]
 [10302 24677]]
0.729906478383598




[[26458  8563]
 [10287 24692]]
0.7305468601762154


In [None]:
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt

#This code checks to see how many principal components are needed to explain the variance in the data
pca = PCA(n_components = 5)
pca.fit(X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
pca2 = PCA(n_components = 2)
pca2.fit(X)
components = pca2.transform(X)
print(components.shape)

#This code tests a random forest model on the optimal number of components
predict = cross_val_predict(trees, components, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict)
f1 = f1_score(target, predict, average='weighted')
print(confusion)
print(f1)

In [20]:
from sklearn.cluster import KMeans

#This code tests the unsupervised learning model on 2 clusters
cluster = KMeans(n_clusters=2, random_state=42)
target = cardiodf['cardio']
cluster.fit(X2.values,target)
predict2 = cross_val_predict(cluster, X2.values, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')
print(confusion)
print(f1)


[[17798 17223]
 [16351 18628]]
0.5203005745334278
