In [1]:
import random
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from scipy import stats
import datetime
import sys
from io import StringIO
import statsmodels.api as sm
import os
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict


os.chdir("C:/Users/austi/Downloads")
cardiodf = pd.read_csv('archive/cardio_train.csv', delimiter=';')
explncontdf = pd.DataFrame(cardiodf[['age','height','weight','ap_hi','ap_lo','cholesterol','gluc']])
explncontdf['BMI'] = round(explncontdf['weight']/((explncontdf['height']/100)**2),2)
expln1hotdf = pd.DataFrame(cardiodf[['smoke','alco','active']])
expln1hotdf['gender_1'] = (cardiodf['gender']-1)
expln1hotdf['gender_2'] = (cardiodf['gender']%2)
target = pd.DataFrame(cardiodf['cardio'])

trees = RandomForestClassifier(n_estimators=25, random_state=42)
fold = KFold(n_splits=5, random_state=42, shuffle=True)

X = pd.concat([explncontdf,expln1hotdf],axis=1)
X9 = pd.concat([X,target],axis=1)
print(X9)
predict = cross_val_predict(trees, X, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict)
f1 = f1_score(target, predict, average='weighted')

#file_name = 'Cardio_BMI_OneHot.xlsx'
#X.to_excel("Cardio_BMI_OneHot.xlsx")

print(confusion)
print(f1)

poly = PolynomialFeatures(1)
polydf = pd.DataFrame(poly.fit_transform(explncontdf))

X2 = pd.concat([polydf,expln1hotdf],axis=1)
predict2 = cross_val_predict(trees, X2, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')

print(confusion)
print(f1)

         age  height  weight  ap_hi  ap_lo  cholesterol  gluc    BMI  smoke  \
0      18393     168    62.0    110     80            1     1  21.97      0   
1      20228     156    85.0    140     90            3     1  34.93      0   
2      18857     165    64.0    130     70            3     1  23.51      0   
3      17623     169    82.0    150    100            1     1  28.71      0   
4      17474     156    56.0    100     60            1     1  23.01      0   
...      ...     ...     ...    ...    ...          ...   ...    ...    ...   
69995  19240     168    76.0    120     80            1     1  26.93      1   
69996  22601     158   126.0    140     90            2     2  50.47      0   
69997  19066     183   105.0    180     90            3     1  31.35      0   
69998  22431     163    72.0    135     80            1     2  27.10      0   
69999  20540     170    72.0    120     80            2     1  24.91      0   

       alco  active  gender_1  gender_2  cardio  
0

In [None]:
#In this section, data is split between various one hot variables, then the two datasets are each fit to a random forest learning model
#We see the influnce some particular factors have on the accuracy of the model
X2smoke = X2[X2['smoke']==1]
target2 = cardiodf[cardiodf['smoke']==1]
target2 = target2['cardio']
X2nope = X2[X2['smoke']==0]
target3 = cardiodf[cardiodf['smoke']==0]
target3 = target3['cardio']
print("Smoke: yes vs. no")
predict2 = cross_val_predict(trees, X2smoke, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2nope, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

X2alco = X2[X2['alco']==1]
target2 = cardiodf[cardiodf['alco']==1]
target2 = target2['cardio']
X2nalco = X2[X2['alco']==0]
target3 = cardiodf[cardiodf['alco']==0]
target3 = target3['cardio']
print("Alcohol: yes vs. no")
predict2 = cross_val_predict(trees, X2alco, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2nalco, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

X21gluc = X[X['gluc']==1]
target2 = cardiodf[cardiodf['gluc']==1]
target2 = target2['cardio']
X22gluc = X[X['gluc']==2]
target3 = cardiodf[cardiodf['gluc']==2]
target3 = target3['cardio']
X23gluc = X[X['gluc']==3]
target4 = cardiodf[cardiodf['gluc']==3]
target4 = target4['cardio']
print("Glucose: ok, high, vs very high")
predict2 = cross_val_predict(trees, X21gluc, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X22gluc, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

predict4 = cross_val_predict(trees, X23gluc, target4, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target4, predict4)
f1 = f1_score(target4, predict4, average='weighted')
print(confusion)
print(f1)

X2act = X2[X2['active']==1]
target2 = cardiodf[cardiodf['active']==1]
target2 = target2['cardio']
X2nact = X2[X2['active']==0]
target3 = cardiodf[cardiodf['active']==0]
target3 = target3['cardio']
print("Active: yes vs. no")
nextbest = trees.fit(X2act.values, target2)
predict2 = cross_val_predict(trees, X2act, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

nextbest = trees.fit(X2nact.values, target3)
predict3 = cross_val_predict(trees, X2nact, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

X2gen1 = X2[X2['gender_1']==1]
target2 = X9[X9['gender_1']==1]
target2 = target2['cardio']
X2gen2 = X2[X2['gender_1']==0]
target3 = X9[X9['gender_1']==0]
target3 = target3['cardio']
print("Gender 1 vs. Gender 2")
predict2 = cross_val_predict(trees, X2gen1, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

predict3 = cross_val_predict(trees, X2gen2, target3, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target3, predict3)
f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

print("Double Trouble??")
X2both = X2[X2['smoke']==1]
X2both = X2both[X2both['alco']==1]
target2 = cardiodf[cardiodf['smoke']==1]
target2 = target2[target2['alco']==1]
target2 = target2['cardio']
#X2without = X2[X2['smoke']==0 or X2['alco']==0]
#target3 = cardiodf[cardiodf['smoke']==0 or cardiodf['alco']==0]
#target3 = target3['cardio']
predict2 = cross_val_predict(trees, X2both, target2, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target2, predict2)
f1 = f1_score(target2, predict2, average='weighted')
print(confusion)
print(f1)

#predict3 = cross_val_predict(trees, X2without, target3, cv=fold, n_jobs=-1)
#confusion = confusion_matrix(target3, predict3)
#f1 = f1_score(target3, predict3, average='weighted')
print(confusion)
print(f1)

In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#param_list = {'C' : [0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]}
param_list = {'C' : [800000, 900000, 1000000, 1100000, 1200000]}
model = SVC(kernel='rbf',C=1000000)
target = cardiodf['cardio']
model.fit(X,target)
predict2 = cross_val_predict(model, X, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')
print(confusion)
print(f1)
print(X)
#search = GridSearchCV(model, param_list, cv=5, n_jobs=-1)
#search.fit(X,target)
#print(search.best_params_)

[[26421  8600]
 [10301 24678]]
0.7298222353833029
         age  height  weight  ap_hi  ap_lo  cholesterol  gluc    BMI  smoke  \
0      18393     168    62.0    110     80            1     1  21.97      0   
1      20228     156    85.0    140     90            3     1  34.93      0   
2      18857     165    64.0    130     70            3     1  23.51      0   
3      17623     169    82.0    150    100            1     1  28.71      0   
4      17474     156    56.0    100     60            1     1  23.01      0   
...      ...     ...     ...    ...    ...          ...   ...    ...    ...   
69995  19240     168    76.0    120     80            1     1  26.93      1   
69996  22601     158   126.0    140     90            2     2  50.47      0   
69997  19066     183   105.0    180     90            3     1  31.35      0   
69998  22431     163    72.0    135     80            1     2  27.10      0   
69999  20540     170    72.0    120     80            2     1  24.91      0   

 

In [15]:
model = SVC(kernel='rbf',C=1000000)
model.fit(X,target)
predict2 = cross_val_predict(model, X, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')
print(confusion)
print(f1)
print(X)

[[26428  8593]
 [10302 24677]]
0.729906478383598
         age  height  weight  ap_hi  ap_lo  cholesterol  gluc    BMI  smoke  \
0      18393     168    62.0    110     80            1     1  21.97      0   
1      20228     156    85.0    140     90            3     1  34.93      0   
2      18857     165    64.0    130     70            3     1  23.51      0   
3      17623     169    82.0    150    100            1     1  28.71      0   
4      17474     156    56.0    100     60            1     1  23.01      0   
...      ...     ...     ...    ...    ...          ...   ...    ...    ...   
69995  19240     168    76.0    120     80            1     1  26.93      1   
69996  22601     158   126.0    140     90            2     2  50.47      0   
69997  19066     183   105.0    180     90            3     1  31.35      0   
69998  22431     163    72.0    135     80            1     2  27.10      0   
69999  20540     170    72.0    120     80            2     1  24.91      0   

  

In [None]:
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
pca = PCA(n_components = 5)
pca.fit(X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
pca2 = PCA(n_components = 2)
pca2.fit(X)
components = pca2.transform(X)
print(components.shape)
predict = cross_val_predict(trees, components, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict)
f1 = f1_score(target, predict, average='weighted')
print(confusion)
print(f1)

In [9]:
from sklearn.cluster import KMeans
param_list = {'n_clusters' : [2, 3, 4, 5, 10, 20, 50, 75, 100]
cluster = KMeans(n_clusters=100)
target = cardiodf['cardio']
cluster.fit(X2,target)
predict2 = cross_val_predict(cluster, X2, target, cv=fold, n_jobs=-1)
confusion = confusion_matrix(target, predict2)
f1 = f1_score(target, predict2, average='weighted')
print(confusion)
print(f1)
search = GridSearchCV(cluster, param_list, cv=5, n_jobs=-1)
search.fit(X2,target)
print(search.best_params_)



[[649 419 553 ... 246 564 574]
 [605 529 459 ... 271 585 496]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
0.03261731172106268




{'n_clusters': 150}
