# Preparing dataset

In [2]:
import pyreadstat
from datascience import *
import pandas as pd 
import math
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import confusion_matrix 
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn import neighbors
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from matplotlib.colors import ListedColormap
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plots
%matplotlib inline
from ipywidgets import interact, interactive, fixed, interact_manual
from mpl_toolkits import mplot3d

In [7]:
data, meta = pyreadstat.read_sav("raw_data_SIREN.sav")
def df_interact(df):
    '''
    Outputs sliders that show rows and columns of df
    '''
    def peek(row=0, col=0):
        return df.iloc[row:row + 7, col:col + 7]
    interact(peek, row=(0, len(df), 7), col=(0, len(df.columns) - 7))
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))
df_interact(data)

interactive(children=(IntSlider(value=0, description='row', max=4236, step=7), IntSlider(value=0, description=…

(4236 rows, 34 columns) total


# Classifying stroke vs. control

In [75]:
#Specifying variables we want and dropping rows with null values
y_all = data["case_control"]
x_all = data.drop(["pid", "match_dyad_id", "case_control", "life_activity_hrs_tot_R2", "qvsfs_09", \
            "stroke4_1", "stroke4_2", "stroke4_3", "stroke4_4","stroke4_5", "stroke4_6", \
            "stroke4_7", "stroke4_8", "stroke4_9", "stroke4_10","stroke4_11", "ct3"], axis=1)
df_all = pd.concat([x_all, y_all], axis=1).dropna()
y_all = df_all["case_control"]
x_all = df_all.drop(["case_control"], axis=1)
df_interact(x_all)


interactive(children=(IntSlider(value=0, description='row', max=3159, step=7), IntSlider(value=0, description=…

(3159 rows, 17 columns) total


In [81]:
#Logistic regression
lst_lr = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_all, y_all, test_size=(i+10)/100, random_state=0)
    logisticRegr = lr()
    logisticRegr.fit(x_train, y_train)
    z = logisticRegr.score(x_test, y_test)
    lst_lr.append([(i+10)/100, z])

lst_lr



[[0.1, 0.740506329113924],
 [0.11, 0.7327586206896551],
 [0.12, 0.7236842105263158],
 [0.13, 0.7226277372262774],
 [0.14, 0.7268623024830699],
 [0.15, 0.7194092827004219],
 [0.16, 0.717391304347826],
 [0.17, 0.7156133828996283],
 [0.18, 0.7065026362038664],
 [0.19, 0.7088186356073212],
 [0.2, 0.7104430379746836],
 [0.21, 0.7183734939759037],
 [0.22, 0.7136690647482015],
 [0.23, 0.7180192572214581],
 [0.24, 0.7101449275362319],
 [0.25, 0.7050632911392405],
 [0.26, 0.7031630170316302],
 [0.27, 0.7057444314185228],
 [0.28, 0.7073446327683616],
 [0.29, 0.7044711014176663],
 [0.3, 0.70042194092827],
 [0.31, 0.7030612244897959],
 [0.32, 0.7042532146389713],
 [0.33, 0.6999041227229147],
 [0.34, 0.7004651162790698],
 [0.35, 0.705244122965642],
 [0.36, 0.70298769771529],
 [0.37, 0.7023096663815227],
 [0.38, 0.7077435470441299],
 [0.39, 0.7104622871046229],
 [0.4, 0.7080696202531646],
 [0.41, 0.7075617283950617],
 [0.42, 0.7015825169555389],
 [0.43, 0.7041942604856513],
 [0.44, 0.705035971223021

In [80]:
#KNN
lst_knn = []

for j in range (100):
    n_neighbors = j+1
    for i in range(81):
        x_train, x_test, y_train, y_test = tts(x_all, y_all, test_size=(i+10)/100, random_state=0)
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
        clf.fit(x_train, y_train)
        z = clf.score(x_test, y_test)
        lst_knn.append([n_neighbors,(i+10)/100, z])

max_accuracy = 0
best_knn = None

for x in lst_knn:
    if x[2] >= max_accuracy:
        max_accuracy = x[2]
        best_knn = x

print(best_knn, lst_knn)

[41, 0.34, 0.627906976744186] [[1, 0.1, 0.5569620253164557], [1, 0.11, 0.5517241379310345], [1, 0.12, 0.5578947368421052], [1, 0.13, 0.5498783454987834], [1, 0.14, 0.54627539503386], [1, 0.15, 0.5485232067510548], [1, 0.16, 0.5513833992094862], [1, 0.17, 0.5594795539033457], [1, 0.18, 0.5536028119507909], [1, 0.19, 0.5524126455906821], [1, 0.2, 0.5490506329113924], [1, 0.21, 0.5557228915662651], [1, 0.22, 0.5496402877697841], [1, 0.23, 0.5419532324621733], [1, 0.24, 0.541501976284585], [1, 0.25, 0.549367088607595], [1, 0.26, 0.5486618004866181], [1, 0.27, 0.5568581477139508], [1, 0.28, 0.5570621468926553], [1, 0.29, 0.5528898582333697], [1, 0.3, 0.5527426160337553], [1, 0.31, 0.5561224489795918], [1, 0.32, 0.559841740850643], [1, 0.33, 0.5637583892617449], [1, 0.34, 0.5609302325581396], [1, 0.35, 0.5587703435804702], [1, 0.36, 0.5615114235500879], [1, 0.37, 0.5637296834901625], [1, 0.38, 0.5670274771024146], [1, 0.39, 0.5660989456609895], [1, 0.4, 0.571993670886076], [1, 0.41, 0.566358

In [77]:
#SVM
lst_svm = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_all, y_all, test_size=(i+10)/100, random_state=0)
    sv = svm.SVC()
    sv.fit(x_train, y_train)
    z = sv.score(x_test, y_test)
    lst_svm.append([(i+10)/100, z])

lst_svm



[[0.1, 0.5727848101265823],
 [0.11, 0.5890804597701149],
 [0.12, 0.5868421052631579],
 [0.13, 0.583941605839416],
 [0.14, 0.5756207674943566],
 [0.15, 0.5759493670886076],
 [0.16, 0.5691699604743083],
 [0.17, 0.5631970260223048],
 [0.18, 0.5764499121265377],
 [0.19, 0.589018302828619],
 [0.2, 0.5854430379746836],
 [0.21, 0.5978915662650602],
 [0.22, 0.5928057553956835],
 [0.23, 0.5873452544704264],
 [0.24, 0.5902503293807642],
 [0.25, 0.5886075949367089],
 [0.26, 0.5924574209245742],
 [0.27, 0.6025791324736225],
 [0.28, 0.5932203389830508],
 [0.29, 0.6019629225736096],
 [0.3, 0.6012658227848101],
 [0.31, 0.6030612244897959],
 [0.32, 0.6013847675568744],
 [0.33, 0.6040268456375839],
 [0.34, 0.6111627906976744],
 [0.35, 0.608499095840868],
 [0.36, 0.6098418277680141],
 [0.37, 0.6013686911890505],
 [0.38, 0.6003330557868443],
 [0.39, 0.5928629359286294],
 [0.4, 0.5965189873417721],
 [0.41, 0.5941358024691358],
 [0.42, 0.5930670685757348],
 [0.43, 0.5894039735099338],
 [0.44, 0.58992805755

In [78]:
#Decision tree
lst_dt = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_all, y_all, test_size=(i+10)/100, random_state=0)
    tr = tree.DecisionTreeClassifier()
    tr = tr.fit(x_train, y_train)
    z = tr.score(x_test, y_test)
    lst_dt.append([(i+10)/100, z])

lst_dt

[[0.1, 0.680379746835443],
 [0.11, 0.6120689655172413],
 [0.12, 0.6105263157894737],
 [0.13, 0.6326034063260341],
 [0.14, 0.6297968397291196],
 [0.15, 0.6286919831223629],
 [0.16, 0.6403162055335968],
 [0.17, 0.6133828996282528],
 [0.18, 0.6274165202108963],
 [0.19, 0.6256239600665557],
 [0.2, 0.6392405063291139],
 [0.21, 0.6340361445783133],
 [0.22, 0.6287769784172662],
 [0.23, 0.6492434662998624],
 [0.24, 0.6574440052700923],
 [0.25, 0.6911392405063291],
 [0.26, 0.6593673965936739],
 [0.27, 0.6553341148886284],
 [0.28, 0.655367231638418],
 [0.29, 0.6423118865866958],
 [0.3, 0.6413502109704642],
 [0.31, 0.6530612244897959],
 [0.32, 0.6429277942631059],
 [0.33, 0.6423777564717162],
 [0.34, 0.6465116279069767],
 [0.35, 0.6528028933092225],
 [0.36, 0.6520210896309314],
 [0.37, 0.6355859709153122],
 [0.38, 0.6477935054121565],
 [0.39, 0.6536901865369019],
 [0.4, 0.6511075949367089],
 [0.41, 0.6504629629629629],
 [0.42, 0.6322532027128862],
 [0.43, 0.6548933038999264],
 [0.44, 0.6460431654

In [79]:
#Forests of randomized trees
lst_rt = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_all, y_all, test_size=(i+10)/100, random_state=0)
    rfc = RandomForestClassifier(n_estimators=100)
    rfc = rfc.fit(x_train, y_train)
    z = rfc.score(x_test, y_test)
    lst_rt.append([(i+10)/100, z])

lst_rt

[[0.1, 0.7215189873417721],
 [0.11, 0.6954022988505747],
 [0.12, 0.6973684210526315],
 [0.13, 0.7274939172749392],
 [0.14, 0.6975169300225733],
 [0.15, 0.7025316455696202],
 [0.16, 0.7035573122529645],
 [0.17, 0.7026022304832714],
 [0.18, 0.7065026362038664],
 [0.19, 0.7254575707154742],
 [0.2, 0.7088607594936709],
 [0.21, 0.7213855421686747],
 [0.22, 0.7194244604316546],
 [0.23, 0.71939477303989],
 [0.24, 0.7246376811594203],
 [0.25, 0.7126582278481013],
 [0.26, 0.7214111922141119],
 [0.27, 0.7174677608440797],
 [0.28, 0.7141242937853107],
 [0.29, 0.7219193020719739],
 [0.3, 0.7194092827004219],
 [0.31, 0.7142857142857143],
 [0.32, 0.7101879327398615],
 [0.33, 0.7190795781399808],
 [0.34, 0.7302325581395349],
 [0.35, 0.7160940325497287],
 [0.36, 0.7231985940246046],
 [0.37, 0.7142857142857143],
 [0.38, 0.7260616153205662],
 [0.39, 0.7088402270884022],
 [0.4, 0.7231012658227848],
 [0.41, 0.7199074074074074],
 [0.42, 0.7196684250188395],
 [0.43, 0.7189109639440765],
 [0.44, 0.7143884892

# Classifying stroke type 

In [25]:
#Specifying variables we want and dropping rows with null values
y_s = data["ct3"]
x_s = data.drop(["pid", "match_dyad_id", "case_control", "life_activity_hrs_tot_R2", "qvsfs_09", "ct3"], axis=1)
df_s = pd.concat([x_s, y_s], axis=1).dropna()
y_s = df_s["ct3"]
x_s = df_s.drop(["ct3"], axis=1)
df_interact(df_s)

interactive(children=(IntSlider(value=0, description='row', max=1188, step=7), IntSlider(value=0, description=…

(1188 rows, 29 columns) total


In [68]:
#Logistic regression
lst_lr = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_s, y_s, test_size=(i+10)/100, random_state=0)
    logisticRegr = lr()
    logisticRegr.fit(x_train, y_train)
    z = logisticRegr.score(x_test, y_test)
    lst_lr.append([(i+10)/100, z])

lst_lr



[[0.1, 0.6722689075630253],
 [0.11, 0.6793893129770993],
 [0.12, 0.6643356643356644],
 [0.13, 0.6774193548387096],
 [0.14, 0.6586826347305389],
 [0.15, 0.6759776536312849],
 [0.16, 0.6649214659685864],
 [0.17, 0.6782178217821783],
 [0.18, 0.6822429906542056],
 [0.19, 0.672566371681416],
 [0.2, 0.680672268907563],
 [0.21, 0.676],
 [0.22, 0.6603053435114504],
 [0.23, 0.6532846715328468],
 [0.24, 0.6713286713286714],
 [0.25, 0.6734006734006734],
 [0.26, 0.6796116504854369],
 [0.27, 0.6791277258566978],
 [0.28, 0.6786786786786787],
 [0.29, 0.6782608695652174],
 [0.3, 0.6582633053221288],
 [0.31, 0.6558265582655827],
 [0.32, 0.6561679790026247],
 [0.33, 0.6666666666666666],
 [0.34, 0.6683168316831684],
 [0.35, 0.65625],
 [0.36, 0.6682242990654206],
 [0.37, 0.6727272727272727],
 [0.38, 0.6703539823008849],
 [0.39, 0.6702586206896551],
 [0.4, 0.6785714285714286],
 [0.41, 0.6782786885245902],
 [0.42, 0.6833667334669339],
 [0.43, 0.675146771037182],
 [0.44, 0.6826003824091779],
 [0.45, 0.691588

In [69]:
#KNN
lst_knn = []

for j in range (100):
    n_neighbors = j+1
    for i in range(81):
        x_train, x_test, y_train, y_test = tts(x_s, y_s, test_size=(i+10)/100, random_state=0)
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
        clf.fit(x_train, y_train)
        z = clf.score(x_test, y_test)
        lst_knn.append([n_neighbors,(i+10)/100, z])

max_accuracy = 0
best_knn = None

for x in lst_knn:
    if x[2] >= max_accuracy:
        max_accuracy = x[2]
        best_knn = x

lst_knn


[[1, 0.1, 0.5798319327731093],
 [1, 0.11, 0.5801526717557252],
 [1, 0.12, 0.5944055944055944],
 [1, 0.13, 0.5935483870967742],
 [1, 0.14, 0.6047904191616766],
 [1, 0.15, 0.5810055865921788],
 [1, 0.16, 0.581151832460733],
 [1, 0.17, 0.5792079207920792],
 [1, 0.18, 0.5794392523364486],
 [1, 0.19, 0.5707964601769911],
 [1, 0.2, 0.5756302521008403],
 [1, 0.21, 0.58],
 [1, 0.22, 0.5916030534351145],
 [1, 0.23, 0.5802919708029197],
 [1, 0.24, 0.5909090909090909],
 [1, 0.25, 0.5925925925925926],
 [1, 0.26, 0.598705501618123],
 [1, 0.27, 0.5950155763239875],
 [1, 0.28, 0.5975975975975976],
 [1, 0.29, 0.5971014492753624],
 [1, 0.3, 0.5994397759103641],
 [1, 0.31, 0.5907859078590786],
 [1, 0.32, 0.5826771653543307],
 [1, 0.33, 0.5928753180661578],
 [1, 0.34, 0.6014851485148515],
 [1, 0.35, 0.6009615384615384],
 [1, 0.36, 0.602803738317757],
 [1, 0.37, 0.6],
 [1, 0.38, 0.5995575221238938],
 [1, 0.39, 0.5926724137931034],
 [1, 0.4, 0.6050420168067226],
 [1, 0.41, 0.6086065573770492],
 [1, 0.42, 0

In [70]:
best_knn

[88, 0.64, 0.6885676741130092]

In [66]:
#SVM
lst_svm = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_s, y_s, test_size=(i+10)/100, random_state=0)
    sv = svm.SVC()
    sv.fit(x_train, y_train)
    z = sv.score(x_test, y_test)
    lst_svm.append([(i+10)/100, z])

lst_svm



[[0.1, 0.7058823529411765],
 [0.11, 0.6793893129770993],
 [0.12, 0.6713286713286714],
 [0.13, 0.6580645161290323],
 [0.14, 0.6646706586826348],
 [0.15, 0.6480446927374302],
 [0.16, 0.643979057591623],
 [0.17, 0.6336633663366337],
 [0.18, 0.6495327102803738],
 [0.19, 0.6460176991150443],
 [0.2, 0.634453781512605],
 [0.21, 0.64],
 [0.22, 0.6374045801526718],
 [0.23, 0.6459854014598541],
 [0.24, 0.6503496503496503],
 [0.25, 0.6531986531986532],
 [0.26, 0.656957928802589],
 [0.27, 0.6479750778816199],
 [0.28, 0.6576576576576577],
 [0.29, 0.6550724637681159],
 [0.3, 0.6498599439775911],
 [0.31, 0.6449864498644986],
 [0.32, 0.6430446194225722],
 [0.33, 0.6539440203562341],
 [0.34, 0.6608910891089109],
 [0.35, 0.65625],
 [0.36, 0.6635514018691588],
 [0.37, 0.6659090909090909],
 [0.38, 0.6703539823008849],
 [0.39, 0.6745689655172413],
 [0.4, 0.6785714285714286],
 [0.41, 0.680327868852459],
 [0.42, 0.6713426853707415],
 [0.43, 0.6712328767123288],
 [0.44, 0.6673040152963671],
 [0.45, 0.66728971

In [67]:
#Decision tree
lst_dt = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_s, y_s, test_size=(i+10)/100, random_state=0)
    tr = tree.DecisionTreeClassifier()
    tr = tr.fit(x_train, y_train)
    z = tr.score(x_test, y_test)
    lst_dt.append([(i+10)/100, z])

lst_dt

[[0.1, 0.5546218487394958],
 [0.11, 0.5725190839694656],
 [0.12, 0.5874125874125874],
 [0.13, 0.5741935483870968],
 [0.14, 0.6047904191616766],
 [0.15, 0.6312849162011173],
 [0.16, 0.5916230366492147],
 [0.17, 0.6485148514851485],
 [0.18, 0.6448598130841121],
 [0.19, 0.5486725663716814],
 [0.2, 0.5462184873949579],
 [0.21, 0.648],
 [0.22, 0.5992366412213741],
 [0.23, 0.5875912408759124],
 [0.24, 0.6118881118881119],
 [0.25, 0.6127946127946128],
 [0.26, 0.6181229773462783],
 [0.27, 0.6573208722741433],
 [0.28, 0.6186186186186187],
 [0.29, 0.6521739130434783],
 [0.3, 0.6330532212885154],
 [0.31, 0.6368563685636857],
 [0.32, 0.6194225721784777],
 [0.33, 0.6183206106870229],
 [0.34, 0.6163366336633663],
 [0.35, 0.6346153846153846],
 [0.36, 0.6261682242990654],
 [0.37, 0.5954545454545455],
 [0.38, 0.6283185840707964],
 [0.39, 0.584051724137931],
 [0.4, 0.5819327731092437],
 [0.41, 0.6045081967213115],
 [0.42, 0.5771543086172345],
 [0.43, 0.6125244618395304],
 [0.44, 0.5927342256214149],
 [0

In [65]:
#Forests of randomized trees
lst_rt = []

for i in range(81):
    x_train, x_test, y_train, y_test = tts(x_s, y_s, test_size=(i+10)/100, random_state=0)
    rfc = RandomForestClassifier(n_estimators=100)
    rfc = rfc.fit(x_train, y_train)
    z = rfc.score(x_test, y_test)
    lst_rt.append([(i+10)/100, z])

lst_rt

[[0.1, 0.6890756302521008],
 [0.11, 0.7251908396946565],
 [0.12, 0.7132867132867133],
 [0.13, 0.6903225806451613],
 [0.14, 0.6646706586826348],
 [0.15, 0.6871508379888268],
 [0.16, 0.6701570680628273],
 [0.17, 0.6633663366336634],
 [0.18, 0.677570093457944],
 [0.19, 0.6858407079646017],
 [0.2, 0.6890756302521008],
 [0.21, 0.692],
 [0.22, 0.6641221374045801],
 [0.23, 0.6642335766423357],
 [0.24, 0.6853146853146853],
 [0.25, 0.6666666666666666],
 [0.26, 0.6634304207119741],
 [0.27, 0.6635514018691588],
 [0.28, 0.6786786786786787],
 [0.29, 0.6782608695652174],
 [0.3, 0.6638655462184874],
 [0.31, 0.6639566395663956],
 [0.32, 0.6456692913385826],
 [0.33, 0.6717557251908397],
 [0.34, 0.6707920792079208],
 [0.35, 0.6850961538461539],
 [0.36, 0.6658878504672897],
 [0.37, 0.6590909090909091],
 [0.38, 0.6570796460176991],
 [0.39, 0.6702586206896551],
 [0.4, 0.6764705882352942],
 [0.41, 0.6741803278688525],
 [0.42, 0.6933867735470942],
 [0.43, 0.675146771037182],
 [0.44, 0.6978967495219885],
 [0.