In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn import preprocessing

  from numpy.core.umath_tests import inner1d


In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings
# raised by the libraries used below; consider narrowing it to specific
# categories once the notebook runs cleanly.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the merged table that the cells below use as the model input.
# NOTE(review): unpickling can execute arbitrary code, so this file must
# come from a trusted source.
X = pd.read_pickle('../../data/preprocessed/merged/mergedX.pkl')

In [4]:
# Integer-encode every string (object-dtype) column of X in place.
# One LabelEncoder is fitted per column and kept in `label_encoders`,
# keyed by column name.
string_columns = X.select_dtypes(include=['object'])
label_encoders = {
    name: preprocessing.LabelEncoder().fit(string_columns[name])
    for name in string_columns
}
for column, encoder in label_encoders.items():
    # Transform the untouched snapshot of the column, then overwrite X.
    X[column] = encoder.transform(string_columns[column])

In [5]:
# Load the label table and align it with X.
# The right join on SEQN keeps exactly the keys that occur in X. Only X's key
# column takes part in the join, so the feature columns are never copied into
# Y (and then dropped again), and a column name shared by both tables can no
# longer be suffixed by the merge and break the subsequent drop.
Y = pd.read_csv('../../data/preprocessed/merged/Y.csv')
Y = Y.merge(X[['SEQN']], how='right', on='SEQN')

In [6]:
# Remove the 'Unnamed: 0' column (presumably an index written out with the
# CSV - confirm) and cast every remaining value to an integer.
# errors='ignore' keeps this cell re-runnable after the column is already
# gone; astype casts whole columns at once instead of calling int() on each
# element, and still raises if a value cannot be converted (e.g. NaN).
Y = Y.drop(columns=['Unnamed: 0'], errors='ignore')
Y = Y.astype('int64')

In [7]:
# Rank the label columns by their column sum and keep the most frequent
# ones as a reduced target frame.
diseases = Y.columns.tolist()
diseases.remove('SEQN')  # SEQN is the merge key, not a label

# Column sum per label, then (label, total) pairs sorted from largest to smallest.
count = {disease: Y[disease].sum() for disease in diseases}
sorted_count = sorted(count.items(), key=lambda item: item[1], reverse=True)

# Split the ranking into the head (kept) and the tail (everything else).
top_n = 7
common_diseases, other_disease = sorted_count[:top_n], sorted_count[top_n:]
other_disease_count = sum(total for _, total in other_disease)

common_disease_name = [name for name, _ in common_diseases]
Y_most_common = Y.loc[:, common_disease_name]
Y_most_common

Unnamed: 0,Dental Care,Hypertension,HighCholesterol,Overweight,Arthritis,GenericProblem,SinusInfection
0,0,1,0,1,1,0,1
1,1,0,0,0,1,0,1
2,1,1,0,1,0,0,0
3,0,1,0,1,1,1,0
4,0,1,0,1,1,0,1
5,0,0,0,0,0,1,1
6,1,1,1,0,1,1,0
7,1,0,1,0,1,1,1
8,0,0,1,0,0,0,0
9,1,0,1,1,0,0,1


In [9]:
# Sweep a random forest over (n_estimators, max_depth) on all label columns,
# scoring every setting with 5-fold cross-validation and the 'f1_samples'
# scorer. Results are stored under the key "<n_estimators> <max_depth>".
scores_rf = {}
n_estimators = [5, 10, 15, 20]
max_depths = [15, 30, 50, 65, 75, 100]

# The first column of each frame is skipped (presumably the SEQN key - confirm).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# The arrays do not change between iterations, so they are built once.
features = X[X.columns[1:]].values
labels = Y[Y.columns[1:]].values

for n_estimator in n_estimators:
    for max_depth in max_depths:
        rfc = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=0, n_jobs=-1)
        scores_rf[str(n_estimator) + ' ' + str(max_depth)] = cross_val_score(rfc, features, labels, cv=5, scoring='f1_samples')

# Report mean and two standard deviations of the fold scores per setting.
for key in scores_rf:
    print("n_estimators - max_depths", key, " Fscore: %0.2f (+/- %0.2f)" % (scores_rf[key].mean(), scores_rf[key].std() * 2))

n_estimators - max_depths 5 15  Fscore: 0.09 (+/- 0.07)
n_estimators - max_depths 5 30  Fscore: 0.15 (+/- 0.03)
n_estimators - max_depths 5 50  Fscore: 0.19 (+/- 0.03)
n_estimators - max_depths 5 65  Fscore: 0.20 (+/- 0.03)
n_estimators - max_depths 5 75  Fscore: 0.20 (+/- 0.03)
n_estimators - max_depths 5 100  Fscore: 0.21 (+/- 0.03)
n_estimators - max_depths 10 15  Fscore: 0.10 (+/- 0.06)
n_estimators - max_depths 10 30  Fscore: 0.15 (+/- 0.03)
n_estimators - max_depths 10 50  Fscore: 0.17 (+/- 0.02)
n_estimators - max_depths 10 65  Fscore: 0.18 (+/- 0.01)
n_estimators - max_depths 10 75  Fscore: 0.17 (+/- 0.01)
n_estimators - max_depths 10 100  Fscore: 0.15 (+/- 0.02)
n_estimators - max_depths 15 15  Fscore: 0.10 (+/- 0.07)
n_estimators - max_depths 15 30  Fscore: 0.14 (+/- 0.02)
n_estimators - max_depths 15 50  Fscore: 0.17 (+/- 0.02)
n_estimators - max_depths 15 65  Fscore: 0.18 (+/- 0.01)
n_estimators - max_depths 15 75  Fscore: 0.18 (+/- 0.02)
n_estimators - max_depths 15 100  F

In [11]:
# Sweep k-nearest-neighbours over several values of k on all label columns,
# scoring each with 10-fold cross-validation and the 'f1_samples' scorer.
n_neighbors = [2, 5, 10, 15, 20, 30, 50]
scores_knn = {}

# The first column of each frame is skipped (presumably the SEQN key - confirm).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# The arrays do not change between iterations, so they are built once.
features = X[X.columns[1:]].values
labels = Y[Y.columns[1:]].values

for n_neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=n_neighbor, n_jobs=-1)
    scores_knn[n_neighbor] = cross_val_score(knn, features, labels, cv=10, scoring='f1_samples')

# Report mean and two standard deviations of the fold scores per k.
for key in scores_knn:
    print("n_neighbor: ", key, "Fscore: %0.2f (+/- %0.2f)" % (scores_knn[key].mean(), scores_knn[key].std() * 2))


n_neighbor:  2 Fscore: 0.12 (+/- 0.02)
n_neighbor:  5 Fscore: 0.20 (+/- 0.03)
n_neighbor:  10 Fscore: 0.13 (+/- 0.02)
n_neighbor:  15 Fscore: 0.17 (+/- 0.02)
n_neighbor:  20 Fscore: 0.13 (+/- 0.02)
n_neighbor:  30 Fscore: 0.12 (+/- 0.02)
n_neighbor:  50 Fscore: 0.11 (+/- 0.02)


In [12]:
# Same random-forest sweep as for the full label set, but restricted to the
# most common diseases (Y_most_common). 5-fold CV, 'f1_samples' scorer.
scores_rf_common = {}
n_estimators = [5, 10, 15, 20]
max_depths = [15, 30, 50, 65, 75, 100]

# The first column of X is skipped (presumably the SEQN key - confirm).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# The arrays do not change between iterations, so they are built once.
features = X[X.columns[1:]].values
labels_common = Y_most_common.values

for n_estimator in n_estimators:
    for max_depth in max_depths:
        # random_state is fixed, as in the full-label sweep, so that a re-run
        # reproduces the same scores.
        rfc = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=0, n_jobs=-1)
        scores_rf_common[str(n_estimator) + ' ' + str(max_depth)] = cross_val_score(rfc, features, labels_common, cv=5, scoring='f1_samples')

# Report mean and two standard deviations of the fold scores per setting.
for key in scores_rf_common:
    print("n_estimators - max_depths", key, " Fscore: %0.2f (+/- %0.2f)" % (scores_rf_common[key].mean(), scores_rf_common[key].std() * 2))

n_estimators - max_depths 5 15  Fscore: 0.26 (+/- 0.03)
n_estimators - max_depths 5 30  Fscore: 0.30 (+/- 0.04)
n_estimators - max_depths 5 50  Fscore: 0.30 (+/- 0.03)
n_estimators - max_depths 5 65  Fscore: 0.30 (+/- 0.03)
n_estimators - max_depths 5 75  Fscore: 0.30 (+/- 0.04)
n_estimators - max_depths 5 100  Fscore: 0.30 (+/- 0.06)
n_estimators - max_depths 10 15  Fscore: 0.25 (+/- 0.01)
n_estimators - max_depths 10 30  Fscore: 0.25 (+/- 0.04)
n_estimators - max_depths 10 50  Fscore: 0.22 (+/- 0.02)
n_estimators - max_depths 10 65  Fscore: 0.22 (+/- 0.03)
n_estimators - max_depths 10 75  Fscore: 0.23 (+/- 0.01)
n_estimators - max_depths 10 100  Fscore: 0.22 (+/- 0.03)
n_estimators - max_depths 15 15  Fscore: 0.24 (+/- 0.03)
n_estimators - max_depths 15 30  Fscore: 0.26 (+/- 0.04)
n_estimators - max_depths 15 50  Fscore: 0.28 (+/- 0.02)
n_estimators - max_depths 15 65  Fscore: 0.28 (+/- 0.03)
n_estimators - max_depths 15 75  Fscore: 0.28 (+/- 0.03)
n_estimators - max_depths 15 100  F

In [13]:
# Same k-nearest-neighbours sweep as for the full label set, but restricted
# to the most common diseases (Y_most_common). 10-fold CV, 'f1_samples' scorer.
n_neighbors = [2, 5, 10, 15, 20, 30, 50]
scores_knn_common = {}

# The first column of X is skipped (presumably the SEQN key - confirm).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# The arrays do not change between iterations, so they are built once.
features = X[X.columns[1:]].values
labels_common = Y_most_common.values

for n_neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=n_neighbor, n_jobs=-1)
    scores_knn_common[n_neighbor] = cross_val_score(knn, features, labels_common, cv=10, scoring='f1_samples')

# Report mean and two standard deviations of the fold scores per k.
for key in scores_knn_common:
    print("n_neighbor: ", key, "Fscore: %0.2f (+/- %0.2f)" % (scores_knn_common[key].mean(), scores_knn_common[key].std() * 2))


n_neighbor:  2 Fscore: 0.17 (+/- 0.04)
n_neighbor:  5 Fscore: 0.29 (+/- 0.06)
n_neighbor:  10 Fscore: 0.20 (+/- 0.04)
n_neighbor:  15 Fscore: 0.25 (+/- 0.05)
n_neighbor:  20 Fscore: 0.20 (+/- 0.03)
n_neighbor:  30 Fscore: 0.19 (+/- 0.03)
n_neighbor:  50 Fscore: 0.18 (+/- 0.02)


In [11]:
# One-vs-rest RBF-kernel SVC on the most common diseases, 10-fold CV with the
# 'f1_samples' scorer. Original author's note: "can't converge".
# NOTE(review): the features are passed to the RBF kernel without scaling;
# that may contribute to the poor fit - confirm.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

ovsr = OneVsRestClassifier(SVC(kernel='rbf'), n_jobs=-1)
# The first column of X is skipped (presumably the SEQN key - confirm).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
scores_ovsr = cross_val_score(ovsr, X[X.columns[1:]].values, Y_most_common.values, cv=10, scoring='f1_samples')

print("Fscore: %0.2f (+/- %0.2f)" % (scores_ovsr.mean(), scores_ovsr.std() * 2))

Fscore: 0.02 (+/- 0.09)


In [13]:
# Disabled experiment: a RadiusNeighborsClassifier evaluation followed by an
# older copy of the KNN sweep. Every line in this cell is commented out, so
# running it has no effect.
# NOTE(review): if this is re-enabled, the print line indexes
# `scores_rnn[key]`, but `scores_rnn` would be the score array returned by
# cross_val_score and `key` is not assigned in this cell; `scores_rnn.std()`
# is probably what was meant.
# from sklearn.neighbors import RadiusNeighborsClassifier
# # n_neighbors = [2, 5, 10, 15, 30, 50, 100]
# scores_rnn = {}
# radiuses = [2.0]
# weights = ['uniform', 'distance'],
# algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
# rnn = RadiusNeighborsClassifier(radius = 100.0, weights='uniform')
# scores_rnn = cross_val_score(rnn, X[X.columns[1:]].as_matrix(), Y[Y.columns[1:]].as_matrix(), cv=10, scoring='f1_samples')
# print("Fscore: %0.2f (+/- %0.2f)" % (scores_rnn.mean(), scores_rnn[key].std() * 2))
# # for n_neighbor in n_neighbors:
# #     knn = KNeighborsClassifier(n_neighbors=n_neighbor)
# #     scores_knn[n_neighbor] = cross_val_score(knn, X[X.columns[1:]].as_matrix(), Y[Y.columns[1:]].as_matrix(), cv=10, scoring='f1_samples')
# # for key in scores_knn:
# #     print("n_neighbor: ", key, "Fscore: %0.2f (+/- %0.2f)" % (scores_knn[key].mean(), scores_knn[key].std() * 2))


In [16]:
# Fit one binary k-nearest-neighbours classifier per common disease and
# record its 5-fold cross-validated score (the estimator's default scorer,
# reported below as accuracy).
n_neighbors = [5]
scores_disease = {}

# The first column of X is skipped (presumably the SEQN key - confirm).
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
# The feature array is the same for every disease, so it is built once.
features = X[X.columns[1:]].values

for disease in common_disease_name:
    # Read k from n_neighbors so the setting above is what is actually used.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors[0], n_jobs=-1)
    scores_disease[disease] = cross_val_score(knn, features, Y_most_common[disease].values, cv=5)

# Report mean and two standard deviations of the fold scores per disease.
for disease in scores_disease:
    print("Disease: ", disease, "accuracy score: %0.2f (+/- %0.2f)" % (scores_disease[disease].mean(), scores_disease[disease].std() * 2))

Disease:  Dental Care accuracy score: 0.50 (+/- 0.03)
Disease:  Hypertension accuracy score: 0.49 (+/- 0.04)
Disease:  HighCholesterol accuracy score: 0.50 (+/- 0.03)
Disease:  Overweight accuracy score: 0.54 (+/- 0.05)
Disease:  Arthritis accuracy score: 0.59 (+/- 0.03)
Disease:  GenericProblem accuracy score: 0.64 (+/- 0.02)
Disease:  SinusInfection accuracy score: 0.67 (+/- 0.02)


In [17]:
# Fit one binary random forest per common disease and record its 5-fold
# cross-validated score (the estimator's default scorer, reported below as
# accuracy).
scores_disease = {}

# The first column of X is skipped (presumably the SEQN key - confirm).
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
# The feature array is the same for every disease, so it is built once.
features = X[X.columns[1:]].values

for disease in common_disease_name:
    # random_state is fixed so that a re-run reproduces the same scores.
    rfc = RandomForestClassifier(n_estimators=5, max_depth=50, random_state=0, n_jobs=-1)
    scores_disease[disease] = cross_val_score(rfc, features, Y_most_common[disease].values, cv=5)

# Report mean and two standard deviations of the fold scores per disease.
for disease in scores_disease:
    print("Disease: ", disease, "accuracy score: %0.2f (+/- %0.2f)" % (scores_disease[disease].mean(), scores_disease[disease].std() * 2))

Disease:  Dental Care accuracy score: 0.50 (+/- 0.03)
Disease:  Hypertension accuracy score: 0.52 (+/- 0.04)
Disease:  HighCholesterol accuracy score: 0.51 (+/- 0.02)
Disease:  Overweight accuracy score: 0.52 (+/- 0.04)
Disease:  Arthritis accuracy score: 0.59 (+/- 0.04)
Disease:  GenericProblem accuracy score: 0.63 (+/- 0.03)
Disease:  SinusInfection accuracy score: 0.65 (+/- 0.01)


In [16]:
# Y[Y.columns[1:]].iloc[[0]].as_matrix()