In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn import preprocessing

  from numpy.core.umath_tests import inner1d


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load X (used below as a DataFrame) from the preprocessed pickle.
# read_pickle can execute arbitrary code while unpickling, so this path must
# only ever point at a file that is trusted.
X = pd.read_pickle('../../data/preprocessed/merged/mergedX.pkl')

In [14]:
# Names of the columns considered sensitive; every name that is NOT a column
# of X is printed, one per line.
sensitive_variables = [
    'RIAGENDR', 'RIDAGEYR', 'DMQMILIZ', 'DMQADFC', 'DMDBORN4', 'DMDCITZN',
    'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL', 'RIDEXPRG', 'SIALANG', 'SIAINTRP',
    'FIALANG', 'FIAINTRP', 'MIALANG', 'MIAINTRP', 'AIALANGA', 'DMDHRGND',
    'DMDHRBR4', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR',
]
# One boolean per entry of sensitive_variables, in the same order.
exists = [var in X.columns for var in sensitive_variables]
for name, present in zip(sensitive_variables, exists):
    if not present:
        print(name)

DMQMILIZ
DMQADFC
DMDEDUC3
DMDEDUC2
DMDMARTL
RIDEXPRG
AIALANGA


In [4]:
def encoding_string_labels(df):
    """Label-encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are converted to integer codes.
        The frame itself is left untouched.

    Returns
    -------
    tuple
        ``(encoded, label_encoders)``: ``encoded`` is a copy of ``df`` in which
        each object-dtype column has been replaced by its encoded values, and
        ``label_encoders`` maps each such column name to the fitted
        ``LabelEncoder`` so the codes can be inverted later.
    """
    # Encode a copy rather than the caller's frame. Mutating the input made the
    # cell non-idempotent: on a second run the object columns were already
    # gone, so the returned encoder dict came back empty.
    encoded = df.copy()
    label_encoders = {}
    for column in encoded.select_dtypes(include=['object']).columns:
        encoder = preprocessing.LabelEncoder().fit(encoded[column])
        encoded[column] = encoder.transform(encoded[column])
        label_encoders[column] = encoder
    return encoded, label_encoders

In [5]:
# Label-encode the object-dtype columns of X. The second value maps each
# encoded column name to its fitted LabelEncoder.
X_transformed, X_transformed_labels = encoding_string_labels(X)

In [7]:
# Drop RIAGENDR and RIDAGEYR (going by the variable name, the gender and age
# columns). The other names listed in sensitive_variables are not dropped here.
X_no_age_and_gender = X_transformed.drop(columns=['RIAGENDR', 'RIDAGEYR'])

In [8]:
# Load Y and align it row-for-row with X_no_age_and_gender.
Y = pd.read_csv('../../data/preprocessed/merged/Y.csv')
# Right-join on the SEQN key column only. Joining the whole feature frame just
# to drop its columns again is wasteful, and it raises KeyError if Y and X ever
# share a non-key column name (the merge would suffix them with _x/_y).
Y = Y.merge(X_no_age_and_gender[['SEQN']], how='right', on='SEQN')
# 'Unnamed: 0' is the index column that was written into the CSV.
Y = Y.drop(columns=['Unnamed: 0'])
# Vectorised cast instead of a per-element applymap(int). Like the original,
# it raises if a SEQN in X has no row in Y, because the join leaves NaN there.
Y = Y.astype('int64')
Y

Unnamed: 0,SEQN,LungCancer,Retinopathy,LarynxCancer,KidneyCancer,HeartFailure,Jaundice,ChronicBronchitis,SkinCancerOther,EsophagealCancer,...,SinusInfection,Gonorrhea,Memory,BladderCancer,LeukemiaCancer,Dental Care,FractureWrist,LiverCondition,Melanoma,ProstateCancer
0,73564,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,73566,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,73600,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,73607,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,73613,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,73614,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
6,73615,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,1,0,0,0
7,73616,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
8,73621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
9,73622,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [9]:
# Every column of Y other than the SEQN key is treated as a disease label.
diseases = list(Y.columns)
diseases.remove('SEQN')

# Column sum per disease, ranked from largest to smallest.
count = {disease: Y[disease].sum() for disease in diseases}
sorted_count = sorted(count.items(), key=lambda item: item[1], reverse=True)

# Keep the seven highest-ranked diseases; the remainder is only totalled.
common_diseases, other_disease = sorted_count[:7], sorted_count[7:]
other_disease_count = sum(total for _, total in other_disease)
common_disease_name = [name for name, _ in common_diseases]

# Target frame restricted to the seven most frequent diseases.
Y_most_common = Y.loc[:, common_disease_name]
Y_most_common

Unnamed: 0,Dental Care,Hypertension,HighCholesterol,Overweight,Arthritis,GenericProblem,SinusInfection
0,0,1,0,1,1,0,1
1,1,0,0,0,1,0,1
2,1,1,0,1,0,0,0
3,0,1,0,1,1,1,0
4,0,1,0,1,1,0,1
5,0,0,0,0,0,1,1
6,1,1,1,0,1,1,0
7,1,0,1,0,1,1,1
8,0,0,1,0,0,0,0
9,1,0,1,1,0,0,1


In [12]:
# Cross-validated samples-averaged F1 of a random forest on every label in Y,
# for each (n_estimators, max_depth) combination.
scores_rf = {}
n_estimators = [5]
max_depths = [15, 30, 50, 65, 75, 100]

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed
# in 1.0. columns[1:] skips the first column of each frame: SEQN in Y, and
# presumably SEQN in X too (TODO confirm the column order of X).
# Both matrices are loop-invariant, so they are built once.
rf_features = X_no_age_and_gender[X_no_age_and_gender.columns[1:]].values
rf_targets = Y[Y.columns[1:]].values

for n_estimator in n_estimators:
    for max_depth in max_depths:
        rfc = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=0, n_jobs=-1)
        scores_rf[str(n_estimator) + ' ' + str(max_depth)] = cross_val_score(rfc, rf_features, rf_targets, cv=5, scoring='f1_samples')

# Report the mean fold score and twice the standard deviation across folds.
for key in scores_rf:
    print("n_estimators - max_depths", key, " Fscore: %0.2f (+/- %0.2f)" % (scores_rf[key].mean(), scores_rf[key].std() * 2))

n_estimators - max_depths 5 15  Fscore: 0.10 (+/- 0.04)
n_estimators - max_depths 5 30  Fscore: 0.16 (+/- 0.02)
n_estimators - max_depths 5 50  Fscore: 0.19 (+/- 0.03)
n_estimators - max_depths 5 65  Fscore: 0.20 (+/- 0.03)
n_estimators - max_depths 5 75  Fscore: 0.20 (+/- 0.02)
n_estimators - max_depths 5 100  Fscore: 0.20 (+/- 0.02)


In [13]:
# Same depth grid as the full-label random forest run, restricted to the
# labels in Y_most_common.
scores_rf_common = {}
n_estimators = [5]
max_depths = [15, 30, 50, 65, 75, 100]

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed
# in 1.0. columns[1:] skips the first column of X, presumably SEQN (TODO
# confirm). The matrices are loop-invariant, so they are built once.
rf_common_features = X_no_age_and_gender[X_no_age_and_gender.columns[1:]].values
rf_common_targets = Y_most_common.values

for n_estimator in n_estimators:
    for max_depth in max_depths:
        # random_state is fixed so the depth comparison is reproducible; without
        # it every run draws different trees and the ranking of depths can flip.
        rfc = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=0, n_jobs=-1)
        scores_rf_common[str(n_estimator) + ' ' + str(max_depth)] = cross_val_score(rfc, rf_common_features, rf_common_targets, cv=5, scoring='f1_samples')

# Report the mean fold score and twice the standard deviation across folds.
for key in scores_rf_common:
    print("n_estimators - max_depths", key, " Fscore: %0.2f (+/- %0.2f)" % (scores_rf_common[key].mean(), scores_rf_common[key].std() * 2))

n_estimators - max_depths 5 15  Fscore: 0.28 (+/- 0.02)
n_estimators - max_depths 5 30  Fscore: 0.29 (+/- 0.04)
n_estimators - max_depths 5 50  Fscore: 0.31 (+/- 0.04)
n_estimators - max_depths 5 65  Fscore: 0.30 (+/- 0.03)
n_estimators - max_depths 5 75  Fscore: 0.30 (+/- 0.05)
n_estimators - max_depths 5 100  Fscore: 0.30 (+/- 0.05)


In [15]:
# Cross-validated samples-averaged F1 of k-nearest-neighbours on every label
# in Y, for each candidate k (10 folds).
n_neighbors = [2, 5, 10, 15, 20, 30, 50]
scores_knn = {}

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed
# in 1.0. columns[1:] skips the first column of each frame: SEQN in Y, and
# presumably SEQN in X too (TODO confirm the column order of X).
# Both matrices are loop-invariant, so they are built once.
knn_features = X_no_age_and_gender[X_no_age_and_gender.columns[1:]].values
knn_targets = Y[Y.columns[1:]].values

for n_neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=n_neighbor, n_jobs=-1)
    scores_knn[n_neighbor] = cross_val_score(knn, knn_features, knn_targets, cv=10, scoring='f1_samples')

# Report the mean fold score and twice the standard deviation across folds.
for key in scores_knn:
    print("n_neighbor: ", key, "Fscore: %0.2f (+/- %0.2f)" % (scores_knn[key].mean(), scores_knn[key].std() * 2))


n_neighbor:  2 Fscore: 0.12 (+/- 0.02)
n_neighbor:  5 Fscore: 0.20 (+/- 0.03)
n_neighbor:  10 Fscore: 0.13 (+/- 0.02)
n_neighbor:  15 Fscore: 0.17 (+/- 0.02)
n_neighbor:  20 Fscore: 0.13 (+/- 0.02)
n_neighbor:  30 Fscore: 0.12 (+/- 0.02)
n_neighbor:  50 Fscore: 0.11 (+/- 0.02)


In [16]:
# Per-disease k-nearest-neighbours classifiers for the most common diseases,
# scored with cross_val_score's default scorer (reported as accuracy below).
n_neighbors = [5]
scores_disease = {}

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed
# in 1.0. columns[1:] skips the first column of X, presumably SEQN (TODO
# confirm). The feature matrix does not change per disease, so build it once.
knn_disease_features = X_no_age_and_gender[X_no_age_and_gender.columns[1:]].values

for disease in common_disease_name:
    # Use the k configured above instead of a second hard-coded 5.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors[0], n_jobs=-1)
    scores_disease[disease] = cross_val_score(knn, knn_disease_features, Y_most_common[disease].values, cv=5)

# Report the mean fold score and twice the standard deviation across folds.
for disease in scores_disease:
    print("Disease: ", disease, "accuracy score: %0.2f (+/- %0.2f)" % (scores_disease[disease].mean(), scores_disease[disease].std() * 2))

Disease:  Dental Care accuracy score: 0.50 (+/- 0.03)
Disease:  Hypertension accuracy score: 0.49 (+/- 0.04)
Disease:  HighCholesterol accuracy score: 0.50 (+/- 0.03)
Disease:  Overweight accuracy score: 0.54 (+/- 0.05)
Disease:  Arthritis accuracy score: 0.59 (+/- 0.03)
Disease:  GenericProblem accuracy score: 0.64 (+/- 0.02)
Disease:  SinusInfection accuracy score: 0.67 (+/- 0.02)


In [17]:
# Per-disease random forests for the most common diseases, scored with
# cross_val_score's default scorer (reported as accuracy below).
# NOTE: this rebinds scores_disease, replacing whatever it held before.
scores_disease = {}

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and removed
# in 1.0. columns[1:] skips the first column of X, presumably SEQN (TODO
# confirm). The feature matrix does not change per disease, so build it once.
rf_disease_features = X_no_age_and_gender[X_no_age_and_gender.columns[1:]].values

for disease in common_disease_name:
    # random_state is fixed so the reported scores are reproducible across runs.
    rfc = RandomForestClassifier(n_estimators=5, max_depth=50, random_state=0, n_jobs=-1)
    scores_disease[disease] = cross_val_score(rfc, rf_disease_features, Y_most_common[disease].values, cv=5)

# Report the mean fold score and twice the standard deviation across folds.
for disease in scores_disease:
    print("Disease: ", disease, "accuracy score: %0.2f (+/- %0.2f)" % (scores_disease[disease].mean(), scores_disease[disease].std() * 2))

Disease:  Dental Care accuracy score: 0.49 (+/- 0.03)
Disease:  Hypertension accuracy score: 0.49 (+/- 0.03)
Disease:  HighCholesterol accuracy score: 0.51 (+/- 0.06)
Disease:  Overweight accuracy score: 0.55 (+/- 0.03)
Disease:  Arthritis accuracy score: 0.59 (+/- 0.05)
Disease:  GenericProblem accuracy score: 0.63 (+/- 0.03)
Disease:  SinusInfection accuracy score: 0.65 (+/- 0.02)
