In [93]:
#Step 1: Import all necessary libraries
import pandas as pd
import numpy as np
import zipfile 
from prince import FAMD, MCA
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.utils import resample
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [94]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [95]:
#Step 2: Read in DataFrame
# Read in DataFrame
zf = zipfile.ZipFile('../ny.csv.zip') 
zf.namelist() 
df = pd.read_csv(zf.open('ny.csv'),  encoding = 'cp1252')
#df = pd.read_csv('ny.csv', encoding = 'cp1252')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# View first 5 rows of the data

df.head(5)

In [96]:
#Step 3: Use df_clean for cleaning
df_clean = df.copy()

# Repalce 'Not asked or Missing' and 'Data do not meet the criteria for statistical reliability, 
# data quality or confidentiality (data are suppressed)' with NA
for col in df.columns:
    df_clean[col].replace({'Not asked or Missing' : np.nan}, inplace = True)
    df_clean[col].replace({'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)' : np.nan}, inplace = True)
    
# Drop columns with over 80% missing values
df_clean.dropna(axis = 1, thresh = len(df_clean) * .50, inplace = True)

In [97]:
# Step 4: Replace other values in the target variable like 'Don't know/not sure' and 'Refused' 
df_clean['CVDCRHD4'].replace('Don’t know/Not sure', np.nan, inplace=True)
df_clean['CVDCRHD4'].replace('Refused', np.nan, inplace=True)
df_clean.dropna(subset=['CVDCRHD4'], inplace=True)

In [98]:
# Step 5: Encode categorical variables as numeric to calculate correlations
df_clean_categorical = df_clean.copy()
cols = list(df_clean_categorical.columns)
for col in cols:
    if str(df_clean_categorical[col].dtype) == 'object':
        df_clean_categorical[col] = df_clean_categorical[col].astype('category').cat.codes

df_clean.head()

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,CELLSEX,SEXVAR,GENHLTH,...,WTRSOURCE,STRSMEAL,FRUITVEG,MJUSE30,MJNYSMMP,HEPCTEST,HEPCTOLD,_WT2SPLITS,REGION,DSRIPREG
0,New York,1152020,1,15,2020,2020002528,2020002528,,Male,Very good,...,Public Water Supply,Always,,,No,No,No,5130.843243,NYS exclusive of NYC,Long Island
1,New York,1302020,1,30,2020,2020002529,2020002529,,Male,Very good,...,Public Water Supply,Never,,,No,No,No,941.550458,NYS exclusive of NYC,Long Island
2,New York,1152020,1,15,2020,2020002530,2020002530,,Female,Good,...,Public Water Supply,Never,,,No,No,No,3497.366203,NYS exclusive of NYC,Long Island
3,New York,2032020,2,3,2020,2020004509,2020004509,,Female,Good,...,Public Water Supply,Never,Neighborhood,,No,,,1187.70903,NYS exclusive of NYC,Long Island
4,New York,2152020,2,15,2020,2020002531,2020002531,,Male,Fair,...,Don't Know/Not Sure,Rarely,,,No,No,No,13364.387863,NYS exclusive of NYC,Long Island


In [99]:
#Step 7: Create correlation matrix to find which features to use for mca
df_clean_corr = df_clean_categorical.corrwith(df_clean_categorical["CVDCRHD4"])
df_clean_corr_abs = df_clean_corr.abs()
df_clean_corr_abs.sort_values(inplace=True, ascending=False)
df_clean_corr_abs

CVDCRHD4    1.000000
_MICHD      0.772506
CVDINFR4    0.444813
_AGE80      0.186988
_RFHLTH     0.186002
              ...   
_MENT14D    0.003757
HEIGHT3     0.003444
HISPANC3    0.001667
IDAY        0.000190
_STATE           NaN
Length: 164, dtype: float64

In [124]:
#Only use the top 30 columns, excluding columns like _MICHD, CVDINFR4, which ask about heart attack and coronary disease which is very close to heart disease.
feature_list = list(df_clean_corr_abs[0:100].keys())
feature_list.remove('CVDINFR4')
feature_list.remove('_MICHD')
feature_list

['CVDCRHD4',
 '_AGE80',
 '_RFHLTH',
 '_PNEUMO3',
 '_FLSHOT7',
 '_IMPAGE',
 '_AGE_G',
 '_AGEG5YR',
 'DIFFWALK',
 'DIABETE4',
 'CHCCOPD2',
 '_DRDXAR2',
 'HAVARTH4',
 'MEDICARE',
 'SHINGLE2',
 'AGE',
 'BLDSTOL1',
 'FALL12MN',
 'CVDSTRK3',
 '_ALTETH3',
 'EMPLOY1',
 'STOOLDNA',
 'PNEUVAC4',
 'COLNSCPY',
 'SIGMSCPY',
 '_AGE65YR',
 'VIRCOLON',
 'FALLINJ4',
 'CHCKDNY2',
 'HLTHCVR1',
 'BPHIGH4',
 'VETERAN3',
 '_EXTETH3',
 '_RFPSA23',
 '_HCVU651',
 'PHYSHLTH',
 'SMOKE100',
 'DEAF',
 'DIFFDRES',
 '_PHYS14D',
 'CHCSCNCR',
 'DIFFALON',
 '_STSTR',
 'BIRTHSEX',
 'QSTVER',
 'RMVTETH4',
 '_IMPSEX',
 'SEXVAR',
 '_SEX',
 'FLUSHOT7',
 '_STOLDNA',
 'BLIND',
 '_ASTHMS1',
 '_LTASTH1',
 'ASTHMA3',
 'CNCRAGE',
 '_RFBLDS4',
 'MSCODE',
 '_CASTHM1',
 'CHECKUP1',
 'EXERANY2',
 '_WT2SPLITS',
 '_TOTINDA',
 'WTKG3',
 'CHCOCNCR',
 '_RFPAP35',
 '_RFBMI5',
 '_LLCPWT',
 'SOFEMALE',
 '_CRCREC1',
 '_VIRCOLN',
 'DECIDE',
 '_SGMSCPY',
 '_SGMS10Y',
 'USEPNMED',
 '_CLNSCPY',
 '_SBONTIM',
 'MARITAL',
 'CELLSEX',
 'LASTDEN4',
 '

In [126]:
df_clean_columns = df_clean[feature_list]
df_clean_columns.head()

Unnamed: 0,CVDCRHD4,_AGE80,_RFHLTH,_PNEUMO3,_FLSHOT7,_IMPAGE,_AGE_G,_AGEG5YR,DIFFWALK,DIABETE4,...,_RACEPRV,_IMPMRTL,_IMPRACE,DRNK3GE5,_BMI5CAT,_BMI5,ADDEPEV3,_DRNKDRV,WEIGHT2,_AIDTST4
0,No,Imputed Age 65 to 69,Good or Better Health,No,Yes,Age 65 or older,Age 65 or older,Age 65 to 69,No,No,...,"Asian only, non-Hispanic",Married,"Asian, Non-Hispanic",,Obese,1 or greater,No,Have not driven after having too much to drink,Weight (pounds),No
1,No,Imputed Age 55 to 59,Good or Better Health,Age Less Than 65,Age Less Than 65,Age 55 to 64,Age 55 to 64,Age 55 to 59,No,No,...,"White only, non-Hispanic",Married,"White, Non-Hispanic",,Obese,1 or greater,No,Have not driven after having too much to drink,Weight (pounds),Yes
2,No,Imputed Age 80 or older,Good or Better Health,Yes,Yes,Age 65 or older,Age 65 or older,Age 80 or older,Yes,Yes,...,"White only, non-Hispanic",Widowed,"White, Non-Hispanic",,Normal Weight,1 or greater,No,Don’t know/Not Sure/Refused/Missing,Weight (pounds),No
3,No,Imputed Age 80 or older,Good or Better Health,Yes,Yes,Age 65 or older,Age 65 or older,Age 80 or older,Yes,No,...,"White only, non-Hispanic",Widowed,"White, Non-Hispanic",,Overweight,1 or greater,Yes,Have not driven after having too much to drink,Weight (pounds),No
4,No,Imputed Age 40 to 44,Fair or Poor Health,Age Less Than 65,Age Less Than 65,Age 35 to 44,Age 35 to 44,Age 40 to 44,No,Yes,...,Hispanic,Married,Hispanic,,Overweight,1 or greater,Yes,Don’t know/Not Sure/Refused/Missing,Weight (pounds),Yes


In [127]:
from sklearn.model_selection import train_test_split 

# Drop all missing values
df_cleaned = df_clean_columns.dropna(axis = 0).reset_index(drop = True)

# Drop all rows that are Don't know/Not sure or Refused for column we are predicting
df_cleaned = df_cleaned.loc[(df_cleaned['CVDCRHD4'] == 'No') | (df_cleaned['CVDCRHD4'] == 'Yes')]

# Split into X and y
X = df_cleaned.loc[:, df_cleaned.columns != 'CVDCRHD4']
y = df_cleaned['CVDCRHD4']

# Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3,random_state = 42)

In [128]:
#create two different dataframe of majority and minority class 
training_data = pd.DataFrame(X_train)
training_data['CVDCRHD4'] = y_train
df_majority = training_data[(training_data['CVDCRHD4']=='No')] 
df_minority = training_data[(training_data['CVDCRHD4']=='Yes')] 
# upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples= len(df_majority), # to match majority class
                                 random_state=42)  # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
X_train_upsampled = df_upsampled.loc[:, df_cleaned.columns != 'CVDCRHD4']
y_train_upsampled = df_upsampled['CVDCRHD4']

In [129]:
'''
famd = FAMD(n_components = 120, n_iter = 3, random_state = 42)
X_train_transformed = famd.fit_transform(X_train_upsampled)
X_test_transformed = famd.fit_transform(X_test)
X_train_transformed.head()
'''
mca = MCA( n_components=70, n_iter=3, copy=True, check_input=True,engine='auto', random_state=42)
mca = mca.fit(X_train)
X_train_transformed = mca.fit_transform(X_train_upsampled)
X_test_transformed = mca.fit_transform(X_test)
mca.explained_inertia_


[1.1742093853440672e-07,
 5.48664785729208e-08,
 6.845666235227811e-09,
 1.96745736722763e-09,
 1.0548710451065176e-09,
 9.521555974146133e-10,
 8.395453672272274e-10,
 7.60951023923641e-10,
 7.110171326559286e-10,
 6.488504039132843e-10,
 5.836989773919907e-10,
 5.604205618534228e-10,
 5.442676758234076e-10,
 5.09568171211595e-10,
 4.826655419949682e-10,
 4.718881573765984e-10,
 4.446622326528051e-10,
 4.324347172068845e-10,
 4.1289883904034537e-10,
 4.0798275044069166e-10,
 3.997668346756474e-10,
 3.681399461728019e-10,
 3.567574449846628e-10,
 3.35366460810851e-10,
 3.337569324634916e-10,
 3.211703059697415e-10,
 3.112273473324759e-10,
 3.001199834307002e-10,
 2.9379278951939855e-10,
 2.8005877863534716e-10,
 2.7450529361505636e-10,
 2.6655150578787693e-10,
 2.514151495415362e-10,
 2.4320716745923895e-10,
 2.3850139023080344e-10,
 2.3443209786893555e-10,
 2.314333520175221e-10,
 2.160267309877131e-10,
 2.0607138173190927e-10,
 2.0443238163361033e-10,
 2.0001926752123168e-10,
 1.9548

In [105]:
#helper function
def get_performance_scores(y_pred, y_true):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    return [f1, accuracy, precision, recall]

def print_performance_scores(scores):
    print("Accuracy Score = " + str(scores[1]))
    print("Precision Score = " + str(scores[2]))
    print("Recall Score = " + str(scores[3]))
    print("F1 Score = " + str(scores[0]))

In [None]:
RANDOM_SEED = 694
#Run dummy classifier to find the baseline performance
dummy_clf = DummyClassifier(strategy= 'uniform').fit(X_train_transformed,y_train_upsampled)
y_pred = dummy_clf.predict(X_test_transformed)
print_performance_scores(get_performance_scores(y_pred, y_test))

In [106]:
clf_lr = LogisticRegression(random_state = RANDOM_SEED).fit(X_train_transformed, y_train_upsampled)
train_preds = clf_lr.predict(X_test_transformed)
print_performance_scores(get_performance_scores(train_preds, y_test))

Accuracy Score = 0.4641873278236915
Precision Score = 0.466440253481577
Recall Score = 0.39848484848484844
F1 Score = 0.35822438762779774


In [None]:
#Do grid search for hyperparameter tuning
clf = LogisticRegression(random_state = RANDOM_SEED)
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25], 'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1_macro')
my_list = list(range(100))
for x in tqdm(my_list):
    grid_clf_acc.fit(X_train_transformed, y_train_upsampled)

y_pred_acc = grid_clf_acc.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred_acc, y_test))

In [None]:
print(grid_clf_acc.best_estimator_)
print(grid_clf_acc.best_params_)
print(grid_clf_acc.scorer_)
print(grid_clf_acc.best_score_)
print(grid_clf_acc.cv_results_['params'][grid_clf_acc.best_index_])

In [115]:
#LogisticRegression(C=25, penalty='l1', random_state=694, solver='liblinear')
w = {'No' : 0.5, 'Yes' : 0.5}
clf_lr = LogisticRegression(penalty = 'l2', random_state = RANDOM_SEED, class_weight = w, solver = 'liblinear', C = 0.75).fit(X_train_transformed, y_train_upsampled)
train_preds = clf_lr.predict(X_test_transformed)
print_performance_scores(get_performance_scores(train_preds, y_test))

w = {'No' : 0.04, 'Yes' : 0.96}
clf_lr = LogisticRegression(penalty = 'l2', random_state = RANDOM_SEED, class_weight = w, solver = 'liblinear', C = 0.75).fit(X_train_transformed, y_train_upsampled)
#X_train_transformed = famd.fit_transform(X_train)
#X_test_transformed = famd.fit_transform(X_test)
train_preds = (clf_lr.predict_proba(X_test_transformed)[:,1] >= 0.85).astype(int)
train_preds = pd.DataFrame(train_preds, columns = ['val'])
train_preds = train_preds['val'].replace(to_replace = [0, 1], value = ['No', 'Yes'])
print_performance_scores(get_performance_scores(train_preds, y_test))

Accuracy Score = 0.4889807162534435
Precision Score = 0.5298452553401853
Recall Score = 0.5893939393939394
F1 Score = 0.4131320064058568
Accuracy Score = 0.09090909090909091
Precision Score = 0.045454545454545456
Recall Score = 0.5
F1 Score = 0.08333333333333334


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = RandomizedSearchCV(SVC(random_state=RANDOM_SEED),param_grid,refit=True,verbose=2, scoring='f1_macro')

my_list = list(range(100))
for x in tqdm(my_list):
    grid.fit(X_train_transformed[0:10000],y_train_upsampled[0:10000])

print(grid.best_estimator_)
'''
svm = SVC(C=100, gamma=1, kernel='rbf').fit(X_train_transformed,y_train_upsampled)

y_pred_acc = svm.predict(X_test_transformed)
'''
#print_performance_scores(get_performance_scores(y_pred_acc, y_test))

In [108]:
svc_model = SVC(C=100, gamma=0.1, kernel='rbf', random_state=694)
svc_model.fit(X_train_transformed, y_train_upsampled)

y_pred = svc_model.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred, y_test))

Accuracy Score = 0.5550964187327824
Precision Score = 0.4692613203957382
Recall Score = 0.4109848484848485
F1 Score = 0.39669970053409903


In [1]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_SEED )
random_forest.fit(X_train_transformed, y_train_upsampled)

y_pred = random_forest.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred, y_test))

NameError: name 'RandomForestClassifier' is not defined

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier().fit(X_train_transformed, y_train_upsampled)

grid_values = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
 }

rand_search_clf = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_values, n_iter = 10, cv = 3, verbose=2, scoring='f1_macro', random_state=RANDOM_SEED, n_jobs = -1)

my_list = list(range(100))
for x in tqdm(my_list):
    rand_search_clf.fit(X_train_transformed[0:10000], y_train_upsampled[0:10000])
print(rand_search_clf.best_estimator_)

y_pred_rf = rand_search_clf.predict(X_test_transformed)
   


#print_performance_scores(get_performance_scores(y_pred_rf, y_test))

In [None]:
print(rand_search_clf.best_estimator_)
print(rand_search_clf.best_params_)
print(rand_search_clf.scorer_)
print(rand_search_clf.best_score_)
print(rand_search_clf.cv_results_['params'][rand_search_clf.best_index_])
cv_results = pd.DataFrame.from_dict(rand_search_clf.cv_results_)
cv_results

In [2]:
random_forest = RandomForestClassifier(n_estimators= 200, min_samples_split= 2, min_samples_leaf = 1, max_depth = None, bootstrap = False, random_state = RANDOM_SEED )
random_forest.fit(X_train_transformed, y_train_upsampled)

y_pred = random_forest.predict(X_test_transformed)
    
print_performance_scores(get_performance_scores(y_pred, y_test))

NameError: name 'RandomForestClassifier' is not defined

In [123]:
from imblearn.over_sampling import SMOTENC

cat_features = [i for i in range(0,48)]
os = SMOTE(random_state = RANDOM_SEED)
os_data_X , os_data_y = os.fit_resample(X_train, y_train)
famd = FAMD(n_components = 107, n_iter = 3, random_state = 42)
X_train_os = famd.fit_transform(os_data_X)
clf_lr = LogisticRegression(random_state = RANDOM_SEED, solver = 'liblinear').fit(X_train_os, os_data_y.values.ravel())
X_test_transformed = famd.fit_transform(X_test)
train_preds = (clf_lr.predict_proba(X_test_transformed)[:,1] >= 0.8).astype(int)
train_preds = pd.DataFrame(train_preds, columns = ['val'])
train_preds = train_preds['val'].replace(to_replace = [0, 1], value = ['No', 'Yes'])

lr_f1 = f1_score(y_test, train_preds, average='macro')
lr_accuracy = accuracy_score(y_test, train_preds)
lr_precision = precision_score(y_test, train_preds, average='macro')
lr_recall = recall_score(y_test, train_preds, average='macro')
print("Accuracy Score = " + str(lr_accuracy))
print("Precision Score = " + str(lr_precision))
print("Recall Score = " + str(lr_recall))
print("F1 Score = " + str(lr_f1))

ValueError: could not convert string to float: 'Imputed Age 55 to 59'