In [1]:
#Step 1: Import all necessary libraries
import pandas as pd
import numpy as np
from prince import FAMD, MCA
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.utils import resample
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [3]:
#Step 2: Read in DataFrame
my_list = list(range(100))
for x in tqdm(my_list):
    df = pd.read_csv('../../NYSDOH_BRFSS_SurveyData_2020.csv', encoding = 'cp1252')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# View first 5 rows of the data

df.head(5)

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,COLGSEX,LANDSEX,RESPSLCT,...,PPS_40,PPS_43,PPS_44,PPS_45,PPS_46,PPS_48,PPS_52,SEXNOCON_OTH,NOVEGFRU_OTH,CHILDREN
0,New York,1152020,1,15,2020,2020002528,2020002528,Not asked or Missing,Not asked or Missing,Male,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
1,New York,1302020,1,30,2020,2020002529,2020002529,Not asked or Missing,Not asked or Missing,Male,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
2,New York,1152020,1,15,2020,2020002530,2020002530,Not asked or Missing,Female,Not asked or Missing,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
3,New York,2032020,2,3,2020,2020004509,2020004509,Not asked or Missing,Not asked or Missing,Female,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
4,New York,2152020,2,15,2020,2020002531,2020002531,Not asked or Missing,Not asked or Missing,Female,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing


In [5]:
#Step 3: Use df_clean for cleaning
df_clean = df.copy()

# Repalce 'Not asked or Missing' and 'Data do not meet the criteria for statistical reliability, 
# data quality or confidentiality (data are suppressed)' with NA
for col in df.columns:
    df_clean[col].replace({'Not asked or Missing' : np.nan}, inplace = True)
    df_clean[col].replace({'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)' : np.nan}, inplace = True)
    
# Drop columns with over 80% missing values
df_clean.dropna(axis = 1, thresh = len(df_clean) * .50, inplace = True)

In [6]:
# Step 4: Replace other values in the target variable like 'Don't know/not sure' and 'Refused' 
df_clean['CVDCRHD4'].replace('Don’t know/Not sure', np.nan, inplace=True)
df_clean['CVDCRHD4'].replace('Refused', np.nan, inplace=True)
df_clean.dropna(subset=['CVDCRHD4'], inplace=True)

In [7]:
# Step 5: Encode categorical variables as numeric to calculate correlations
df_clean_categorical = df_clean.copy()
cols = list(df_clean_categorical.columns)
for col in cols:
    if str(df_clean_categorical[col].dtype) == 'object':
        df_clean_categorical[col] = df_clean_categorical[col].astype('category').cat.codes

df_clean.head()

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,CELLSEX,SEXVAR,GENHLTH,...,WTRSOURCE,STRSMEAL,FRUITVEG,MJUSE30,MJNYSMMP,HEPCTEST,HEPCTOLD,_WT2SPLITS,REGION,DSRIPREG
0,New York,1152020,1,15,2020,2020002528,2020002528,,Male,Very good,...,Public Water Supply,Always,,,No,No,No,5130.843243,NYS exclusive of NYC,Long Island
1,New York,1302020,1,30,2020,2020002529,2020002529,,Male,Very good,...,Public Water Supply,Never,,,No,No,No,941.550458,NYS exclusive of NYC,Long Island
2,New York,1152020,1,15,2020,2020002530,2020002530,,Female,Good,...,Public Water Supply,Never,,,No,No,No,3497.366203,NYS exclusive of NYC,Long Island
3,New York,2032020,2,3,2020,2020004509,2020004509,,Female,Good,...,Public Water Supply,Never,Neighborhood,,No,,,1187.70903,NYS exclusive of NYC,Long Island
4,New York,2152020,2,15,2020,2020002531,2020002531,,Male,Fair,...,Don't Know/Not Sure,Rarely,,,No,No,No,13364.387863,NYS exclusive of NYC,Long Island


In [8]:
#Step 7: Create correlation matrix to find which features to use for mca
df_clean_corr = df_clean_categorical.corrwith(df_clean_categorical["CVDCRHD4"])
df_clean_corr_abs = df_clean_corr.abs()
df_clean_corr_abs.sort_values(inplace=True, ascending=False)
df_clean_corr_abs

CVDCRHD4    1.000000
_MICHD      0.772506
CVDINFR4    0.444813
_AGE80      0.186988
_RFHLTH     0.186002
              ...   
_MENT14D    0.003757
HEIGHT3     0.003444
HISPANC3    0.001667
IDAY        0.000190
_STATE           NaN
Length: 164, dtype: float64

In [9]:
#Only use the top 30 columns, excluding columns like _MICHD, CVDINFR4, which ask about heart attack and coronary disease which is very close to heart disease.
feature_list = list(df_clean_corr_abs[0:50].keys())
feature_list.remove('CVDINFR4')
feature_list.remove('_MICHD')
feature_list

['CVDCRHD4',
 '_AGE80',
 '_RFHLTH',
 '_PNEUMO3',
 '_FLSHOT7',
 '_IMPAGE',
 '_AGE_G',
 '_AGEG5YR',
 'DIFFWALK',
 'DIABETE4',
 'CHCCOPD2',
 '_DRDXAR2',
 'HAVARTH4',
 'MEDICARE',
 'SHINGLE2',
 'AGE',
 'BLDSTOL1',
 'FALL12MN',
 'CVDSTRK3',
 '_ALTETH3',
 'EMPLOY1',
 'STOOLDNA',
 'PNEUVAC4',
 'COLNSCPY',
 'SIGMSCPY',
 '_AGE65YR',
 'VIRCOLON',
 'FALLINJ4',
 'CHCKDNY2',
 'HLTHCVR1',
 'BPHIGH4',
 'VETERAN3',
 '_EXTETH3',
 '_RFPSA23',
 '_HCVU651',
 'PHYSHLTH',
 'SMOKE100',
 'DEAF',
 'DIFFDRES',
 '_PHYS14D',
 'CHCSCNCR',
 'DIFFALON',
 '_STSTR',
 'BIRTHSEX',
 'QSTVER',
 'RMVTETH4',
 '_IMPSEX',
 'SEXVAR']

In [10]:
df_clean_columns = df_clean[feature_list]
df_clean_columns.head()

Unnamed: 0,CVDCRHD4,_AGE80,_RFHLTH,_PNEUMO3,_FLSHOT7,_IMPAGE,_AGE_G,_AGEG5YR,DIFFWALK,DIABETE4,...,DIFFDRES,_PHYS14D,CHCSCNCR,DIFFALON,_STSTR,BIRTHSEX,QSTVER,RMVTETH4,_IMPSEX,SEXVAR
0,No,Imputed Age 65 to 69,Good or Better Health,No,Yes,Age 65 or older,Age 65 or older,Age 65 to 69,No,No,...,No,Zero days when physical health not good,No,No,361011,Male,12,1 to 5,Male,Male
1,No,Imputed Age 55 to 59,Good or Better Health,Age Less Than 65,Age Less Than 65,Age 55 to 64,Age 55 to 64,Age 55 to 59,No,No,...,No,Zero days when physical health not good,No,No,361011,Male,12,,Male,Male
2,No,Imputed Age 80 or older,Good or Better Health,Yes,Yes,Age 65 or older,Age 65 or older,Age 80 or older,Yes,Yes,...,No,Zero days when physical health not good,No,Yes,361012,Female,12,1 to 5,Female,Female
3,No,Imputed Age 80 or older,Good or Better Health,Yes,Yes,Age 65 or older,Age 65 or older,Age 80 or older,Yes,No,...,No,Zero days when physical health not good,No,No,361011,Female,13,,Female,Female
4,No,Imputed Age 40 to 44,Fair or Poor Health,Age Less Than 65,Age Less Than 65,Age 35 to 44,Age 35 to 44,Age 40 to 44,No,Yes,...,No,1-13 days when physical health not good,Yes,Yes,361012,Male,12,,Male,Male


In [11]:
from sklearn.model_selection import train_test_split 

# Drop all missing values
df_cleaned = df_clean_columns.dropna(axis = 0).reset_index(drop = True)
print(df_cleaned['CVDCRHD4'])

# Drop all rows that are Don't know/Not sure or Refused for column we are predicting
df_cleaned = df_cleaned.loc[(df_cleaned['CVDCRHD4'] == 'No') | (df_cleaned['CVDCRHD4'] == 'Yes')]

# Split into X and y
X = df_cleaned.loc[:, df_cleaned.columns != 'CVDCRHD4']
y = df_cleaned['CVDCRHD4']

# Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3,random_state = 42)

0       No
1       No
2       No
3       No
4       No
        ..
4833    No
4834    No
4835    No
4836    No
4837    No
Name: CVDCRHD4, Length: 4838, dtype: object


In [12]:
#create two different dataframe of majority and minority class 
training_data = pd.DataFrame(X_train)
training_data['CVDCRHD4'] = y_train
df_majority = training_data[(training_data['CVDCRHD4']=='No')] 
df_minority = training_data[(training_data['CVDCRHD4']=='Yes')] 
# upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples= len(df_majority), # to match majority class
                                 random_state=42)  # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
X_train_upsampled = df_upsampled.loc[:, df_cleaned.columns != 'CVDCRHD4']
y_train_upsampled = df_upsampled['CVDCRHD4']

In [13]:
'''famd = FAMD(n_components = 70, n_iter = 3, random_state = 42)
X_train_transformed = famd.fit_transform(X_train)
X_train_transformed.head()'''

mca = MCA( n_components=2, n_iter=3, copy=True, check_input=True,engine='auto', random_state=42)
mca = mca.fit(X_train)
X_train_transformed = mca.fit_transform(X_train_upsampled)
X_test_transformed = mca.fit_transform(X_test)

In [14]:
#helper function
def get_performance_scores(y_pred, y_true):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    return [f1, accuracy, precision, recall]

def print_performance_scores(scores):
    print("Accuracy Score = " + str(scores[1]))
    print("Precision Score = " + str(scores[2]))
    print("Recall Score = " + str(scores[3]))
    print("F1 Score = " + str(scores[0]))

In [15]:
RANDOM_SEED = 694
#Run dummy classifier to find the baseline performance
dummy_clf = DummyClassifier(strategy= 'most_frequent').fit(X_train_transformed,y_train_upsampled)
y_pred = dummy_clf.predict(X_test_transformed)
print_performance_scores(get_performance_scores(y_pred, y_test))

Accuracy Score = 0.9090909090909091
Precision Score = 0.45454545454545453
Recall Score = 0.5
F1 Score = 0.47619047619047616


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
clf_lr = LogisticRegression(random_state = RANDOM_SEED).fit(X_train_transformed, y_train_upsampled)
train_preds = clf_lr.predict(X_test_transformed)
print_performance_scores(get_performance_scores(train_preds, y_test))

Accuracy Score = 0.4889807162534435
Precision Score = 0.5298452553401853
Recall Score = 0.5893939393939394
F1 Score = 0.4131320064058568


In [33]:
#Do grid search for hyperparameter tuning
clf = LogisticRegression(random_state = RANDOM_SEED, solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1_macro')
my_list = list(range(100))
for x in tqdm(my_list):
    grid_clf_acc.fit(X_train_transformed, y_train_upsampled)

print(grid_clf_acc.best_estimator_)
print(grid_clf_acc.best_params_)
print(grid_clf_acc.best_score_)
y_pred_acc = grid_clf_acc.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred_acc, y_test))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

LogisticRegression(C=25, penalty='l1', random_state=694, solver='liblinear')
{'C': 25, 'penalty': 'l1'}
0.6608194027753316
Accuracy Score = 0.49793388429752067
Precision Score = 0.5302389911172964
Recall Score = 0.5909090909090909
F1 Score = 0.4183814257102589


In [40]:
clf_lr = LogisticRegression(random_state = RANDOM_SEED, C=25, penalty='l1', solver='liblinear', class_weight="balanced").fit(X_train_transformed, y_train_upsampled)
train_preds = clf_lr.predict(X_test_transformed)
print_performance_scores(get_performance_scores(train_preds, y_test))

Accuracy Score = 0.49793388429752067
Precision Score = 0.5302389911172964
Recall Score = 0.5909090909090909
F1 Score = 0.4183814257102589


In [31]:

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(random_state=RANDOM_SEED),param_grid,refit=True,verbose=2, scoring='f1_micro')

my_list = list(range(100))
for x in tqdm(my_list):
    grid.fit(X_train_transformed,y_train_upsampled)

print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)
'''
svm = SVC(C=100, gamma=1, kernel='rbf').fit(X_train_transformed,y_train_upsampled)

y_pred_acc = svm.predict(X_test_transformed)
'''
#print_performance_scores(get_performance_scores(y_pred_acc, y_test))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.1s


KeyboardInterrupt: 

In [30]:
svm = SVC(C=100, gamma=1, kernel='rbf').fit(X_train_transformed,y_train_upsampled)

y_pred_acc = svm.predict(X_test_transformed)
print_performance_scores(get_performance_scores(y_pred_acc, y_test))

Accuracy Score = 0.48829201101928377
Precision Score = 0.5297270516169714
Recall Score = 0.5890151515151515
F1 Score = 0.412674761264177


In [23]:
print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)

SVC(C=100, gamma=1, random_state=694)
{'C': 100, 'gamma': 1, 'kernel': 'rbf'}
0.622967366203974


In [None]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_SEED )
random_forest.fit(X_train_transformed, y_train_upsampled)

y_pred = random_forest.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred, y_test))

Accuracy Score = 0.7878787878787878
Precision Score = 0.5702094828392407
Recall Score = 0.6378787878787878
F1 Score = 0.577989823803844


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier().fit(X_train_transformed, y_train_upsampled)

grid_values = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
 }

rand_search_clf = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_values, n_iter = 10, cv = 3, verbose=2, random_state=RANDOM_SEED, n_jobs = -1)

my_list = list(range(100))
for x in tqdm(my_list):
    rand_search_clf.fit(X_train_transformed[0:10000], y_train_upsampled[0:10000])


y_pred_rf = rand_search_clf.predict(X_test_transformed)
   


print_performance_scores(get_performance_scores(y_pred_rf, y_test))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi