In [18]:
#Step 1: Import all necessary libraries
import pandas as pd
import numpy as np
from prince import FAMD, MCA
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [19]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [20]:
#Step 2: Read in DataFrame
my_list = list(range(100))
for x in tqdm(my_list):
    df = pd.read_csv('../NYSDOH_BRFSS_SurveyData_2020.csv', encoding = 'cp1252')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# View first 5 rows of the data

df.head(5)

In [22]:
#Step 3: Use df_clean for cleaning
df_clean = df.copy()

# Repalce 'Not asked or Missing' and 'Data do not meet the criteria for statistical reliability, 
# data quality or confidentiality (data are suppressed)' with NA
for col in df.columns:
    df_clean[col].replace({'Not asked or Missing' : np.nan}, inplace = True)
    df_clean[col].replace({'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)' : np.nan}, inplace = True)
    
# Drop columns with over 80% missing values
df_clean.dropna(axis = 1, thresh = len(df_clean) * .50, inplace = True)

In [23]:
# Step 4: Replace other values in the target variable like 'Don't know/not sure' and 'Refused' 
df_clean['CVDCRHD4'].replace('Don’t know/Not sure', np.nan, inplace=True)
df_clean['CVDCRHD4'].replace('Refused', np.nan, inplace=True)
df_clean.dropna(subset=['CVDCRHD4'], inplace=True)

In [24]:
# Step 5: Encode categorical variables as numeric to do SMOTE and calculate correlations
df_clean_categorical = df_clean.copy()
cols = list(df_clean_categorical.columns)
for col in cols:
    if str(df_clean_categorical[col].dtype) == 'object':
        df_clean_categorical[col] = df_clean_categorical[col].astype('category').cat.codes

df_clean.head()

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,CELLSEX,SEXVAR,GENHLTH,...,WTRSOURCE,STRSMEAL,FRUITVEG,MJUSE30,MJNYSMMP,HEPCTEST,HEPCTOLD,_WT2SPLITS,REGION,DSRIPREG
0,New York,1152020,1,15,2020,2020002528,2020002528,,Male,Very good,...,Public Water Supply,Always,,,No,No,No,5130.843243,NYS exclusive of NYC,Long Island
1,New York,1302020,1,30,2020,2020002529,2020002529,,Male,Very good,...,Public Water Supply,Never,,,No,No,No,941.550458,NYS exclusive of NYC,Long Island
2,New York,1152020,1,15,2020,2020002530,2020002530,,Female,Good,...,Public Water Supply,Never,,,No,No,No,3497.366203,NYS exclusive of NYC,Long Island
3,New York,2032020,2,3,2020,2020004509,2020004509,,Female,Good,...,Public Water Supply,Never,Neighborhood,,No,,,1187.70903,NYS exclusive of NYC,Long Island
4,New York,2152020,2,15,2020,2020002531,2020002531,,Male,Fair,...,Don't Know/Not Sure,Rarely,,,No,No,No,13364.387863,NYS exclusive of NYC,Long Island


In [25]:
#Step 6: Apply SMOTE to increase the number of row with people responding 'No' to the target variable
df_clean_categorical_nonans = df_clean_categorical.apply(lambda x:x.fillna(x.value_counts().index[0]))
X = df_clean_categorical_nonans.loc[:, df_clean.columns != 'CVDCRHD4']
y = df_clean_categorical_nonans['CVDCRHD4']
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [26]:
print(Counter(y))
df_smote = pd.DataFrame(X)
df_smote['CVDCRHD4'] = y
df_smote.head()

Counter({0: 13950, 1: 13950})


Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,CELLSEX,SEXVAR,GENHLTH,...,STRSMEAL,FRUITVEG,MJUSE30,MJNYSMMP,HEPCTEST,HEPCTOLD,_WT2SPLITS,REGION,DSRIPREG,CVDCRHD4
0,0,1152020,1,15,2020,2020002528,2020002528,-1,1,6,...,0,-1,1,1,1,1,5130.843243,0,3,0
1,0,1302020,1,30,2020,2020002529,2020002529,-1,1,6,...,2,-1,1,1,1,1,941.550458,0,3,0
2,0,1152020,1,15,2020,2020002530,2020002530,-1,0,3,...,2,-1,1,1,1,1,3497.366203,0,3,0
3,0,2032020,2,3,2020,2020004509,2020004509,-1,0,3,...,2,1,1,1,-1,-1,1187.70903,0,3,0
4,0,2152020,2,15,2020,2020002531,2020002531,-1,1,2,...,4,-1,1,1,1,1,13364.387863,0,3,0


In [27]:
#Step 7: Create correlation matrix to find which features to use for mca
df_clean_corr = df_smote.corrwith(df_smote["CVDCRHD4"])
df_clean_corr_abs = df_clean_corr.abs()
df_clean_corr_abs.sort_values(inplace=True, ascending=False)
df_clean_corr_abs

CVDCRHD4    1.000000
_MICHD      0.969759
_RFHLTH     0.533738
_DRDXAR2    0.467417
CVDINFR4    0.446192
              ...   
MARITAL     0.006768
_RFPSA23    0.005579
HISPANC3    0.004975
_HISPANC    0.001590
_STATE           NaN
Length: 164, dtype: float64

In [28]:
#Only use the top 30 columns, excluding columns like _MICHD, CVDINFR4, which ask about heart attack and coronary disease which is very close to heart disease.
feature_list = list(df_clean_corr_abs[0:35].keys())
feature_list.remove('CVDINFR4')
feature_list.remove('_MICHD')
feature_list

['CVDCRHD4',
 '_RFHLTH',
 '_DRDXAR2',
 '_AGE80',
 '_HCVU651',
 '_EXTETH3',
 '_AGEG5YR',
 '_AGE_G',
 '_IMPAGE',
 '_STOLDNA',
 'STOOLDNA',
 'AGE',
 '_RFBLDS4',
 'COLNSCPY',
 'BLDSTOL1',
 'SHINGLE2',
 '_PHYS14D',
 'VIRCOLON',
 'REGION',
 '_SGMS10Y',
 '_PNEUMO3',
 'MEDICARE',
 '_RACEG21',
 '_SGMSCPY',
 'EMPLOY1',
 'RMVTETH4',
 '_FLSHOT7',
 '_SBONTIM',
 '_CRCREC1',
 'SIGMSCPY',
 '_IMPHOME',
 '_DENVST3',
 '_MAM5023']

In [29]:
df_clean_columns = df_smote[feature_list]
df_clean_columns.head()

Unnamed: 0,CVDCRHD4,_RFHLTH,_DRDXAR2,_AGE80,_HCVU651,_EXTETH3,_AGEG5YR,_AGE_G,_IMPAGE,_STOLDNA,...,_SGMSCPY,EMPLOY1,RMVTETH4,_FLSHOT7,_SBONTIM,_CRCREC1,SIGMSCPY,_IMPHOME,_DENVST3,_MAM5023
0,0,2,2,9,1,0,9,5,5,2,...,2,2,0,3,3,1,1,1,2,1
1,0,2,2,7,2,2,7,4,4,2,...,2,2,4,0,3,1,1,1,2,1
2,0,2,0,12,1,0,12,5,5,3,...,3,6,0,3,2,3,1,1,2,1
3,0,2,0,12,1,2,12,5,5,3,...,3,6,4,3,2,3,1,1,2,1
4,0,1,2,4,1,2,4,2,2,3,...,3,7,4,0,2,3,-1,2,2,1


In [31]:
from sklearn.model_selection import train_test_split 

# Drop all missing values
df_cleaned = df_clean_columns.dropna(axis = 0).reset_index(drop = True)
print(df_cleaned['CVDCRHD4'])

# Drop all rows that are Don't know/Not sure or Refused for column we are predicting
df_cleaned = df_cleaned.loc[(df_cleaned['CVDCRHD4'] == 0) | (df_cleaned['CVDCRHD4'] == 1)]

# Split into X and y
X = df_cleaned.loc[:, df_cleaned.columns != 'CVDCRHD4']
y = df_cleaned['CVDCRHD4']

# Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3,random_state = 42)

0        0
1        0
2        0
3        0
4        0
        ..
27895    1
27896    1
27897    1
27898    1
27899    1
Name: CVDCRHD4, Length: 27900, dtype: int8


In [32]:
'''famd = FAMD(n_components = 70, n_iter = 3, random_state = 42)
X_train_transformed = famd.fit_transform(X_train)
X_train_transformed.head()'''

mca = MCA( n_components=2, n_iter=3, copy=True, check_input=True,engine='auto', random_state=42)
mca = mca.fit(X)
X_train_transformed = mca.fit_transform(X_train)
X_test_transformed = mca.fit_transform(X_test)

ValueError: All values in X should be positive

In [16]:
#helper function
def get_performance_scores(y_pred, y_true):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    return [f1, accuracy, precision, recall]

def print_performance_scores(scores):
    print("Accuracy Score = " + str(scores[1]))
    print("Precision Score = " + str(scores[2]))
    print("Recall Score = " + str(scores[3]))
    print("F1 Score = " + str(scores[0]))

In [17]:
RANDOM_SEED = 694
#Run dummy classifier to find the baseline performance
dummy_clf = DummyClassifier(strategy= 'most_frequent').fit(X_train_transformed,y_train)
y_pred = dummy_clf.predict(X_test_transformed)

Counter({'No': 4629, 'Yes': 371})


In [None]:
clf_lr = LogisticRegression(random_state = RANDOM_SEED).fit(X_train_transformed, y_train)
train_preds = clf_lr.predict(X_test_transformed)
print_performance_scores(get_performance_scores(train_preds, y_test))

In [None]:
#Do grid search for hyperparameter tuning
clf = LogisticRegression(random_state = RANDOM_SEED, solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1_macro')
my_list = list(range(100))
for x in tqdm(my_list):
    grid_clf_acc.fit(X_train_transformed, y_train)

y_pred_acc = grid_clf_acc.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred_acc, y_test))

In [None]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_SEED )
random_forest.fit(X_train_transformed, y_train)

y_pred = random_forest.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred, y_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train_transformed, y_train)

grid_values = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
 }

rand_search_clf = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_values, n_iter = 1, cv = 3, verbose=2, random_state=RANDOM_SEED, n_jobs = -1)

my_list = list(range(100))
for x in tqdm(my_list):
    rand_search_clf.fit(X_train_transformed[0:10000], y_train[0:10000])


y_pred_rf = rand_search_clf.predict(X_test_transformed)
   


print_performance_scores(get_performance_scores(y_pred_rf, y_test))