In [1]:
import pandas as pd
import numpy as np
from prince import FAMD
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [3]:
# Read in DataFrame
my_list = list(range(100))
for x in tqdm(my_list):
    df = pd.read_csv('../NYSDOH_BRFSS_SurveyData_2020.csv', encoding = 'cp1252')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# View first 5 rows of the data

df.head(5)

In [4]:
# Use df_clean for cleaning
df_clean = df.copy()

# Repalce 'Not asked or Missing' and 'Data do not meet the criteria for statistical reliability, 
# data quality or confidentiality (data are suppressed)' with NA
for col in df.columns:
    df_clean[col].replace({'Not asked or Missing' : np.nan}, inplace = True)
    df_clean[col].replace({'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)' : np.nan}, inplace = True)
    
# Drop columns with over 80% missing values
df_clean.dropna(axis = 1, thresh = len(df_clean) * .50, inplace = True)

In [None]:
null_vals = df_clean.isnull().sum()
null_vals_desc = null_vals.sort_values(ascending=False)
null_vals_desc.to_csv('null_columns.csv')

In [None]:
# View first 5 rows of DataFrame after cleaning out missing values
df_clean.head(5)

In [None]:
# Select relevant columns related to heart disease by utilizing resources detailing factors of heart disease
# shorturl.at/oqwF5 - Behavioral risk factors of coronary artery disease: A paired matched case control study
# shorturl.at/cpAXZ - Strategies to prevent heart disease
# shorturl.at/gpwAR - Top five habits that harm the heart
# shorturl.at/mtJUZ - 9 Common Habits That Are Bad for Your Heart

list(df_clean.columns)

In [None]:
# Characteristics
# 1. SEXVAR - Sex - (Male or Female)
# 2. _IMPAGE - Age - (Age 65 or older, Age 55 - 64, Age 45 - 54, Age 35 - 44, Age 25 - 34, Age 18 - 24)
# 3. _IMPRACE - Race - (White, Non-Hispanic, Hispanic, Black, Non-Hispanic, Other race, Non-Hispanic, Asian, Non-Hispanic,
# American Indian/Alaskan Native, Non-Hispanic)
# 4. VETERAN3 - Former veteran status - (Yes, No, Refused, Don't know/Not sure)
# 5. WTKG3 - Weight in KG - (Continous value)
# 6. _IMPMRTL - Marital status - (Married, Never Married, Divorced, Widowed, A member of an unmarried couple, 
# Separated)
# 7. _RFBMI5 - Overweight or Obese - (Yes, No, Don’t know/Refused/Missing)


# Health 
# 8. HLTHPLN1 - Has Healthcare Coverage - (Yes, No, Don't know/Not sure, Refused)
# 9. ADDEPEV3 - Diagnosed with depression - (Yes, No, Don't know/Not sure, Refused)
# 10. DIABETE4 - Diagnosed with diabetes - (Yes, Yes, but female told only during pregnancy, 
# No, pre-diabetes or borderline diabetes, No, Don't know/Not sure, Refused)
# 11. RMVTETH4 - Number of teeth removed - All, 6 or more, but not all, 1 to 5, None, Don't know/Not sure, Refused
# 12. _PHYS14D - Number of days physical health not well - (Zero days when physical health not good,     
# 1-13 days when physical health not good, 14+ days when physical health not good, Don’t know/Refused/Missing)                 
# 13. _MENT14D - Number of days mental health not well - Zero days when mental health not good
# 1-13 days when mental health not good, 14+ days when mental health not good, Don’t know/Refused/Missing    
# 14. _TOTINDA - Physical activity - (Had physical activity or exercise, No physical activity or exercise in last 30 days,     
# Don’t know/Refused/Missing)       
# 15. PDIABTST - User has gotten a test for high blood sugar in past 3 years - (Yes, No, Don't know/Not sure, Refused)
# 16. PREDIAB1 - Diagnosed as prediabetic - Yes, Yes, during pregnancy, Don't know/Not Sure, Refused, No
# 17. _RFHLTH - General health - (Good or Better Health, Fair or Poor Health, Don’t know/Not Sure Or Refused/Missing)
# 18. BPHIGH4 - (Told they have high blood pressure - Yes, Told borderline high or pre-hypertensive, 
# Yes, but female told only during pregnancy, Don't Know/Not Sure Refused, No) 

# Lifestyle
# 19. CHECKUP1 - Length since last checkup - (Within past year (anytime less than 12 months ago), 
# Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago), 
# 5 or more years ago, Don’t know/Not sure, Never, Refused)
# 20. LASTDEN4 - Last visited dentist - (Within past year (anytime less than 12 months ago), 
# Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago), 
# 5 or more years ago, Don’t know/Not sure, Never, Refused)
# 21. FLUSHOT7 - Whether someone has taken the flu shot - (Yes, No, Don't know/Not sure, Refused)
# 22. _RFSEAT3 - Seatbeat wearing status - (Always Wear Seat Belt, Don’t Always Wear Seat Belt
# Don’t know/Not Sure Or Refused/Missing)

# Socioeconomic status
# 23. _IMPEDUC - Education - (College 4 years or more (College graduate), 
# College 1 year to 3 years (Some college or technical school), Grade 12 or GED (High school graduate), 
# Grades 9 through 11 (Some high school), Grades 1 through 8 (Elementary), Never attended school or only kindergarten)
# Grades 9 through 11 (Some high school), Grades 1 through 8 (Elementary), Never attended school or only kindergarten)
# 24. EMPLOY1 - 
# 25. _INCOMG - Income level - ($50,000 or more, Don’t know/Not sure/Missing, $15,000 to less than $25,000,   
# $35,000 to less than $50,000, $25,000 to less than $35,000, Less than $15,000)
# 26. _METSTAT - Whether they live in a metropolitan - (1, 2)

# Tobacco, Alcohol
# 27. USENOW3 - Use of smokeless tobacco - (Not at all, Some days, Every day, Refused, Don’t know/Not Sure) 
# 28. ECIGARET - E-ciggarette usage - (Yes, No, Don't know/Not sure, Refused)
# 29. _SMOKER3 - Smoking status - (Current smoker - now smokes every day, Current smoker - now smokes some days,
# Former smoker, Never smoked, Don’t know/Refused/Missing
# 30. _RFBING5 - Binge drinking status - (Yes, No, Don’t know/Refused/Missing)                

# Columns to keep - Response variable
# 31. CVDINFR4 - Ever diagnosed with heart attack - (Yes, No, Don't know/Not sure, Refused)
# 32. CVDCRHD4 - Ever diagnosed with angina/ coronary heart disease - (Yes, No, Don't know/Not sure, Refused)

# For now we will predict heart disease
df_clean_columns = df_clean[['_RACE',
'_RACEGR3',
'_RACEG21',
'_MRACE1',
'_HISPANC',
'_M_RACE',
'_SEX',
'_PRACE1',
'_RACEPRV',
'_AGE80',
'_AGEG5YR',
'_AGE65YR',
'_AGE_G',
'HTIN4',
'HTM4',
'_BMI5',
'_BMI5CAT',
'_INCOMG',
'_RFBMI5',
'_CHLDCNT',
'_SMOKER3',
'_EDUCAG',
'_STATE',
'_RFSMOK3',
'_RFPAP35',
'REGION',
'_AIDTST4',
'_CRCREC1',
'_SBONTIM',
'_VIRCOLN',
'_STOLDNA',
'_RFBLDS4',
'_SGMS10Y',
'_SGMSCPY',
'_CLNSCPY',
'_RFPSA23',
'_MAM5023',
'DRNKANY5',
'_RFMAM22',
'_DRNKDRV',
'_RFSEAT3',
'_RFSEAT2',
'_PNEUMO3',
'_FLSHOT7',
'_RFDRHV7',
'_DRNKWK1',
'_RFBING5',
'_ALTETH3',
'DROCDY3_',
'_DENVST3',
'_STSTR',
'_EXTETH3',
'SLEPTIM1',
'RMVTETH4',
'LASTDEN4',
'DIABETE4',
'CHCKDNY2',
'ADDEPEV3',
'HAVARTH4',
'CHCCOPD2',
'CHCOCNCR',
'CHCSCNCR',
'ASTHMA3',
'CVDSTRK3',
'CVDCRHD4',
'CVDINFR4',
'EXERANY2',
'HISPANC3',
'CHECKUP1',
'MEDCOST',
'PERSDOC2',
'HLTHPLN1',
'MENTHLTH',
'PHYSHLTH',
'GENHLTH',
'SEXVAR',
'_PSU',
'SEQNO',
'IYEAR',
'IDAY',
'IMONTH',
'AGE',
'MRACE1',
'_DRDXAR2',
'_IMPAGE',
'_ASTHMS1',
'_CASTHM1',
'_LTASTH1',
'_TOTINDA',
'_HCVU651',
'_MENT14D',
'_PHYS14D',
'_RFHLTH',
'_LLCPWT',
'_IMPHOME',
'_IMPMRTL',
'_IMPEDUC',
'_IMPRACE',
'_IMPSEX',
'MARITAL',
'IDATE',
'MSCODE',
'_URBSTAT',
'_METSTAT',
'_URBNRRL',
'QSTLANG',
'QSTVER',
'CNCRTYP1',
'CNCRAGE',
'FALLINJ4',
'FALL12MN',
'RENTHOM1',
'EDUCA',
'DSRIPREG']]


In [None]:
# View non-null count 
df_clean_columns.info()

In [5]:
# Encode categorical values 
cols = list(df_clean_columns.columns)
for col in cols:
    if str(df_clean_columns[col].dtype) == 'object':
        df_clean_columns[col] = df_clean_columns[col].astype('category').cat.codes

NameError: name 'df_clean_columns' is not defined

In [9]:
'''
import seaborn as sns

import matplotlib.pyplot as plt

# taking all rows but only 6 columns
df_small = df.iloc[:,:6]

correlation_mat = df_small.corr()

sns.heatmap(correlation_mat, annot = True)

plt.show()
'''
df_clean_corr = df_clean_columns.corrwith(df_clean_columns["CVDCRHD4"])

In [14]:
df_clean_corr.sort_values(inplace=True, ascending=False)
df_clean_corr[2:30]

CVDCRHD4    1.000000
CVDINFR4    0.402811
_AGE80      0.170806
_PNEUMO3    0.170164
_FLSHOT7    0.160714
_IMPAGE     0.157250
_AGE_G      0.157250
_AGEG5YR    0.155888
DIABETE4    0.144073
CHCCOPD2    0.142714
HAVARTH4    0.142212
AGE         0.138907
FALL12MN    0.135479
CVDSTRK3    0.131904
_ALTETH3    0.126244
_AGE65YR    0.123863
FALLINJ4    0.116874
CHCKDNY2    0.114548
_RFPSA23    0.087866
CHCSCNCR    0.079083
PHYSHLTH    0.075435
ASTHMA3     0.065752
SEXVAR      0.064236
_IMPSEX     0.064236
_SEX        0.064236
MSCODE      0.062817
_CASTHM1    0.056548
CHCOCNCR    0.056530
CHECKUP1    0.056285
_LTASTH1    0.055120
dtype: float64

In [None]:
from sklearn.model_selection import train_test_split 

# Drop all missing values
df_cleaned = df_clean_columns.dropna(axis = 0).reset_index(drop = True)
print(df_cleaned['CVDCRHD4'])

# Drop all rows that are Don't know/Not sure or Refused for column we are predicting
df_cleaned = df_cleaned.loc[(df_cleaned['CVDCRHD4'] == 'No') | (df_cleaned['CVDCRHD4'] == 'Yes')]

# Split into X and y
X = df_cleaned.loc[:, df_cleaned.columns != 'CVDCRHD4']
y = df_cleaned['CVDCRHD4']

# Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3,random_state = 42)

In [None]:
null_vals = X_train.isnull().sum()
null_vals_desc = null_vals.sort_values(ascending=False)
null_vals_desc

In [None]:
famd = FAMD(n_components = 70, n_iter = 3, random_state = 42)
X_train_transformed = famd.fit_transform(X_train)
X_train_transformed.head()

In [None]:
RANDOM_SEED = 694

#Run dummy classifier to find the baseline performance
dummy_clf = DummyClassifier(strategy= 'most_frequent').fit(famd.fit_transform(X_train),y_train)
y_pred = dummy_clf.predict(famd.fit_transform(X_test))

In [None]:
#helper function
def get_performance_scores(y_pred, y_true):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    return [f1, accuracy, precision, recall]

def print_performance_scores(scores):
    print("Accuracy Score = " + str(scores[1]))
    print("Precision Score = " + str(scores[2]))
    print("Recall Score = " + str(scores[3]))
    print("F1 Score = " + str(scores[0]))

In [None]:
#Find the performance of the model
dummy_f1 = f1_score(y_test, y_pred, average='macro')
dummy_accuracy = accuracy_score(y_test, y_pred)
dummy_precision = precision_score(y_test, y_pred, average='macro')
dummy_recall = recall_score(y_test, y_pred, average='macro')
print("Accuracy Score = " + str(dummy_accuracy))
print("Precision Score = " + str(dummy_precision))
print("Recall Score = " + str(dummy_recall))
print("F1 Score = " + str(dummy_f1))

In [None]:
clf_lr = LogisticRegression(random_state = RANDOM_SEED).fit(X_train_transformed, y_train)

In [None]:
X_test_transformed = famd.fit_transform(X_test)
train_preds = clf_lr.predict(X_test_transformed)

In [None]:
train_preds

In [None]:
lr_f1 = f1_score(y_test, train_preds, average='macro')
lr_accuracy = accuracy_score(y_test, train_preds)
lr_precision = precision_score(y_test, train_preds, average='macro')
lr_recall = recall_score(y_test, train_preds, average='macro')
print("Accuracy Score = " + str(lr_accuracy))
print("Precision Score = " + str(lr_precision))
print("Recall Score = " + str(lr_recall))
print("F1 Score = " + str(lr_f1))

In [None]:
#Do grid search for hyperparameter tuning
clf = LogisticRegression(random_state = RANDOM_SEED, solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1_macro')
my_list = list(range(100))
for x in tqdm(my_list):
    grid_clf_acc.fit(X_train_transformed, y_train)

y_pred_acc = grid_clf_acc.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred_acc, y_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train_transformed, y_train)

grid_values = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
 }

rand_search_clf = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_values, n_iter = 1, cv = 3, verbose=2, random_state=RANDOM_SEED, n_jobs = -1)

my_list = list(range(100))
for x in tqdm(my_list):
    rand_search_clf.fit(X_train_transformed[0:10000], y_train[0:10000])


y_pred_rf = rand_search_clf.predict(X_test_transformed)
   


print_performance_scores(get_performance_scores(y_pred_rf, y_test))

In [None]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_SEED )
random_forest.fit(X_train_transformed, y_train)

y_pred = random_forest.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred, y_test))