In [29]:
import pandas as pd
import numpy as np
from prince import FAMD
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [30]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [31]:
# Read in DataFrame
df = pd.read_csv('../NYSDOH_BRFSS_SurveyData_2020.csv', encoding = 'cp1252')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [32]:
# View first 5 rows of the data

df.head(5)

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,COLGSEX,LANDSEX,RESPSLCT,...,PPS_40,PPS_43,PPS_44,PPS_45,PPS_46,PPS_48,PPS_52,SEXNOCON_OTH,NOVEGFRU_OTH,CHILDREN
0,New York,1152020,1,15,2020,2020002528,2020002528,Not asked or Missing,Not asked or Missing,Male,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
1,New York,1302020,1,30,2020,2020002529,2020002529,Not asked or Missing,Not asked or Missing,Male,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
2,New York,1152020,1,15,2020,2020002530,2020002530,Not asked or Missing,Female,Not asked or Missing,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
3,New York,2032020,2,3,2020,2020004509,2020004509,Not asked or Missing,Not asked or Missing,Female,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
4,New York,2152020,2,15,2020,2020002531,2020002531,Not asked or Missing,Not asked or Missing,Female,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing


In [33]:
# Use df_clean for cleaning
df_clean = df.copy()

# Repalce 'Not asked or Missing' and 'Data do not meet the criteria for statistical reliability, 
# data quality or confidentiality (data are suppressed)' with NA
for col in df.columns:
    df_clean[col].replace({'Not asked or Missing' : np.nan}, inplace = True)
    df_clean[col].replace({'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)' : np.nan}, inplace = True)
    
# Drop columns with over 80% missing values
df_clean.dropna(axis = 1, thresh = len(df_clean) * .50, inplace = True)

In [34]:
# View first 5 rows of DataFrame after cleaning out missing values
df_clean.head(5)

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,CELLSEX,SEXVAR,GENHLTH,...,WTRSOURCE,STRSMEAL,FRUITVEG,MJUSE30,MJNYSMMP,HEPCTEST,HEPCTOLD,_WT2SPLITS,REGION,DSRIPREG
0,New York,1152020,1,15,2020,2020002528,2020002528,,Male,Very good,...,Public Water Supply,Always,,,No,No,No,5130.843243,NYS exclusive of NYC,Long Island
1,New York,1302020,1,30,2020,2020002529,2020002529,,Male,Very good,...,Public Water Supply,Never,,,No,No,No,941.550458,NYS exclusive of NYC,Long Island
2,New York,1152020,1,15,2020,2020002530,2020002530,,Female,Good,...,Public Water Supply,Never,,,No,No,No,3497.366203,NYS exclusive of NYC,Long Island
3,New York,2032020,2,3,2020,2020004509,2020004509,,Female,Good,...,Public Water Supply,Never,Neighborhood,,No,,,1187.70903,NYS exclusive of NYC,Long Island
4,New York,2152020,2,15,2020,2020002531,2020002531,,Male,Fair,...,Don't Know/Not Sure,Rarely,,,No,No,No,13364.387863,NYS exclusive of NYC,Long Island


In [35]:
# Select relevant columns related to heart disease by utilizing resources detailing factors of heart disease
# shorturl.at/oqwF5 - Behavioral risk factors of coronary artery disease: A paired matched case control study
# shorturl.at/cpAXZ - Strategies to prevent heart disease
# shorturl.at/gpwAR - Top five habits that harm the heart
# shorturl.at/mtJUZ - 9 Common Habits That Are Bad for Your Heart

list(df_clean.columns)

['_STATE',
 'IDATE',
 'IMONTH',
 'IDAY',
 'IYEAR',
 'SEQNO',
 '_PSU',
 'CELLSEX',
 'SEXVAR',
 'GENHLTH',
 'PHYSHLTH',
 'MENTHLTH',
 'HLTHPLN1',
 'PERSDOC2',
 'MEDCOST',
 'CHECKUP1',
 'EXERANY2',
 'SLEPTIM1',
 'CVDINFR4',
 'CVDCRHD4',
 'CVDSTRK3',
 'ASTHMA3',
 'CHCSCNCR',
 'CHCOCNCR',
 'CHCCOPD2',
 'HAVARTH4',
 'ADDEPEV3',
 'CHCKDNY2',
 'DIABETE4',
 'LASTDEN4',
 'RMVTETH4',
 'AGE',
 'HISPANC3',
 'MRACE1',
 'MARITAL',
 'EDUCA',
 'RENTHOM1',
 'VETERAN3',
 'EMPLOY1',
 'INCOME2',
 'WEIGHT2',
 'HEIGHT3',
 'DEAF',
 'BLIND',
 'DECIDE',
 'DIFFWALK',
 'DIFFDRES',
 'DIFFALON',
 'SMOKE100',
 'USENOW3',
 'ALCDAY5',
 'AVEDRNK3',
 'DRNK3GE5',
 'MAXDRNKS',
 'FLUSHOT7',
 'SHINGLE2',
 'PNEUVAC4',
 'FALL12MN',
 'FALLINJ4',
 'SEATBELT',
 'DRNKDRI2',
 'COLNSCPY',
 'SIGMSCPY',
 'BLDSTOL1',
 'STOOLDNA',
 'VIRCOLON',
 'HIVTST7',
 'HIVRISK5',
 'PDIABTST',
 'PREDIAB1',
 'ECIGARET',
 'CNCRAGE',
 'CNCRTYP1',
 'BIRTHSEX',
 'SOFEMALE',
 'ADHISPA',
 'QSTVER',
 'QSTLANG',
 '_URBNRRL',
 '_METSTAT',
 '_URBSTAT',
 'MSCO

In [36]:
# Characteristics
# 1. SEXVAR - Sex - (Male or Female)
# 2. _IMPAGE - Age - (Age 65 or older, Age 55 - 64, Age 45 - 54, Age 35 - 44, Age 25 - 34, Age 18 - 24)
# 3. _IMPRACE - Race - (White, Non-Hispanic, Hispanic, Black, Non-Hispanic, Other race, Non-Hispanic, Asian, Non-Hispanic,
# American Indian/Alaskan Native, Non-Hispanic)
# 4. VETERAN3 - Former veteran status - (Yes, No, Refused, Don't know/Not sure)
# 5. WTKG3 - Weight in KG - (Continous value)
# 6. _IMPMRTL - Marital status - (Married, Never Married, Divorced, Widowed, A member of an unmarried couple, 
# Separated)
# 7. _RFBMI5 - Overweight or Obese - (Yes, No, Don’t know/Refused/Missing)


# Health 
# 8. HLTHPLN1 - Has Healthcare Coverage - (Yes, No, Don't know/Not sure, Refused)
# 9. ADDEPEV3 - Diagnosed with depression - (Yes, No, Don't know/Not sure, Refused)
# 10. DIABETE4 - Diagnosed with diabetes - (Yes, Yes, but female told only during pregnancy, 
# No, pre-diabetes or borderline diabetes, No, Don't know/Not sure, Refused)
# 11. RMVTETH4 - Number of teeth removed - All, 6 or more, but not all, 1 to 5, None, Don't know/Not sure, Refused
# 12. _PHYS14D - Number of days physical health not well - (Zero days when physical health not good,     
# 1-13 days when physical health not good, 14+ days when physical health not good, Don’t know/Refused/Missing)                 
# 13. _MENT14D - Number of days mental health not well - Zero days when mental health not good
# 1-13 days when mental health not good, 14+ days when mental health not good, Don’t know/Refused/Missing    
# 14. _TOTINDA - Physical activity - (Had physical activity or exercise, No physical activity or exercise in last 30 days,     
# Don’t know/Refused/Missing)       
# 15. PDIABTST - User has gotten a test for high blood sugar in past 3 years - (Yes, No, Don't know/Not sure, Refused)
# 16. PREDIAB1 - Diagnosed as prediabetic - Yes, Yes, during pregnancy, Don't know/Not Sure, Refused, No
# 17. _RFHLTH - General health - (Good or Better Health, Fair or Poor Health, Don’t know/Not Sure Or Refused/Missing)
# 18. BPHIGH4 - (Told they have high blood pressure - Yes, Told borderline high or pre-hypertensive, 
# Yes, but female told only during pregnancy, Don't Know/Not Sure Refused, No) 

# Lifestyle
# 19. CHECKUP1 - Length since last checkup - (Within past year (anytime less than 12 months ago), 
# Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago), 
# 5 or more years ago, Don’t know/Not sure, Never, Refused)
# 20. LASTDEN4 - Last visited dentist - (Within past year (anytime less than 12 months ago), 
# Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago), 
# 5 or more years ago, Don’t know/Not sure, Never, Refused)
# 21. FLUSHOT7 - Whether someone has taken the flu shot - (Yes, No, Don't know/Not sure, Refused)
# 22. _RFSEAT3 - Seatbeat wearing status - (Always Wear Seat Belt, Don’t Always Wear Seat Belt
# Don’t know/Not Sure Or Refused/Missing)

# Socioeconomic status
# 23. _IMPEDUC - Education - (College 4 years or more (College graduate), 
# College 1 year to 3 years (Some college or technical school), Grade 12 or GED (High school graduate), 
# Grades 9 through 11 (Some high school), Grades 1 through 8 (Elementary), Never attended school or only kindergarten)
# Grades 9 through 11 (Some high school), Grades 1 through 8 (Elementary), Never attended school or only kindergarten)
# 24. EMPLOY1 - 
# 25. _INCOMG - Income level - ($50,000 or more, Don’t know/Not sure/Missing, $15,000 to less than $25,000,   
# $35,000 to less than $50,000, $25,000 to less than $35,000, Less than $15,000)
# 26. _METSTAT - Whether they live in a metropolitan - (1, 2)

# Tobacco, Alcohol
# 27. USENOW3 - Use of smokeless tobacco - (Not at all, Some days, Every day, Refused, Don’t know/Not Sure) 
# 28. ECIGARET - E-ciggarette usage - (Yes, No, Don't know/Not sure, Refused)
# 29. _SMOKER3 - Smoking status - (Current smoker - now smokes every day, Current smoker - now smokes some days,
# Former smoker, Never smoked, Don’t know/Refused/Missing
# 30. _RFBING5 - Binge drinking status - (Yes, No, Don’t know/Refused/Missing)                

# Columns to keep - Response variable
# 31. CVDINFR4 - Ever diagnosed with heart attack - (Yes, No, Don't know/Not sure, Refused)
# 32. CVDCRHD4 - Ever diagnosed with angina/ coronary heart disease - (Yes, No, Don't know/Not sure, Refused)

# For now we will predict heart disease
df_clean_columns = df_clean.columns

In [37]:
# View non-null count 
df_clean_columns.info()

AttributeError: 'Index' object has no attribute 'info'

In [None]:
# Encode categorical values 
cols = list(df_clean_columns.columns)
for col in cols:
    if str(df_clean_columns[col].dtype) == 'object':
        df_clean_columns[col] = df_clean_columns[col].astype('category').cat.codes 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_columns[col] = df_clean_columns[col].astype('category').cat.codes


In [None]:
from sklearn.model_selection import train_test_split 

# Drop all missing values
df_cleaned = df_clean_columns.dropna(axis = 0).reset_index(drop = True)

# Drop all rows that are Don't know/Not sure or Refused for column we are predicting
df_cleaned = df_cleaned.loc[(df_cleaned['CVDCRHD4'] == 0) | (df_cleaned['CVDCRHD4'] == 1)]

# Split into X and y
X = df_cleaned.loc[:, df_cleaned.columns != 'CVDCRHD4']
y = df_cleaned['CVDCRHD4']

# Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3,random_state = 42)

In [None]:
#helper function
def get_performance_scores(y_pred, y_true):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    return [f1, accuracy, precision, recall]

def print_performance_scores(scores):
    print("Accuracy Score = " + str(scores[1]))
    print("Precision Score = " + str(scores[2]))
    print("Recall Score = " + str(scores[3]))
    print("F1 Score = " + str(scores[0]))

In [None]:
pca = PCA(n_components=2)
pca.fit(X_train, y_train)
X_train_pca = pca.fit_transform(X_train)
print(pca.explained_variance_ratio_)

[9.99989289e-01 1.98393147e-06]


In [None]:
RANDOM_SEED = 694

#Run dummy classifier to find the baseline performance
dummy_clf = DummyClassifier(strategy= 'most_frequent').fit(X_train,y_train)
y_pred = dummy_clf.predict(X_test)
print_performance_scores(get_performance_scores(y_pred, y_test))

Accuracy Score = 0.9918189255522225
Precision Score = 0.49590946277611125
Recall Score = 0.5
F1 Score = 0.4979463307776561


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
clf_lr = LogisticRegression(random_state = RANDOM_SEED).fit(X_train_pca, y_train)

In [None]:
X_test_transformed = pca.fit_transform(X_test)
train_preds = clf_lr.predict(X_test_transformed)
print_performance_scores(get_performance_scores(train_preds, y_test))

Accuracy Score = 0.9918189255522225
Precision Score = 0.49590946277611125
Recall Score = 0.5
F1 Score = 0.4979463307776561


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
train_preds

array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [None]:
#Do grid search for hyperparameter tuning
clf = LogisticRegression(random_state = RANDOM_SEED, solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1_macro')
my_list = list(range(100))
for x in tqdm(my_list):
    grid_clf_acc.fit(X_train_pca, y_train)

y_pred_acc = grid_clf_acc.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred_acc, y_test))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]



Accuracy Score = 0.9918189255522225
Precision Score = 0.49590946277611125
Recall Score = 0.5
F1 Score = 0.4979463307776561


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train_transformed, y_train)

grid_values = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
 }

rand_search_clf = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_values, n_iter = 1, cv = 3, verbose=2, random_state=RANDOM_SEED, n_jobs = -1)

my_list = list(range(100))
for x in tqdm(my_list):
    rand_search_clf.fit(X_train_pca[0:10000], y_train[0:10000])


y_pred_rf = rand_search_clf.predict(X_test_transformed)
   


print_performance_scores(get_performance_scores(y_pred_rf, y_test))

NameError: name 'X_train_transformed' is not defined

In [None]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_SEED )
random_forest.fit(X_train_pca, y_train)

y_pred = random_forest.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred, y_test))

Accuracy Score = 0.9918189255522225
Precision Score = 0.49590946277611125
Recall Score = 0.5
F1 Score = 0.4979463307776561


  _warn_prf(average, modifier, msg_start, len(result))
