The purpose of this notebook is to try different supervised learning models to find if we can predict if someone would be classfied as at risk of having a heart attack or not. The models in this notebook will be trained on the raw dataset. We will use the scores from these models and compare them to the models trained on the dataset transformed by FAMD to see if using FAMD increases our scores. We use the same cleaning steps as the data that is transformed so that we know if the increase in score is coming directly from transforming the data with FAMD

In [1]:
import pandas as pd
import numpy as np
from prince import FAMD
import zipfile 
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


In [2]:
# Read in DataFrame
zf = zipfile.ZipFile('ny.csv.zip') 
zf.namelist() 
df = pd.read_csv(zf.open('ny.csv'),  encoding = 'cp1252')
#df = pd.read_csv('ny.csv', encoding = 'cp1252')


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Use df_clean for cleaning
df_clean = df.copy()

# Repalce 'Not asked or Missing' and 'Data do not meet the criteria for statistical reliability, 
# data quality or confidentiality (data are suppressed)' with NA
for col in df.columns:
    df_clean[col].replace({'Not asked or Missing' : np.nan}, inplace = True)
    df_clean[col].replace({'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)' : np.nan}, inplace = True)
    
# Drop columns with over 80% missing values
df_clean.dropna(axis = 1, thresh = len(df_clean) * .50, inplace = True)

In [5]:
# Step 5: Encode categorical variables as numeric to calculate correlations
df_clean_categorical = df_clean.copy()
cols = list(df_clean_categorical.columns)
for col in cols:
    if str(df_clean_categorical[col].dtype) == 'object':
        df_clean_categorical[col] = df_clean_categorical[col].astype('category').cat.codes

df_clean.head()

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,CELLSEX,SEXVAR,GENHLTH,...,WTRSOURCE,STRSMEAL,FRUITVEG,MJUSE30,MJNYSMMP,HEPCTEST,HEPCTOLD,_WT2SPLITS,REGION,DSRIPREG
0,New York,1152020,1,15,2020,2020002528,2020002528,,Male,Very good,...,Public Water Supply,Always,,,No,No,No,5130.843243,NYS exclusive of NYC,Long Island
1,New York,1302020,1,30,2020,2020002529,2020002529,,Male,Very good,...,Public Water Supply,Never,,,No,No,No,941.550458,NYS exclusive of NYC,Long Island
2,New York,1152020,1,15,2020,2020002530,2020002530,,Female,Good,...,Public Water Supply,Never,,,No,No,No,3497.366203,NYS exclusive of NYC,Long Island
3,New York,2032020,2,3,2020,2020004509,2020004509,,Female,Good,...,Public Water Supply,Never,Neighborhood,,No,,,1187.70903,NYS exclusive of NYC,Long Island
4,New York,2152020,2,15,2020,2020002531,2020002531,,Male,Fair,...,Don't Know/Not Sure,Rarely,,,No,No,No,13364.387863,NYS exclusive of NYC,Long Island


In [6]:
#Step 7: Create correlation matrix to find which features to use for mca
df_clean_corr = df_clean_categorical.corrwith(df_clean_categorical["CVDCRHD4"])
df_clean_corr_abs = df_clean_corr.abs()
df_clean_corr_abs.sort_values(inplace=True, ascending=False)
df_clean_corr_abs

feature_list = list(df_clean_corr_abs[0:100].keys())
feature_list.remove('CVDINFR4')
feature_list.remove('_MICHD')
feature_list

df_clean_columns = df_clean[feature_list]
df_clean_columns.head()

Unnamed: 0,CVDCRHD4,_AGE80,_PNEUMO3,_RFHLTH,_FLSHOT7,_AGE_G,_IMPAGE,_AGEG5YR,DIABETE4,_DRDXAR2,...,LASTDEN4,WTRSOURCE,ADDEPEV3,_IMPMRTL,_RACEGR3,STRSMEAL,DRNKANY5,_BMI5CAT,_PRACE1,_MRACE1
0,No,Imputed Age 65 to 69,No,Good or Better Health,Yes,Age 65 or older,Age 65 or older,Age 65 to 69,No,Not diagnosed with arthritis,...,Within the past year (anytime less than 12 mon...,Public Water Supply,No,Married,"Other race only, Non-Hispanic",Always,Yes,Obese,Asian,Asian Only
1,No,Imputed Age 55 to 59,Age Less Than 65,Good or Better Health,Age Less Than 65,Age 55 to 64,Age 55 to 64,Age 55 to 59,No,Not diagnosed with arthritis,...,Within the past year (anytime less than 12 mon...,Public Water Supply,No,Married,"White only, Non-Hispanic",Never,Yes,Obese,White,White only
2,No,Imputed Age 80 or older,Yes,Good or Better Health,Yes,Age 65 or older,Age 65 or older,Age 80 or older,Yes,Diagnosed with arthritis,...,Within the past year (anytime less than 12 mon...,Public Water Supply,No,Widowed,"White only, Non-Hispanic",Never,No,Normal Weight,White,White only
3,No,Imputed Age 80 or older,Yes,Good or Better Health,Yes,Age 65 or older,Age 65 or older,Age 80 or older,No,Diagnosed with arthritis,...,Within the past year (anytime less than 12 mon...,Public Water Supply,Yes,Widowed,"White only, Non-Hispanic",Never,Yes,Overweight,White,White only
4,No,Imputed Age 40 to 44,Age Less Than 65,Fair or Poor Health,Age Less Than 65,Age 35 to 44,Age 35 to 44,Age 40 to 44,Yes,Not diagnosed with arthritis,...,Within the past year (anytime less than 12 mon...,Don't Know/Not Sure,Yes,Married,Hispanic,Rarely,No,Overweight,Other race,Other race only


In [7]:
from sklearn.model_selection import train_test_split 

# Drop all missing values
df_cleaned = df_clean_columns.dropna(axis = 0).reset_index(drop = True)

# Drop all rows that are Don't know/Not sure or Refused for column we are predicting
df_cleaned = df_cleaned.loc[(df_cleaned['CVDCRHD4'] == 'No') | (df_cleaned['CVDCRHD4'] == 'Yes')]

# Split into X and y
X = df_cleaned.loc[:, df_cleaned.columns != 'CVDCRHD4']
y = df_cleaned['CVDCRHD4']

# Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3,random_state = 42)

In [9]:
#copy of cleaned df to change categorical data to numeric
numeric_df = df_cleaned.copy()

#2 categories

#replacing no = 0 and yes = 1 
#numeric_df['CVDCRHD4'] = numeric_df.CVDCRHD4.eq('Yes').mul(1)
numeric_df['CVDCRHD4'] = numeric_df.CVDCRHD4.map({'Yes':1, 'No':0})
#replacing male = 0 and female = 1
numeric_df['SEXVAR'] = numeric_df.SEXVAR.eq('Male').mul(0)


#3 categories

#replacing no/bad = 0, yes/good = 1, idk/missing/refused = 2
numeric_df['_TOTINDA'] = numeric_df._TOTINDA.map({'Had physical activity or exercise':1, 'No physical activity or exercise in last 30 days':0, 'Don’t know/Refused/Missing':2})
#numeric_df['_RFSEAT3'] = numeric_df._RFSEAT3.map({'Always Wear Seat Belt':1, 'Don’t Always Wear Seat Belt':0, 'Don’t know/Not Sure Or Refused/Missing':2})
numeric_df['_RFHLTH'] = numeric_df._RFHLTH.map({'Good or Better Health':1, 'Fair or Poor Health':0, 'Don’t know/Not Sure Or Refused/Missing':2})
numeric_df['_RFBING5'] = numeric_df._RFBING5.map({'Yes':1, 'No':0, 'Don’t know/Refused/Missing':2})
numeric_df['_RFBMI5'] = numeric_df._RFBMI5.map({'Yes':1, 'No':0, 'Don’t know/Refused/Missing':2})

#4 categories

#replacing no = 0, yes = 1, idk/missing = 2 refused = 3
numeric_df['FLUSHOT7'] = numeric_df.FLUSHOT7.map({'Yes':1, 'No':0, 'Don’t know/Not Sure':2,  'Refused': 3 })
numeric_df['ECIGARET'] = numeric_df.ECIGARET.map({'Yes':1, 'No':0, 'Don’t know/Not Sure':2, 'Refused': 3 })
numeric_df['VETERAN3'] = numeric_df.VETERAN3.map({'Yes':1, 'No':0, 'Don’t know/Not Sure':2, 'Refused': 3 })
numeric_df['ADDEPEV3'] = numeric_df.ADDEPEV3.map({'Yes':1, 'No':0, 'Don’t know/Not Sure':2, 'Refused': 3 })
## ADDEPEV3 contains NAN changing those to value 3 == Refused to answer
numeric_df['ADDEPEV3'] = numeric_df['ADDEPEV3'].fillna(3)
numeric_df['HLTHPLN1'] = numeric_df.HLTHPLN1.map({'Yes':1, 'No':0, 'Don’t know/Not sure':2, 'Refused': 3 })
## HLTHPLN1 contains NAN changing those to value 3 == Refused to answer
numeric_df['HLTHPLN1'] = numeric_df['HLTHPLN1'].fillna(3)
numeric_df['PDIABTST'] = numeric_df.PDIABTST.map({'Yes':1, 'No':0, 'Don’t know/Not sure':2, 'Refused': 3 })
## PDIABTST contains NAN changing those to value 3 == Refused to answer
numeric_df['PDIABTST'] = numeric_df['PDIABTST'].fillna(3)
#replacing non = 0, most = 1, some = 2, idk/missing = 3
numeric_df['_PHYS14D'] = numeric_df._PHYS14D.map({'14+ days when physical health not good':1, 'Zero days when physical health not good':0, '1-13 days when physical health not good': 2, 'Don’t know/Refused/Missing':3})
numeric_df['_MENT14D'] = numeric_df._MENT14D.map({'14+ days when physical health not good':1, 'Zero days when physical health not good':0, '1-13 days when physical health not good': 2, 'Don’t know/Refused/Missing':3})
## _MENT14D contains NAN changing those to value 3 == Refused to answer
numeric_df['_MENT14D'] = numeric_df['_MENT14D'].fillna(3)
    ##does _MENT14D really add to our project? every one answered with a value of 3 (idk/missing)

#5 categories

#replacing no = 0, yes = 1, some = 2, idk/missing = 3, refused = 4
numeric_df['USENOW3'] = numeric_df.USENOW3.map({'Every day':1, 'Not at all':0, 'Some days': 2, 'Don’t know/Not Sure':3, 'Refused': 4 })
numeric_df['PREDIAB1'] = numeric_df.PREDIAB1.map({'Yes':1, 'No':0, 'Yes, during pregnancy': 2, 'Don’t know/Not Sure':3, 'Refused': 4 })
numeric_df['DIABETE4'] = numeric_df.DIABETE4.map({'Yes, but female told only during pregnancy':1, 'No':0, 'No, pre-diabetes or borderline diabetes': 2, 'Don’t know/Not Sure':3, 'Refused': 4 })
#replacing no = 0, yes = 1, some = 2, former = 3 ,idk/missing = 4 
numeric_df['_SMOKER3'] = numeric_df._SMOKER3.map({'Current smoker - now smokes every day':1, 'Never smoked':0, 'Current smoker - now smokes some days': 2, 'Don’t know/Refused/Missing':3, 'Former smoker': 4 })

#6 categories

#replacing Never married = 0, Married = 1, Separated = 2, Divorced = 3 , Widowed = 4, A member of an unmarried couple = 5 
numeric_df['_IMPMRTL'] = numeric_df._IMPMRTL.map({'Married':1, 'Never married':0, 'Separated': 2, 'Divorced':3, 'Widowed': 4, 'A member of an unmarried couple': 5})
#replacing American Indian/Alaskan Native, Non-Hispanic = 0, Asian, Non-Hispanic = 1, Black, Non-Hispanic = 2, Hispanic = 3 ,Other race, Non-Hispanic = 4,  White, Non-Hispanic = 5
numeric_df['_IMPRACE'] = numeric_df._IMPRACE.map({'Asian, Non-Hispanic':1, 'American Indian/Alaskan Native, Non-Hispanic':0, 'Black, Non-Hispanic': 2, 'Hispanic':3, 'Other race, Non-Hispanic': 4, 'White, Non-Hispanic': 5})
#replacing Age 18 to 24 = 0, Age 25 to 34 = 1, Age 35 to 44 = 2, Age 45 to 54 = 3 , Age 55 to 64 = 4,  Age 65 or older = 5
numeric_df['_IMPAGE'] = numeric_df._IMPAGE.map({'Age 25 to 34':1, 'Age 18 to 24':0, 'Age 35 to 44': 2, 'Age 45 to 54':3, 'Age 55 to 64': 4, 'Age 65 or older': 5})
#replacing No = 0, Yes = 1, borderline = 2, preg = 3 , Don't Know/Not Sure = 4,  Refused = 5
numeric_df['BPHIGH4'] = numeric_df.BPHIGH4.map({'No':0, 'Yes':1, 'Told borderline high or pre-hypertensive': 2, 'Yes, but female told only during pregnancy':3, 'Don’t know/Not Sure': 4, 'Refused': 5})
## BPHIGH4 contains NAN changing those to value 5 == Refused to answer
numeric_df['BPHIGH4'] = numeric_df['BPHIGH4'].fillna(5)
#replacing None = 0, 1 to 5 = 1, 6 or more, but not all = 2, All = 3 , Don't Know/Not Sure = 4,  Refused = 5
numeric_df['RMVTETH4'] = numeric_df.RMVTETH4.map({'None':0, '1 to 5':1, '6 or more, but not all': 2, 'GED':3, 'College 1 year to 3 years (Some college or technical school)': 4, 'College 4 years or more (College graduate)': 5, 'Refused': 6})
## RMVTETH4 contains NAN changing those to value 6 == Refused to answer
numeric_df['RMVTETH4'] = numeric_df['RMVTETH4'].fillna(6)


#replacing None = 0, elementary = 1, some high school = 2, high school = 3 , Don't Know/Not Sure = 4,  Refused = 5
numeric_df['_IMPEDUC'] = numeric_df._IMPEDUC.map({'Never attended school or only kindergarten':0, 'Grades 1 through 8 (Elementary)':1, 'Grades 9 through 11 (Some high school)': 2, 'Grade 12 or GED (High school graduate)':3, 'College 1 year to 3 years (Some college or technical school)':4, 'College 4 years or more (College graduate)': 5})
#replacing < $15,000 = 0, $15,000 < $25,000 = 1, $25,000 < $35,000 = 2, $35,000 < $50,000 = 3 , $50,000 or more = 4,  Don’t know/Not sure/Missing = 5
numeric_df['_INCOMG'] = numeric_df._INCOMG.map({'Less than $15,000':0, '$15,000 to less than $25,000':1, '$25,000 to less than $35,000': 2, '$35,000 to less than $50,000':3, '$50,000 or more':4, 'Don’t know/Not sure/Missing': 5})

#7 categories

#replacing Never = 0, 1 yr = 1, less than 2 years ago = 2, less than 5 years ago = 3 , 5 or more years ago = 4,  Don’t know/Not sure = 5, refused = 6
numeric_df['CHECKUP1'] = numeric_df.CHECKUP1.map({'Never':0, 'Within past year (anytime less than 12 months ago)':1, 'Within past 2 years (1 year but less than 2 years ago)': 2, 'Within past 5 years (2 years but less than 5 years ago)':3, '5 or more years ago':4, 'Don’t know/Not sure': 5, 'Refused': 6})
numeric_df['LASTDEN4'] = numeric_df.LASTDEN4.map({'Never':0, 'Within past year (anytime less than 12 months ago)':1, 'Within past 2 years (1 year but less than 2 years ago)': 2, 'Within past 5 years (2 years but less than 5 years ago)':3, '5 or more years ago':4, 'Don’t know/Not sure': 5, 'Refused': 6})
## LASTDEN4 contains 6274 NAN values, Changing these to a value of 6 which is equal to refused to answer.
numeric_df['LASTDEN4'] = numeric_df['LASTDEN4'].fillna(6)

#9 categories

#replacing homemaker = 0, A student = 1, Employed for wages = 2, Self-employed	0  = 3 , Unable to work = 4,  Out of work for < 1 year = 5, Out of work for > year = 6, Retired = 7, Refused = 8
numeric_df['EMPLOY1'] = numeric_df.EMPLOY1.map({'A homemaker':0, 'A student':1, 'Employed for wages': 2, 'Self-employed':3, 'Unable to work':4, 'Out of work for less than 1 year': 5, 'Out of work for 1 year or more': 6, 'Retired': 7, 'Refused': 8})


numeric_df.columns
#numeric_df = numeric_df.dropna()
numeric_df
#numeric_df.isnull().sum()
#new_df.groupby('_INCOMG').sum()

AttributeError: 'DataFrame' object has no attribute '_RFBING5'

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

X = numeric_df.loc[:, numeric_df.columns != 'CVDCRHD4']
y = numeric_df['CVDCRHD4']


# # Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [None]:
#Logistic Regression
RANDOM_SEED = 694

clf_lr = LogisticRegression(random_state = RANDOM_SEED).fit(X_train, y_train)
y_pred = clf_lr.predict(X_test)
lr_f1 = f1_score(y_test, y_pred, average = 'macro')
lr_accuracy = accuracy_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred, average='macro')
lr_recall = recall_score(y_test, y_pred, average='macro')
print("Accuracy Score = " + str(lr_accuracy))
print("Precision Score = " + str(lr_precision))
print("Recall Score = " + str(lr_recall))
print("F1 Score = " + str(lr_f1))


In [None]:
#K NEAREST NEIGHBOR
knn_clf=KNeighborsRegressor()
knnreg = KNeighborsRegressor(n_neighbors = 83).fit(X_train, y_train)
r2 = knnreg.score(X_test, y_test)
r2


In [None]:
#RANDOM FOREST
random_forest = RandomForestClassifier(random_state = RANDOM_SEED)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

rf_f1 = f1_score(y_test, y_pred, average = 'macro')
rf_accuracy = accuracy_score(y_test, y_pred)
rf_precision = precision_score(y_test, y_pred, average='macro')
rf_recall = recall_score(y_test, y_pred, average='macro')
print("Accuracy Score = " + str(rf_accuracy))
print("Precision Score = " + str(rf_precision))
print("Recall Score = " + str(rf_recall))
print("F1 Score = " + str(rf_f1))

In [None]:
#POLYNOMIAL REGRESSION
degs = (1, 3, 7, 11)

X = numeric_df.loc[:, numeric_df.columns != 'CVDCRHD4']
y = numeric_df['CVDCRHD4']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)
# for d in degs:
poly = PolynomialFeatures(degree = 3)
    #print(poly)
        
X_poly = poly.fit_transform(X)
       
        
linreg = LinearRegression().fit(X_train, y_train)
r2_train =linreg.score(X_train, y_train)
    
print(r2_train)
