In [None]:
import pandas as pd
import numpy as np
from prince import FAMD
import zipfile 
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [None]:
# Read in DataFrame
# The CSV lives inside a zip archive; context managers make sure both the
# archive and the member stream are closed once the frame has been parsed.
with zipfile.ZipFile('ny.csv.zip') as zf:
    with zf.open('ny.csv') as csv_file:
        df = pd.read_csv(csv_file, encoding = 'cp1252')
# Alternative when the CSV has already been extracted next to the notebook:
#df = pd.read_csv('ny.csv', encoding = 'cp1252')


In [None]:
# Preview the first five rows of the raw data (head() defaults to five rows)
df.head()

In [None]:
# Use df_clean for cleaning; df itself is left untouched.

# Placeholder strings that stand for an unavailable answer in the raw data.
MISSING_MARKERS = [
    'Not asked or Missing',
    'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)',
]

# Minimum share of non-missing values a column needs in order to be kept.
MIN_NON_NULL_FRACTION = 0.50

# Replace the placeholder strings with NA in a single frame-level call.
# (Calling `df_clean[col].replace(..., inplace=True)` acts on a column selection
# rather than on the frame, which pandas warns about and which does not update
# the frame when Copy-on-Write is enabled.)
df_clean = df.replace(MISSING_MARKERS, np.nan)

# Drop columns with more than 50% missing values, i.e. keep a column only when
# at least half of its rows are non-missing. Rounding up keeps the cut-off
# identical to comparing the integer non-null count with len * 0.50.
min_non_null = int(np.ceil(len(df_clean) * MIN_NON_NULL_FRACTION))
df_clean = df_clean.dropna(axis = 1, thresh = min_non_null)

In [None]:
# Preview the first five rows of the DataFrame once missing values are cleaned out
df_clean.head()

In [None]:
# Columns relevant to heart disease are chosen with the help of resources that
# describe its risk factors:
# shorturl.at/oqwF5 - Behavioral risk factors of coronary artery disease: A paired matched case control study
# shorturl.at/cpAXZ - Strategies to prevent heart disease
# shorturl.at/gpwAR - Top five habits that harm the heart
# shorturl.at/mtJUZ - 9 Common Habits That Are Bad for Your Heart

# Show every column name that survived cleaning
df_clean.columns.tolist()

In [None]:
# Characteristics
# 1. SEXVAR - Sex - (Male or Female)
# 2. _IMPAGE - Age - (Age 65 or older, Age 55 - 64, Age 45 - 54, Age 35 - 44, Age 25 - 34, Age 18 - 24)
# 3. _IMPRACE - Race - (White, Non-Hispanic; Hispanic; Black, Non-Hispanic; Other race, Non-Hispanic;
#    Asian, Non-Hispanic; American Indian/Alaskan Native, Non-Hispanic)
# 4. VETERAN3 - Former veteran status - (Yes, No, Refused, Don't know/Not sure)
# 5. WTKG3 - Weight in KG - (Continuous value)
# 6. _IMPMRTL - Marital status - (Married, Never Married, Divorced, Widowed, A member of an unmarried couple,
#    Separated)
# 7. _RFBMI5 - Overweight or Obese - (Yes, No, Don’t know/Refused/Missing)
characteristic_cols = ['SEXVAR', '_IMPAGE', '_IMPRACE', 'VETERAN3', 'WTKG3', '_IMPMRTL', '_RFBMI5']

# Health
# 8. HLTHPLN1 - Has Healthcare Coverage - (Yes, No, Don't know/Not sure, Refused)
# 9. ADDEPEV3 - Diagnosed with depression - (Yes, No, Don't know/Not sure, Refused)
# 10. DIABETE4 - Diagnosed with diabetes - (Yes; Yes, but female told only during pregnancy;
#     No, pre-diabetes or borderline diabetes; No; Don't know/Not sure; Refused)
# 11. RMVTETH4 - Number of teeth removed - (All; 6 or more, but not all; 1 to 5; None; Don't know/Not sure; Refused)
# 12. _PHYS14D - Number of days physical health not well - (Zero days when physical health not good,
#     1-13 days when physical health not good, 14+ days when physical health not good, Don’t know/Refused/Missing)
# 13. _MENT14D - Number of days mental health not well - (Zero days when mental health not good,
#     1-13 days when mental health not good, 14+ days when mental health not good, Don’t know/Refused/Missing)
# 14. _TOTINDA - Physical activity - (Had physical activity or exercise, No physical activity or exercise in last 30 days,
#     Don’t know/Refused/Missing)
# 15. PDIABTST - User has gotten a test for high blood sugar in past 3 years - (Yes, No, Don't know/Not sure, Refused)
# 16. PREDIAB1 - Diagnosed as prediabetic - (Yes; Yes, during pregnancy; Don't know/Not Sure; Refused; No)
# 17. _RFHLTH - General health - (Good or Better Health, Fair or Poor Health, Don’t know/Not Sure Or Refused/Missing)
# 18. BPHIGH4 - Told they have high blood pressure - (Yes; Told borderline high or pre-hypertensive;
#     Yes, but female told only during pregnancy; Don't Know/Not Sure; Refused; No)
health_cols = ['HLTHPLN1', 'ADDEPEV3', 'DIABETE4', 'RMVTETH4', '_PHYS14D', '_MENT14D', '_TOTINDA',
               'PDIABTST', 'PREDIAB1', '_RFHLTH', 'BPHIGH4']

# Lifestyle
# 19. CHECKUP1 - Length since last checkup - (Within past year (anytime less than 12 months ago),
#     Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago),
#     5 or more years ago, Don’t know/Not sure, Never, Refused)
# 20. LASTDEN4 - Last visited dentist - (Within past year (anytime less than 12 months ago),
#     Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago),
#     5 or more years ago, Don’t know/Not sure, Never, Refused)
# 21. FLUSHOT7 - Whether someone has taken the flu shot - (Yes, No, Don't know/Not sure, Refused)
# 22. _RFSEAT3 - Seatbelt wearing status - (Always Wear Seat Belt, Don’t Always Wear Seat Belt,
#     Don’t know/Not Sure Or Refused/Missing)
lifestyle_cols = ['CHECKUP1', 'LASTDEN4', 'FLUSHOT7', '_RFSEAT3']

# Socioeconomic status
# 23. _IMPEDUC - Education - (College 4 years or more (College graduate),
#     College 1 year to 3 years (Some college or technical school), Grade 12 or GED (High school graduate),
#     Grades 9 through 11 (Some high school), Grades 1 through 8 (Elementary), Never attended school or only kindergarten)
# 24. EMPLOY1 - Employment status - (A homemaker, A student, Employed for wages, Self-employed, Unable to work,
#     Out of work for less than 1 year, Out of work for 1 year or more, Retired, Refused)
# 25. _INCOMG - Income level - ($50,000 or more, Don’t know/Not sure/Missing, $15,000 to less than $25,000,
#     $35,000 to less than $50,000, $25,000 to less than $35,000, Less than $15,000)
# 26. _METSTAT - Whether they live in a metropolitan area - (1, 2)
socioeconomic_cols = ['_IMPEDUC', 'EMPLOY1', '_INCOMG', '_METSTAT']

# Tobacco, Alcohol
# 27. USENOW3 - Use of smokeless tobacco - (Not at all, Some days, Every day, Refused, Don’t know/Not Sure)
# 28. ECIGARET - E-cigarette usage - (Yes, No, Don't know/Not sure, Refused)
# 29. _SMOKER3 - Smoking status - (Current smoker - now smokes every day, Current smoker - now smokes some days,
#     Former smoker, Never smoked, Don’t know/Refused/Missing)
# 30. _RFBING5 - Binge drinking status - (Yes, No, Don’t know/Refused/Missing)
tobacco_alcohol_cols = ['USENOW3', 'ECIGARET', '_SMOKER3', '_RFBING5']

# Columns to keep - Response variable
# 31. CVDINFR4 - Ever diagnosed with heart attack - (Yes, No, Don't know/Not sure, Refused)
# 32. CVDCRHD4 - Ever diagnosed with angina/ coronary heart disease - (Yes, No, Don't know/Not sure, Refused)
# For now we will predict heart disease, so only CVDCRHD4 is carried forward.
response_cols = ['CVDCRHD4']

# Assemble the groups in catalogue order and take those columns from df_clean
selected_cols = (characteristic_cols + health_cols + lifestyle_cols
                 + socioeconomic_cols + tobacco_alcohol_cols + response_cols)
df_clean_columns = df_clean[selected_cols]

In [None]:
# Display the frame restricted to the selected columns
df_clean_columns

In [None]:
# View the non-null count of every selected column (DataFrame.info summary)
df_clean_columns.info()

In [None]:
"""
# Encode categorical values 
cols = list(df_clean_columns.columns)
for col in cols:
    if str(df_clean_columns[col].dtype) == 'object':
        df_clean_columns[col] = df_clean_columns[col].astype('category').cat.codes
"""    

In [None]:
from sklearn.model_selection import train_test_split

# Keep only rows without any missing value and renumber them from zero
complete_rows = df_clean_columns.dropna(axis = 0).reset_index(drop = True)

# Keep only rows whose answer to the predicted column is a definite 'No' or 'Yes'
# (this removes Don't know/Not sure and Refused)
has_definite_answer = complete_rows['CVDCRHD4'].isin(['No', 'Yes'])
df_cleaned = complete_rows.loc[has_definite_answer]

# Target is CVDCRHD4; the features are every other column
y = df_cleaned['CVDCRHD4']
X = df_cleaned.drop(columns = 'CVDCRHD4')

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.7, test_size = 0.3, random_state = 42
)

In [None]:
from prince import FAMD
import plotly.express as px

# Find optimal number of components for FAMD
# Largest number of components tried in the sweep.
MAX_FAMD_COMPONENTS = 150

# Collect one record per fit and build the frame once at the end
# (DataFrame.append no longer exists in pandas 2.x and copied the frame on every call).
sweep_records = []

for num_components in range(1, MAX_FAMD_COMPONENTS + 1):

    # Initialize and fit FAMD; only the fitted inertia is needed, not the projection
    famd = FAMD(n_components = num_components, n_iter = 3, random_state = 42)
    famd.fit(X_train)

    # Calculate total explained variance across the fitted components
    explained_variance = famd.explained_inertia_.sum()

    sweep_records.append({'num_components': num_components,
                          'explained_variance': explained_variance})

optimal_components = pd.DataFrame(sweep_records,
                                  columns = ['num_components', 'explained_variance'])

fig = px.scatter(optimal_components, x = 'num_components', y = 'explained_variance')
fig.show()

# Show the row(s) with the highest total explained variance
best_variance = optimal_components['explained_variance'].max()
optimal_components.loc[optimal_components['explained_variance'] == best_variance]

In [None]:
# Fit FAMD with 120 components on the training features
famd = FAMD(n_components = 120, n_iter = 3, random_state = 42)
famd.fit_transform(X_train)

# Tabulate the explained variance alongside a 1-based component number
famd_explained_variance = famd.explained_inertia_
df_famd_explained = (
    pd.DataFrame(famd_explained_variance)
    .assign(component = list(range(1, 121)))
)
df_famd_explained.columns = ['explained_variance', 'component']

In [None]:
# Bar chart: explained variance of each FAMD component
fig = px.bar(data_frame = df_famd_explained, x = 'component', y = 'explained_variance')
fig

In [None]:
import warnings

#copy of cleaned df to change categorical data to numeric
numeric_df = df_cleaned.copy()


def _normalize_label(value):
    """Return a canonical form of a survey label for dictionary lookup.

    Curly apostrophes become straight ones, surrounding whitespace is stripped
    and case is folded, so "Don’t know/Not Sure" and "Don't know/Not sure"
    compare equal. Non-string values (e.g. NaN) are returned unchanged.
    """
    if isinstance(value, str):
        return value.replace('\u2019', "'").strip().casefold()
    return value


def encode_labels(series, mapping, fallback=None):
    """Translate the text labels of ``series`` into numeric codes.

    Parameters
    ----------
    series : pandas.Series
        Column of text labels.
    mapping : dict
        Label -> code. Labels are compared after ``_normalize_label``.
    fallback : number, optional
        Code written where no mapping entry matched. When omitted, unmatched
        entries stay NaN.

    Returns
    -------
    pandas.Series
        The encoded column, aligned with ``series``.

    A warning listing the offending labels is issued whenever a non-missing
    value has no mapping entry, so typos in a mapping are not silently absorbed
    by the fallback code.
    """
    lookup = {_normalize_label(label): code for label, code in mapping.items()}
    encoded = series.map(_normalize_label).map(lookup)

    unmatched = series[encoded.isna() & series.notna()]
    if len(unmatched) > 0:
        warnings.warn(
            f"{series.name}: {len(unmatched)} value(s) had no code; "
            f"unmapped labels: {sorted(unmatched.astype(str).unique())}"
        )

    if fallback is not None:
        encoded = encoded.fillna(fallback)
    return encoded


#shared code tables
YES_NO = {'No': 0, 'Yes': 1}
#no = 0, yes = 1, idk/missing/refused = 2
YES_NO_MISSING = {'No': 0, 'Yes': 1, "Don't know/Refused/Missing": 2}
#no = 0, yes = 1, idk = 2, refused = 3
YES_NO_DK_REFUSED = {'No': 0, 'Yes': 1, "Don't know/Not sure": 2, 'Refused': 3}
#Never = 0, within 1 yr = 1, within 2 yrs = 2, within 5 yrs = 3, 5 or more years ago = 4, idk = 5, refused = 6
LAST_VISIT = {
    'Never': 0,
    'Within past year (anytime less than 12 months ago)': 1,
    'Within past 2 years (1 year but less than 2 years ago)': 2,
    'Within past 5 years (2 years but less than 5 years ago)': 3,
    '5 or more years ago': 4,
    "Don't know/Not sure": 5,
    'Refused': 6,
}

#(column, label -> code, fallback code for unmatched labels or None)
ENCODINGS = [
    #2 categories
    ('CVDCRHD4', YES_NO, None),
    #male = 0, female = 1 (the previous expression multiplied by 0 and so was always 0)
    ('SEXVAR', {'Male': 0, 'Female': 1}, None),

    #3 categories: no/bad = 0, yes/good = 1, idk/missing/refused = 2
    ('_TOTINDA', {'No physical activity or exercise in last 30 days': 0,
                  'Had physical activity or exercise': 1,
                  "Don't know/Refused/Missing": 2}, None),
    ('_RFSEAT3', {"Don't Always Wear Seat Belt": 0,
                  'Always Wear Seat Belt': 1,
                  "Don't know/Not Sure Or Refused/Missing": 2}, None),
    ('_RFHLTH', {'Fair or Poor Health': 0,
                 'Good or Better Health': 1,
                 "Don't know/Not Sure Or Refused/Missing": 2}, None),
    ('_RFBING5', YES_NO_MISSING, None),
    ('_RFBMI5', YES_NO_MISSING, None),

    #4 categories: no = 0, yes = 1, idk = 2, refused = 3
    ('FLUSHOT7', YES_NO_DK_REFUSED, None),
    ('ECIGARET', YES_NO_DK_REFUSED, None),
    ('VETERAN3', YES_NO_DK_REFUSED, None),
    #unmatched labels in the next three columns fall back to 3 == Refused
    ('ADDEPEV3', YES_NO_DK_REFUSED, 3),
    ('HLTHPLN1', YES_NO_DK_REFUSED, 3),
    ('PDIABTST', YES_NO_DK_REFUSED, 3),
    #none = 0, most = 1, some = 2, idk/missing = 3
    ('_PHYS14D', {'Zero days when physical health not good': 0,
                  '14+ days when physical health not good': 1,
                  '1-13 days when physical health not good': 2,
                  "Don't know/Refused/Missing": 3}, None),
    #_MENT14D labels say "mental health"; with the "physical health" labels nothing
    #matched and every row ended up as the fallback 3
    ('_MENT14D', {'Zero days when mental health not good': 0,
                  '14+ days when mental health not good': 1,
                  '1-13 days when mental health not good': 2,
                  "Don't know/Refused/Missing": 3}, 3),

    #5 categories: no = 0, yes = 1, some = 2, idk/missing = 3, refused = 4
    ('USENOW3', {'Not at all': 0, 'Every day': 1, 'Some days': 2,
                 "Don't know/Not Sure": 3, 'Refused': 4}, None),
    ('PREDIAB1', {'No': 0, 'Yes': 1, 'Yes, during pregnancy': 2,
                  "Don't know/Not Sure": 3, 'Refused': 4}, None),
    #never = 0, every day = 1, some days = 2, idk/missing = 3, former = 4
    ('_SMOKER3', {'Never smoked': 0,
                  'Current smoker - now smokes every day': 1,
                  'Current smoker - now smokes some days': 2,
                  "Don't know/Refused/Missing": 3,
                  'Former smoker': 4}, None),

    #6 categories
    #No = 0, Yes = 1, borderline = 2, preg = 3, idk = 4, Refused = 5 (same layout as BPHIGH4);
    #a plain 'Yes' previously had no code at all
    ('DIABETE4', {'No': 0, 'Yes': 1,
                  'No, pre-diabetes or borderline diabetes': 2,
                  'Yes, but female told only during pregnancy': 3,
                  "Don't know/Not Sure": 4, 'Refused': 5}, None),
    #Never married = 0, Married = 1, Separated = 2, Divorced = 3, Widowed = 4, A member of an unmarried couple = 5
    ('_IMPMRTL', {'Never married': 0, 'Married': 1, 'Separated': 2, 'Divorced': 3,
                  'Widowed': 4, 'A member of an unmarried couple': 5}, None),
    #American Indian/Alaskan Native = 0, Asian = 1, Black = 2, Hispanic = 3, Other race = 4, White = 5
    ('_IMPRACE', {'American Indian/Alaskan Native, Non-Hispanic': 0,
                  'Asian, Non-Hispanic': 1, 'Black, Non-Hispanic': 2, 'Hispanic': 3,
                  'Other race, Non-Hispanic': 4, 'White, Non-Hispanic': 5}, None),
    #Age 18 to 24 = 0, Age 25 to 34 = 1, Age 35 to 44 = 2, Age 45 to 54 = 3, Age 55 to 64 = 4, Age 65 or older = 5
    ('_IMPAGE', {'Age 18 to 24': 0, 'Age 25 to 34': 1, 'Age 35 to 44': 2,
                 'Age 45 to 54': 3, 'Age 55 to 64': 4, 'Age 65 or older': 5}, None),
    #No = 0, Yes = 1, borderline = 2, preg = 3, idk = 4, Refused = 5; unmatched -> 5
    ('BPHIGH4', {'No': 0, 'Yes': 1,
                 'Told borderline high or pre-hypertensive': 2,
                 'Yes, but female told only during pregnancy': 3,
                 "Don't know/Not Sure": 4, 'Refused': 5}, 5),
    #None = 0, 1 to 5 = 1, 6 or more but not all = 2, All = 3, idk = 4, Refused = 5; unmatched -> 5
    #(the table previously held education labels instead of 'All' and don't-know)
    ('RMVTETH4', {'None': 0, '1 to 5': 1, '6 or more, but not all': 2, 'All': 3,
                  "Don't know/Not sure": 4, 'Refused': 5}, 5),
    #none = 0, elementary = 1, some high school = 2, high school = 3, some college = 4, college graduate = 5
    ('_IMPEDUC', {'Never attended school or only kindergarten': 0,
                  'Grades 1 through 8 (Elementary)': 1,
                  'Grades 9 through 11 (Some high school)': 2,
                  'Grade 12 or GED (High school graduate)': 3,
                  'College 1 year to 3 years (Some college or technical school)': 4,
                  'College 4 years or more (College graduate)': 5}, None),
    #< $15,000 = 0, $15,000 < $25,000 = 1, $25,000 < $35,000 = 2, $35,000 < $50,000 = 3, $50,000 or more = 4, idk/missing = 5
    ('_INCOMG', {'Less than $15,000': 0,
                 '$15,000 to less than $25,000': 1,
                 '$25,000 to less than $35,000': 2,
                 '$35,000 to less than $50,000': 3,
                 '$50,000 or more': 4,
                 "Don't know/Not sure/Missing": 5}, None),

    #7 categories
    ('CHECKUP1', LAST_VISIT, None),
    #NOTE(review): LASTDEN4 was reported to leave 6274 rows unmatched with this table, which
    #suggests its labels differ from CHECKUP1's - check the warning output and extend the
    #table; until then unmatched rows fall back to 6 == Refused
    ('LASTDEN4', LAST_VISIT, 6),

    #9 categories
    #homemaker = 0, student = 1, Employed for wages = 2, Self-employed = 3, Unable to work = 4,
    #Out of work < 1 year = 5, Out of work >= 1 year = 6, Retired = 7, Refused = 8
    ('EMPLOY1', {'A homemaker': 0, 'A student': 1, 'Employed for wages': 2,
                 'Self-employed': 3, 'Unable to work': 4,
                 'Out of work for less than 1 year': 5,
                 'Out of work for 1 year or more': 6,
                 'Retired': 7, 'Refused': 8}, None),
]

for column, mapping, fallback in ENCODINGS:
    numeric_df[column] = encode_labels(numeric_df[column], mapping, fallback)

#preview the encoded frame
numeric_df.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

# Target is the encoded CVDCRHD4 column; features are all remaining encoded columns
y = numeric_df['CVDCRHD4']
X = numeric_df.drop(columns = 'CVDCRHD4')

# Hold out 20% of the rows as the test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

In [None]:
#Logistic Regression
RANDOM_SEED = 694

# Fit on the training split, then predict the held-out split
clf_lr = LogisticRegression(random_state = RANDOM_SEED)
clf_lr.fit(X_train, y_train)
y_pred = clf_lr.predict(X_test)

# Accuracy plus macro-averaged precision / recall / F1 on the test split
lr_accuracy = accuracy_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred, average = 'macro')
lr_recall = recall_score(y_test, y_pred, average = 'macro')
lr_f1 = f1_score(y_test, y_pred, average = 'macro')

for prefix, score in (
    ("Accuracy Score = ", lr_accuracy),
    ("Precision Score = ", lr_precision),
    ("Recall Score = ", lr_recall),
    ("F1 Score = ", lr_f1),
):
    print(prefix + str(score))


In [None]:
#K NEAREST NEIGHBOR
# Default-parameter regressor instance; it is not fitted or used further in this cell
knn_clf = KNeighborsRegressor()

# 83-neighbour regressor fitted on the training split
knnreg = KNeighborsRegressor(n_neighbors = 83)
knnreg.fit(X_train, y_train)

# NOTE(review): this is a regressor scored on the 0/1 target, so the value shown is what
# the estimator's score() returns rather than the accuracy/precision/recall/F1 reported
# for the other models - confirm this is intended.
r2 = knnreg.score(X_test, y_test)
r2


In [None]:
#RANDOM FOREST
# Fit on the training split with the shared seed, then predict the held-out split
random_forest = RandomForestClassifier(random_state = RANDOM_SEED)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Accuracy plus macro-averaged precision / recall / F1 on the test split
rf_accuracy = accuracy_score(y_test, y_pred)
rf_precision = precision_score(y_test, y_pred, average = 'macro')
rf_recall = recall_score(y_test, y_pred, average = 'macro')
rf_f1 = f1_score(y_test, y_pred, average = 'macro')

for prefix, score in (
    ("Accuracy Score = ", rf_accuracy),
    ("Precision Score = ", rf_precision),
    ("Recall Score = ", rf_recall),
    ("F1 Score = ", rf_f1),
):
    print(prefix + str(score))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

#POLYNOMIAL REGRESSION
# Degrees originally listed as candidates; only POLY_DEGREE is fitted in this cell.
degs = (1, 3, 7, 11)
POLY_DEGREE = 3

X = numeric_df.loc[:, numeric_df.columns != 'CVDCRHD4']
y = numeric_df['CVDCRHD4']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

# Learn the polynomial expansion on the training split only, then apply the same
# expansion to the test split, so no test rows take part in fitting.
poly = PolynomialFeatures(degree = POLY_DEGREE)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit the linear model on the expanded features (fitting on the raw X_train would
# make this a plain linear regression and leave the expansion unused).
linreg = LinearRegression().fit(X_train_poly, y_train)
r2_train = linreg.score(X_train_poly, y_train)
r2_test = linreg.score(X_test_poly, y_test)

print("Train R^2 = " + str(r2_train))
print("Test R^2 = " + str(r2_test))
