# Clean

In [1]:
import pandas as pd
import numpy as np
from numpy import asarray
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('heart_2020_cleaned.csv')
original_df = df

In [3]:
encoder = OneHotEncoder(handle_unknown='ignore')

In [4]:
def one_hot_encode(x:np.ndarray):
    uni = np.unique(x)
    print(uni)
    result = np.zeros((len(x), len(uni)))
    for i in range(len(x)):
        for j in range(len(uni)):
            if x[i] == uni[j]:
                result[i][j] = 1;
                
    return result

In [5]:
sex_df = pd.DataFrame(encoder.fit_transform(df[['Sex']]).toarray())
sex_name = ["Female", "Male"]
sex_df.columns = sex_name

In [6]:
race_arr = pd.DataFrame(df['Race']).to_numpy()
one_hot_race = one_hot_encode(race_arr)
race_df = pd.DataFrame(one_hot_race, columns = ['American Indian/Alaskan Native', 'Asian', 'Black', 'Hispanic', 'Other', 'White'])

['American Indian/Alaskan Native' 'Asian' 'Black' 'Hispanic' 'Other'
 'White']


In [7]:
diabetic_arr = pd.DataFrame(df['Diabetic']).to_numpy()
one_hot_diabetic = one_hot_encode(diabetic_arr)
diabetic_df = pd.DataFrame(one_hot_diabetic, columns = ['Diabetic(No)', 'Diabetic(No, borderline diabetes)', 'Diabetic(Yes)', 'Diabetic(Yes, during pregnancy)'])

['No' 'No, borderline diabetes' 'Yes' 'Yes (during pregnancy)']


In [8]:
gen_health_arr = pd.DataFrame(df['GenHealth']).to_numpy()
one_hot_gen_health = one_hot_encode(gen_health_arr)
gen_health_df = pd.DataFrame(one_hot_gen_health, columns = ['GenHealth(Excellent)', 'GenHealth(Fair)', 'GenHealth(Good)', 'GenHealth(Poor)', 'GenHealth(Very good)'])

['Excellent' 'Fair' 'Good' 'Poor' 'Very good']


In [9]:
df = df.drop(['Sex', 'Race', 'GenHealth', 'Diabetic', 'AgeCategory', 'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'], axis=1)

In [10]:
df['Smoking'] = df['Smoking'].map(
                   {'Yes':1 ,'No':0})
df['HeartDisease'] = df['HeartDisease'].map(
                   {'Yes':1 ,'No':0})
df['AlcoholDrinking'] = df['AlcoholDrinking'].map(
                   {'Yes':1 ,'No':0})
df['Stroke'] = df['Stroke'].map(
                   {'Yes':1 ,'No':0})
df['DiffWalking'] = df['DiffWalking'].map(
                   {'Yes':1 ,'No':0})
df['PhysicalActivity'] = df['PhysicalActivity'].map(
                   {'Yes':1 ,'No':0})
df['Asthma'] = df['Asthma'].map(
                   {'Yes':1 ,'No':0})
df['KidneyDisease'] = df['KidneyDisease'].map(
                   {'Yes':1 ,'No':0})
df['SkinCancer'] = df['SkinCancer'].map(
                   {'Yes':1 ,'No':0})

In [11]:
df = pd.concat([df, sex_df], axis=1, join='inner')
df = pd.concat([df, race_df], axis=1, join='inner')
df = pd.concat([df, diabetic_df], axis=1, join='inner')
df = pd.concat([df, gen_health_df], axis=1, join='inner')

In [12]:
original_df['AgeCategory'] = original_df['AgeCategory'].replace(['80 or older'],'80-')
original_df['AgeCategory'] = original_df['AgeCategory'].str.split('-').str[0].astype('float64')
original_df = original_df.drop(['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'Race', 
                                'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'], axis=1)
df = pd.concat([df, original_df], axis=1, join='inner')

In [21]:
df_test = df.iloc[-100000:,:]
df = df.iloc[:-100000,:]

# PCA

In [22]:
from sklearn.decomposition import PCA

In [23]:
df_norm=(df - df.mean()) / df.std()
df_test_norm=(df_test - df_test.mean()) / df_test.std()
pca = PCA(n_components=df.shape[1])
pca.fit(df_norm)
pca_test = PCA(n_components=df_test.shape[1])
pca_test.fit(df_norm)

PCA(n_components=31)

In [24]:
df_pca = pd.DataFrame(pca.transform(df_norm), columns=df_norm.columns, index=df_norm.index)
df_test_pca = pd.DataFrame(pca_test.transform(df_test_norm), columns=df_test_norm.columns, index=df_test_norm.index)

# SVM

In [25]:
from cuml.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [26]:
X_train = df_pca.drop(columns = "HeartDisease")
y_train = df["HeartDisease"].astype(int)
X_test = df_test_pca.drop(columns = "HeartDisease")
y_test = df_test["HeartDisease"].astype(int)

In [27]:
%%time
svc = SVC()
svc.fit(X_train, y_train)

CPU times: user 1.21 s, sys: 10 ms, total: 1.22 s
Wall time: 1.26 s


SVC()

In [28]:
y_pred = svc.predict(X_test)
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.999203187250996
0.99987
