In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve
from sklearn import metrics
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Fetch Data:

In [27]:
df = pd.read_csv('https://raw.githubusercontent.com/stay-fcsd/data-final-project/main/heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# Hot Encoding:
Convert categorical features into dummy columns in the feature table with a binary value for each of the possible categorical values

In [28]:
hot_encoded_df = df.copy()

gender_column = df[["Sex"]]

hot_encoded_df['Male'] = gender_column
hot_encoded_df['Female'] = gender_column

hot_encoded_df = hot_encoded_df.drop('Sex', 1)

chest_pain_type_column = df[["ChestPainType"]]
hot_encoded_df['ATA'] = chest_pain_type_column
hot_encoded_df['TA'] = chest_pain_type_column
hot_encoded_df['ASY'] = chest_pain_type_column
hot_encoded_df['NAP'] = chest_pain_type_column
hot_encoded_df = hot_encoded_df.drop('ChestPainType', 1)

resting_ECG_column = df[["RestingECG"]]
hot_encoded_df['Normal'] = resting_ECG_column
hot_encoded_df['ST'] = resting_ECG_column
hot_encoded_df['LVH'] = resting_ECG_column
hot_encoded_df = hot_encoded_df.drop('RestingECG', 1)

ST_slope_column = df[["ST_Slope"]]

hot_encoded_df['Up'] = ST_slope_column
hot_encoded_df['Flat'] = ST_slope_column
hot_encoded_df['Down'] = ST_slope_column

hot_encoded_df = hot_encoded_df.drop('ST_Slope', 1)

hot_encoded_df['Male'] = hot_encoded_df['Male'].replace(['M'], '1')
hot_encoded_df['Male'] = hot_encoded_df['Male'].replace(['F'], '0')

hot_encoded_df['Female'] = hot_encoded_df['Female'].replace(['M'], '0')
hot_encoded_df['Female'] = hot_encoded_df['Female'].replace(['F'], '1')

hot_encoded_df['ATA'] = hot_encoded_df['ATA'].replace(['ATA'], '1')
hot_encoded_df['ATA'] = hot_encoded_df['ATA'].replace(['TA'], '0')
hot_encoded_df['ATA'] = hot_encoded_df['ATA'].replace(['ASY'], '0')
hot_encoded_df['ATA'] = hot_encoded_df['ATA'].replace(['NAP'], '0')

hot_encoded_df['TA'] = hot_encoded_df['TA'].replace(['ATA'], '0')
hot_encoded_df['TA'] = hot_encoded_df['TA'].replace(['TA'], '1')
hot_encoded_df['TA'] = hot_encoded_df['TA'].replace(['ASY'], '0')
hot_encoded_df['TA'] = hot_encoded_df['TA'].replace(['NAP'], '0')

hot_encoded_df['ASY'] = hot_encoded_df['ASY'].replace(['ATA'], '0')
hot_encoded_df['ASY'] = hot_encoded_df['ASY'].replace(['TA'], '0')
hot_encoded_df['ASY'] = hot_encoded_df['ASY'].replace(['ASY'], '1')
hot_encoded_df['ASY'] = hot_encoded_df['ASY'].replace(['NAP'], '0')

hot_encoded_df['NAP'] = hot_encoded_df['NAP'].replace(['ATA'], '0')
hot_encoded_df['NAP'] = hot_encoded_df['NAP'].replace(['TA'], '0')
hot_encoded_df['NAP'] = hot_encoded_df['NAP'].replace(['ASY'], '0')
hot_encoded_df['NAP'] = hot_encoded_df['NAP'].replace(['NAP'], '1')

hot_encoded_df['Normal'] = hot_encoded_df['Normal'].replace(['Normal'], '1')
hot_encoded_df['Normal'] = hot_encoded_df['Normal'].replace(['ST'], '0')
hot_encoded_df['Normal'] = hot_encoded_df['Normal'].replace(['LVH'], '0')

hot_encoded_df['ST'] = hot_encoded_df['ST'].replace(['Normal'], '0')
hot_encoded_df['ST'] = hot_encoded_df['ST'].replace(['ST'], '1')
hot_encoded_df['ST'] = hot_encoded_df['ST'].replace(['LVH'], '0')

hot_encoded_df['LVH'] = hot_encoded_df['LVH'].replace(['Normal'], '0')
hot_encoded_df['LVH'] = hot_encoded_df['LVH'].replace(['ST'], '0')
hot_encoded_df['LVH'] = hot_encoded_df['LVH'].replace(['LVH'], '1')

hot_encoded_df['Up'] = hot_encoded_df['Up'].replace(['Up'], '1')
hot_encoded_df['Up'] = hot_encoded_df['Up'].replace(['Flat'], '0')
hot_encoded_df['Up'] = hot_encoded_df['Up'].replace(['Down'], '0')

hot_encoded_df['Flat'] = hot_encoded_df['Flat'].replace(['Up'], '0')
hot_encoded_df['Flat'] = hot_encoded_df['Flat'].replace(['Flat'], '1')
hot_encoded_df['Flat'] = hot_encoded_df['Flat'].replace(['Down'], '0')

hot_encoded_df['Down'] = hot_encoded_df['Down'].replace(['Up'], '0')
hot_encoded_df['Down'] = hot_encoded_df['Down'].replace(['Flat'], '0')
hot_encoded_df['Down'] = hot_encoded_df['Down'].replace(['Down'], '1')

hot_encoded_df['ExerciseAngina'] = hot_encoded_df['ExerciseAngina'].replace(['Y'], '1')
hot_encoded_df['ExerciseAngina'] = hot_encoded_df['ExerciseAngina'].replace(['N'], '0')

heart_disease_column = hot_encoded_df.pop('HeartDisease')
hot_encoded_df.insert(19, 'HeartDisease', heart_disease_column);

hot_encoded_df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,Male,Female,ATA,TA,ASY,NAP,Normal,ST,LVH,Up,Flat,Down,HeartDisease
0,40,140,289,0,172,0,0.0,1,0,1,0,0,0,1,0,0,1,0,0,0
1,49,160,180,0,156,0,1.0,0,1,0,0,0,1,1,0,0,0,1,0,1
2,37,130,283,0,98,0,0.0,1,0,1,0,0,0,0,1,0,1,0,0,0
3,48,138,214,0,108,1,1.5,0,1,0,0,1,0,1,0,0,0,1,0,1
4,54,150,195,0,122,0,0.0,1,0,0,0,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,0,1.2,1,0,0,1,0,0,1,0,0,0,1,0,1
914,68,144,193,1,141,0,3.4,1,0,0,0,1,0,1,0,0,0,1,0,1
915,57,130,131,0,115,1,1.2,1,0,0,0,1,0,1,0,0,0,1,0,1
916,57,130,236,0,174,0,0.0,0,1,1,0,0,0,0,0,1,0,1,0,1


# Scale Data:

In [29]:
feature_cols = ['Age', 'Male', 'Female', 'ATA', 'ASY', 'TA', 'NAP', 'RestingBP', 'Cholesterol', 'FastingBS', 'Normal', 'ST','LVH', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'Up','Flat','Down']

X = hot_encoded_df[feature_cols]

scaler = StandardScaler()
scaler.fit(X.values)
heart_disease_df_scale = scaler.transform(X.values)
df_scale = pd.DataFrame(heart_disease_df_scale, columns=X.columns)
df_scale.insert(19, 'HeartDisease', heart_disease_column);

# Linear Regression

In [30]:

X = df_scale[feature_cols]
y = df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
df_scale.head()

Unnamed: 0,Age,Male,Female,ATA,ASY,TA,NAP,RestingBP,Cholesterol,FastingBS,Normal,ST,LVH,MaxHR,ExerciseAngina,Oldpeak,Up,Flat,Down,HeartDisease
0,-1.43314,0.515952,-0.515952,2.075177,-1.084138,-0.229679,-0.532838,0.410909,0.82507,-0.551341,0.814275,-0.490449,-0.507478,1.382928,-0.823556,-0.832432,1.150674,-1.002181,-0.271448,0
1,-0.478484,-1.938163,1.938163,-0.481887,-1.084138,-0.229679,1.876744,1.491752,-0.171961,-0.551341,0.814275,-0.490449,-0.507478,0.754157,-0.823556,0.105664,-0.869056,0.997824,-0.271448,1
2,-1.751359,0.515952,-0.515952,2.075177,-1.084138,-0.229679,-0.532838,-0.129513,0.770188,-0.551341,-1.228087,2.038947,-0.507478,-1.525138,-0.823556,-0.832432,1.150674,-1.002181,-0.271448,0
3,-0.584556,-1.938163,1.938163,-0.481887,0.922392,-0.229679,-0.532838,0.302825,0.13904,-0.551341,0.814275,-0.490449,-0.507478,-1.132156,1.214246,0.574711,-0.869056,0.997824,-0.271448,1
4,0.051881,0.515952,-0.515952,-0.481887,-1.084138,-0.229679,1.876744,0.951331,-0.034755,-0.551341,0.814275,-0.490449,-0.507478,-0.581981,-0.823556,-0.832432,1.150674,-1.002181,-0.271448,0


In [31]:
feature_cols = ['Age','RestingBP','Cholesterol','FastingBS',
                'MaxHR','Oldpeak']
X = df[feature_cols]
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=3)
#creating new linear regression model
my_lin_reg = LinearRegression()
my_lin_reg.fit(X_train,y_train)

y_pred = my_lin_reg.predict(X_test)
print(y_pred)

[0.52223295 0.93414912 0.92231951 0.74827795 0.26635152 0.75857625
 1.18382503 0.63538766 0.18445595 0.45264088 0.50601304 0.35525527
 0.10782969 0.5161639  0.97510394 0.25383619 0.79251722 0.80090786
 0.61481136 0.4824058  0.46458135 0.70487153 1.23163613 0.47727843
 0.26259156 0.46916753 0.76149321 0.54559102 0.1315926  0.86485681
 1.00224239 0.46894389 0.84504013 0.30117328 0.31799957 0.22399485
 1.14595847 0.38037276 0.44279552 1.04847724 0.78321432 0.71934902
 0.25869139 0.74698929 0.29736602 0.88567021 0.6674022  0.3298803
 0.16541046 0.48217726 0.58644109 0.8268284  0.82081386 0.16519421
 0.5592906  0.64527229 0.29622488 0.20122643 0.69851636 0.59103069
 0.15193597 0.82635152 0.2778182  0.77165752 0.68837069 0.1458761
 0.47290406 1.03364261 0.34723298 0.68134066 0.29424482 0.28656035
 0.3374782  0.60463228 0.60316057 0.29940117 0.8257975  0.70461296
 0.20808463 0.41093733 1.13531513 0.53901094 0.88107528 0.62343803
 0.24921071 0.68869007 0.45323858 0.7142511  0.31848001 0.416118

In [32]:
mse = metrics.mean_squared_error(y_test, y_pred)
mse

0.1512107862645914

In [33]:
#calculating rmse(accuracy score)
rmse = np.sqrt(mse)
print('Below is our accuracy score before applying One Hot Coding:')
print(rmse)
print('Unlike other methods we use RMSE as our accuracy score. The lower the number for RMSE the better.')

Below is our accuracy score before applying One Hot Coding:
0.3888583112967902
Unlike other methods we use RMSE as our accuracy score. The lower the number for RMSE the better.


Now this will be our rmse after one hot encoding:

In [34]:
feature_cols = ['Age', 'Male', 'Female', 'ATA', 'ASY', 'TA', 'NAP', 'RestingBP', 'Cholesterol', 'FastingBS', 
                'Normal', 'ST','LVH', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'Up','Flat','Down']
X = hot_encoded_df[feature_cols]
y = hot_encoded_df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=3)

linreg = LinearRegression()
linreg.fit(X_train,y_train)

y_prediction = linreg.predict(X_test)

print(y_prediction)

[ 0.8455563   0.85465754  0.85750671  0.94717267  0.14063674  1.02173233
  1.15038925  0.69823008  0.09286257  0.3087418   0.25374795  0.20564453
 -0.11040038  0.68307353  1.26395215 -0.08292784  0.98713524  0.91470444
  0.87630181  0.67072848  0.63408449  0.83934509  0.90500798  0.78728955
  0.32814957  0.85556589  0.87254762  0.29486629  0.0491408   1.11835697
  1.01848707  0.48291306  0.97342761 -0.08240757  0.49277294 -0.19765651
  1.10685352  0.06881551  0.83999787  0.51560441  0.95040042  1.04036908
  0.09841389  0.33575941 -0.05168825  0.83484311  0.69309591  0.35614916
  0.40930372  0.88769075  0.89020022  0.7847952   0.76106106 -0.08525512
  0.60162681  0.9393379  -0.05629599  0.2878524   0.9907888   0.52325966
 -0.08909645  0.72547662  0.10232384  1.00237206  0.57322956 -0.07856279
  0.14054839  0.87395252 -0.04863982  0.63038922  0.41308115 -0.09716582
 -0.08280713  0.88380742  0.443389   -0.13385028  1.0292324   0.90594329
  0.34318611  0.12022239  1.09200536  0.61415246  0

In [35]:
#calculating mse
OHC_mse = metrics.mean_squared_error(y_test, y_prediction)
OHC_mse

0.0878453087965088

In [36]:
#Calculating rmse
OHC_rmse = np.sqrt(mse)
print('RMSE for data frame with One Hot Coding:')
print(OHC_rmse)

RMSE for data frame with One Hot Coding:
0.3888583112967902


rmse score before and after One Hot Coding are the same.

In [37]:
from sklearn.model_selection import cross_val_score
#applying 10-fold cross validation for Linear Regression
mse_list = cross_val_score(linreg, X, y, cv=10, scoring='neg_mean_squared_error')
print(mse_list)

[-0.09431385 -0.07799082 -0.06186068 -0.12878187 -0.0914898  -0.08835049
 -0.13872705 -0.17757666 -0.1532994  -0.13715838]


In [38]:
#converting our current negative mse_list to a positive one
mse_list_positive = -mse_list

# using numpy sqrt function to calculate rmse:
rmse_list = np.sqrt(mse_list_positive)
print(rmse_list)

[0.3071056  0.27926836 0.24871808 0.35886191 0.30247281 0.2972381
 0.37246081 0.42139846 0.39153468 0.370349  ]


In [39]:
#getting the average RMSE as final result
print(rmse_list.mean())

0.3349407800046112
