# Project: Heart Faliure Classifiction

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score as mi
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [23]:
df=pd.read_csv("D:\Applied Data Science & AI\Project\Heart Faliure Prediction\heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [24]:
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [25]:
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [26]:
df.columns= df.columns.str.lower().str.replace(' ','_')

In [27]:
categorical_column=list(df.dtypes[df.dtypes=='object'].index)
for c in categorical_column:
    df[c]=df[c].str.lower().str.replace(' ','_')

In [28]:
df.dtypes

age                 int64
sex                object
chestpaintype      object
restingbp           int64
cholesterol         int64
fastingbs           int64
restingecg         object
maxhr               int64
exerciseangina     object
oldpeak           float64
st_slope           object
heartdisease        int64
dtype: object

In [29]:
df.isnull().sum()

age               0
sex               0
chestpaintype     0
restingbp         0
cholesterol       0
fastingbs         0
restingecg        0
maxhr             0
exerciseangina    0
oldpeak           0
st_slope          0
heartdisease      0
dtype: int64

## Splitting Data into Train and Test set with 60-20-20 ratio

In [31]:
df_full_train,df_test= train_test_split(df,test_size=0.2,random_state=1)
df_train,df_val= train_test_split(df_full_train,test_size=0.25,random_state=1)

In [32]:
len(df_train),len(df_val),len(df_test)

(550, 184, 184)

In [33]:
df_train=df_train.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_full_train=df_full_train.reset_index(drop=True)

In [34]:
y_train=df_train.heartdisease.values
y_val=df_val.heartdisease.values
y_test=df_test.heartdisease.values
y_full_train=df_full_train.heartdisease.values

In [35]:
del df_train['heartdisease']
del df_val['heartdisease']
del df_test['heartdisease']

## EDA

In [37]:
df_full_train.isnull().sum()

age               0
sex               0
chestpaintype     0
restingbp         0
cholesterol       0
fastingbs         0
restingecg        0
maxhr             0
exerciseangina    0
oldpeak           0
st_slope          0
heartdisease      0
dtype: int64

In [38]:
df_full_train.heartdisease.value_counts(normalize=True)

1    0.542234
0    0.457766
Name: heartdisease, dtype: float64

In [39]:
global_heartD_rate=df.heartdisease.mean()
round(global_heartD_rate,2)

0.55

In [40]:
df.dtypes

age                 int64
sex                object
chestpaintype      object
restingbp           int64
cholesterol         int64
fastingbs           int64
restingecg         object
maxhr               int64
exerciseangina     object
oldpeak           float64
st_slope           object
heartdisease        int64
dtype: object

In [41]:
numeric=['age','restingbp','cholesterol','fastingbs','maxhr','oldpeak']

In [43]:
categorical=[
     'sex', 'chestpaintype', 'restingecg', 'exerciseangina',  'st_slope'
]

In [44]:
df_full_train[categorical].nunique()

sex               2
chestpaintype     4
restingecg        3
exerciseangina    2
st_slope          3
dtype: int64

In [48]:
def Mutual_information(series):
    return mi(series,df_full_train.heartdisease)

In [53]:
m= df_full_train[categorical].apply(Mutual_information)
m.sort_values(ascending=False)

st_slope          0.184472
chestpaintype     0.151883
exerciseangina    0.123210
sex               0.046335
restingecg        0.006789
dtype: float64

## One Hot Encodding

In [56]:
train_dict= df_train[categorical+numeric].to_dict(orient='records')

In [58]:
dv=DictVectorizer(sparse=False)

In [59]:
X_train=dv.fit_transform(train_dict)

In [62]:
features_name=list(dv.get_feature_names_out())
features_name

['age',
 'chestpaintype=asy',
 'chestpaintype=ata',
 'chestpaintype=nap',
 'chestpaintype=ta',
 'cholesterol',
 'exerciseangina=n',
 'exerciseangina=y',
 'fastingbs',
 'maxhr',
 'oldpeak',
 'restingbp',
 'restingecg=lvh',
 'restingecg=normal',
 'restingecg=st',
 'sex=f',
 'sex=m',
 'st_slope=down',
 'st_slope=flat',
 'st_slope=up']

In [63]:
val_dict=df_val[categorical+numeric].to_dict(orient='records')

In [64]:
X_val=dv.transform(val_dict)

In [65]:
del df_full_train['heartdisease']

In [66]:
full_train_dict= df_full_train[categorical+numeric].to_dict(orient='records')
X_full_train=dv.transform(full_train_dict)

In [77]:
dict_full_test=df_test[categorical+numeric].to_dict(orient='records')
X_test=dv.transform(dict_full_test)

In [75]:
model=LogisticRegression()
model.fit(X_train,y_train)
y_pred=model.predict_proba(X_val)[:,1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [76]:
score=roc_auc_score(y_val,y_pred)
score

0.9091124822021832

In [80]:
model.fit(X_full_train,y_full_train)
y_final=model.predict_proba(X_test)[:,1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
score_final=roc_auc_score(y_test,y_final)
score_final

0.954791154791155