In [None]:
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading the Dataset

In [None]:
# Load the heart-failure clinical records CSV from the Kaggle input mount into `df`,
# the raw frame every later cell reads from.
df = pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

In [None]:
# Preview the first few rows to confirm the file loaded with the expected columns.
df.head()

In [None]:
# Summary statistics for the numeric columns, as a first look at scale and spread.
df.describe()

In [None]:
# Column dtypes and non-null counts, to spot missing values or mis-typed columns.

df.info()

In [None]:
# Percentage of missing values per column (mean of the boolean NaN mask, times 100).
df.isna().mean() * 100 

***We do not see any null values in the dataset***

***We can start with the EDA***

# EDA

In [None]:
# Plotting stack for the EDA section: matplotlib for figures, seaborn for statistical charts.
import matplotlib.pyplot as plt
import seaborn as sns
# Render figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation notices from the plotting libraries; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# List the column names; the numeric and categorical column lists used below are picked from these.
df.columns

In [None]:
# Continuous clinical measurements; this list is reused by the later plotting cells.
num_col = ['age', 'creatinine_phosphokinase',
       'ejection_fraction', 'platelets',
       'serum_creatinine', 'serum_sodium', 'time']

# One histogram per numeric column on a 2x4 grid (seven panels used).
fig = plt.figure(figsize=(20, 10))
for j, i in enumerate(num_col, start=1):
    plt.subplot(2, 4, j)
    # Histogram the measurements themselves. Going through value_counts()
    # would instead bin how often each distinct value occurs, which is not
    # the distribution of the variable.
    df[i].plot(kind='hist')
    plt.title(i)

plt.tight_layout()
plt.show()

In [None]:
# Density-scaled histogram with a KDE overlay for every numeric column.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement that draws the same kind of chart.
fig = plt.figure(figsize=(20, 10))
for j, i in enumerate(num_col, start=1):
    plt.subplot(2, 4, j)
    sns.histplot(df[i], kde=True, stat='density')
    plt.title(i)

plt.tight_layout()
plt.show()

All the numerical fields are skewed

In [None]:
# Violin plot for every numeric column on a 2x4 grid.
fig = plt.figure(figsize=(20, 10))
for j, i in enumerate(num_col, start=1):
    plt.subplot(2, 4, j)
    # Pass the series by keyword: the meaning of the first positional
    # argument differs between seaborn releases, which changes the orientation.
    sns.violinplot(x=df[i])
    plt.title(i)

plt.tight_layout()
plt.show()

***Checking the Categorical Columns Data***

In [None]:
# Binary (0/1) columns, including the target; this list is reused by the next plotting cell.
cat_col = [ 'anaemia', 'diabetes','high_blood_pressure', 'sex', 'smoking','DEATH_EVENT']

# One pie chart per categorical column showing the percentage share of each class.
fig = plt.figure(figsize=(15, 10))
for j, i in enumerate(cat_col, start=1):
    plt.subplot(2, 3, j)
    df[i].value_counts().plot(kind='pie', autopct='%.2f')
    # Title each panel with its column so the six pies can be told apart,
    # and drop the automatic y-label, which adds nothing on a pie chart.
    plt.title(i)
    plt.ylabel('')

plt.tight_layout()
plt.show()

1. Sex - Male- 64.88 , Female- 35.12
1. Diabetes - No = 58.19, Yes = 41.81
1. Anaemia - No = 56.86 , Yes = 43.14
1. High_blood_pressure - No = 64.88, Yes = 35.12
1. Smoking - No = 67.89, Yes = 32.11
1. DEATH_EVENT - No = 67.89, Yes = 32.11

In [None]:
# Class counts for each categorical column, laid out on a 2x3 grid.
fig = plt.figure(figsize=(15, 10))
for panel, column in enumerate(cat_col, start=1):
    plt.subplot(2, 3, panel)
    sns.countplot(data=df, x=column)

plt.tight_layout()
plt.show()

We can assume here that 
1. Sex --> Gender of the patient --> 1 = Male , 0 = Female
2. Diabetes --> 0 = No , 1 = Yes
3. Anaemia --> 0 = No , 1 = Yes
4. High_blood_pressure --> 0 = No , 1 = Yes
5. Smoking --> 0 = No , 1 = Yes
6. Death_Event --> 0 = No , 1 = Yes

***Analyzing the event of death for diabetes and anaemia wrt age***

In [None]:
# Mean age per outcome, split first by diabetes and then by anaemia (top row of a 2x2 grid).
fig = plt.figure(figsize=(15, 10))
hue_panels = [('diabetes', "Age vs DEATH_EVENT (Diabetes)"),
              ('anaemia', "Age vs DEATH_EVENT (Anaemia)")]
for position, (hue_column, panel_title) in enumerate(hue_panels, start=1):
    plt.subplot(2, 2, position)
    sns.barplot(data=df, x='DEATH_EVENT', y='age', hue=hue_column)
    plt.title(panel_title)
plt.show()

The chances of death increase for a person who is old and has diabetes and anaemia.


In [None]:
# Mean age per outcome, coloured by diabetes, with one facet per high_blood_pressure value.
sns.catplot(
    data=df,
    x='DEATH_EVENT',
    y='age',
    hue='diabetes',
    col='high_blood_pressure',
    kind='bar',
)

Old people with high blood pressure and diabetes are likely to die earlier.

In [None]:
fig = plt.figure(figsize=(15, 10))

# Left panel: mean creatinine_phosphokinase per outcome, split by high blood pressure.
plt.subplot(2, 2, 1)
sns.barplot(data=df, x='DEATH_EVENT', y='creatinine_phosphokinase', hue='high_blood_pressure')
plt.title("Creatinine Phosphokinase vs DEATH_EVENT (high_blood_pressure)")

# Right panel: same measure restricted to patients with DEATH_EVENT == 1, grouped by anaemia.
plt.subplot(2, 2, 2)
data = df.loc[df['DEATH_EVENT'] == 1]
sns.barplot(data=data, x='anaemia', y='creatinine_phosphokinase', hue='high_blood_pressure')
plt.title("Creatinine Phosphokinase vs Anaemia(high_blood_pressure)")

plt.show()

People who are suffering from high_blood_pressure and anaemia have low creatinine_phosphokinase, so it is important to have a higher creatinine_phosphokinase level.

In [None]:
# Among patients with DEATH_EVENT == 1: mean creatinine_phosphokinase by sex, split by smoking.
data = df.loc[df['DEATH_EVENT'] == 1]
sns.barplot(data=data, x='sex', y='creatinine_phosphokinase', hue='smoking')
plt.show()

Deaths for male smokers > female smokers; creatinine_phosphokinase is also higher in males.

In [None]:
# Mean of every numeric column per sex. Seven panels fit a 2x4 grid, matching
# the other numeric-column figures; a 3x4 grid would leave a whole row empty.
fig = plt.figure(figsize=(20, 10))
for j, i in enumerate(num_col, start=1):
    plt.subplot(2, 4, j)
    sns.barplot(x='sex', y=i, data=df)
    plt.title(f'Bar Plot: {i} vs sex')

plt.tight_layout()
plt.show()

Men have a lower ejection_fraction than women.

The platelet count is also slightly higher in women.

In [None]:
# Cross-tabulate each condition against sex and draw the counts as a heatmap.
# The table is held in `counts`, not `map`, so the builtin map() is not
# shadowed for the rest of the notebook session.
fig = plt.figure(figsize=(40, 10))

sex_comparisons = ['diabetes', 'high_blood_pressure', 'smoking', 'anaemia']
for panel, condition in enumerate(sex_comparisons, start=1):
    plt.subplot(1, 4, panel)
    counts = pd.crosstab(df[condition], df['sex'])
    # Annotate each cell with its count so the panel is readable without the colour bar.
    sns.heatmap(counts, annot=True, fmt='d')
    plt.title(f'{condition} vs sex')

plt.show()

Men are more often diabetic than women.
Men also have high blood pressure more often than women.

In [None]:
# Age density for each outcome. kdeplot replaces the deprecated
# distplot(hist=False), and the curves are labelled so the legend says which is which.
fig = plt.figure(figsize=(20, 5))
sns.kdeplot(df[df['DEATH_EVENT'] == 0]['age'], color='green', label='DEATH_EVENT = 0')
sns.kdeplot(df[df['DEATH_EVENT'] == 1]['age'], color='red', label='DEATH_EVENT = 1')
plt.title('Age distribution by DEATH_EVENT')
plt.legend()
plt.show()

***If a person is above 70, then their chances of getting heart failure are high***

In [None]:
death = df[df['DEATH_EVENT'] == 1]
live = df[df['DEATH_EVENT'] == 0]

# Age density of smokers, split by outcome. Colours follow the convention of
# the overall age plot (green = DEATH_EVENT 0, red = DEATH_EVENT 1), and each
# curve is labelled for the legend.
fig = plt.figure(figsize=(20, 5))
sns.kdeplot(live[live['smoking'] == 1]['age'], color='green', label='Smokers, DEATH_EVENT = 0')
sns.kdeplot(death[death['smoking'] == 1]['age'], color='red', label='Smokers, DEATH_EVENT = 1')
plt.title('Age distribution of smokers by DEATH_EVENT')
plt.legend()

plt.show()

A smoker has a higher chance of heart failure than a non-smoker of the same age.

In [None]:
death = df[df['DEATH_EVENT'] == 1]
live = df[df['DEATH_EVENT'] == 0]

# Age density of diabetic patients, split by outcome. Colours follow the
# convention of the overall age plot (green = DEATH_EVENT 0, red = DEATH_EVENT 1),
# and each curve is labelled for the legend.
fig = plt.figure(figsize=(20, 5))
sns.kdeplot(live[live['diabetes'] == 1]['age'], color='green', label='Diabetic, DEATH_EVENT = 0')
sns.kdeplot(death[death['diabetes'] == 1]['age'], color='red', label='Diabetic, DEATH_EVENT = 1')
plt.title('Age distribution of diabetic patients by DEATH_EVENT')
plt.legend()

plt.show()

A similar result holds for diabetes: at a lower age, the chances of getting heart failure are high for a diabetic person.

In [None]:
data1 = df[df['DEATH_EVENT'] == 1]
data2 = df[df['DEATH_EVENT'] == 0]

# Age density of anaemic patients, split by outcome. Both curves filter on
# anaemia == 1 so that like is compared with like; selecting anaemia == 0 for
# the deceased group would set anaemic survivors against non-anaemic deaths.
fig = plt.figure(figsize = (20,5))
sns.kdeplot(data2[data2['anaemia'] == 1]['age'], color='green', label='Anaemic, DEATH_EVENT = 0')
sns.kdeplot(data1[data1['anaemia'] == 1]['age'], color='red', label='Anaemic, DEATH_EVENT = 1')
plt.title('Age distribution of anaemic patients by DEATH_EVENT')
plt.legend()
plt.show()

A person suffering from anaemia is at risk of fatality after crossing the age of 70.

# Feature Engineering

In [None]:
# Tukey fences for platelets: 1.5 * IQR below Q1 and above Q3.
platelets = df['platelets']
q1, q3 = platelets.quantile(0.25), platelets.quantile(0.75)
iqr = q3 - q1
mini, maxi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
# Show the observed maximum next to the upper fence.
(platelets.max(), maxi)

In [None]:
# Box plot of every numeric column in the raw frame, to locate outliers.
fig = plt.figure(figsize=(20, 10))
for j, i in enumerate(num_col, start=1):
    plt.subplot(2, 4, j)
    # Pass the series by keyword: the meaning of the first positional
    # argument differs between seaborn releases, which changes the orientation.
    sns.boxplot(x=df[i])
    plt.title(i)

plt.tight_layout()
plt.show()

***Handling Outliers***

In [None]:
# Columns whose values are capped at the IQR fences in the next cell.
outliers = ['creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium']

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df1 = df.copy()
for column in outliers:
    # Fences are computed from the raw column in `df`.
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    minimum = q1 - 1.5 * iqr
    maximum = q3 + 1.5 * iqr

    # Cap values outside [minimum, maximum] at the nearest fence; values inside
    # are kept as they are. The column is stored as float.
    # NOTE(review): the fences use every row, including rows that later land in
    # the test split — confirm this is acceptable for the evaluation.
    df1[column] = np.clip(df1[column].to_numpy(dtype=float), minimum, maximum)

In [None]:
# Box plot of every numeric column after capping, to check the outlier handling.
fig = plt.figure(figsize=(20, 10))
for j, i in enumerate(num_col, start=1):
    plt.subplot(2, 4, j)
    # Pass the series by keyword: the meaning of the first positional
    # argument differs between seaborn releases, which changes the orientation.
    sns.boxplot(x=df1[i])
    plt.title(i)

plt.tight_layout()
plt.show()

***Now all the outliers are handled***

In [None]:
# importing libraries
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [None]:
# Capped creatinine_phosphokinase column, used as input to the normality check
# and log transform below. `X` is rebound to the full feature frame further down.
X = df1['creatinine_phosphokinase']

In [None]:
# Q-Q plot of creatinine_phosphokinase against a normal distribution.
fig = plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
cpk_values = df1['creatinine_phosphokinase']
stats.probplot(cpk_values, dist="norm", plot=plt)
plt.show()

In [None]:
# Apply log1p (log(1 + x)) to the column through a FunctionTransformer.
# NOTE(review): despite the name, `X_train_transformed` holds the whole column —
# the train/test split only happens in a later cell.
trf = FunctionTransformer(func=np.log1p)
X_train_transformed = trf.fit_transform(X)

In [None]:
# Side-by-side Q-Q plots to compare normality before and after the log transform.
plt.figure(figsize=(14,4))

# Left: the column as it was before the transform.
plt.subplot(121)
stats.probplot(X, dist="norm", plot=plt)
plt.title('creatinine_phosphokinase Before Log')

# Right: the log1p-transformed column.
plt.subplot(122)
stats.probplot(X_train_transformed, dist="norm", plot=plt)
plt.title('creatinine_phosphokinase After Log')

plt.show()

In [None]:
# Overwrite the column in the working frame `df1` with its log1p-transformed
# values; the raw frame `df` is not modified.
df1['creatinine_phosphokinase']  = X_train_transformed

In [None]:
# Extracting X and y
# Features and target both come from the processed frame `df1`, so they are
# guaranteed to describe the same rows in the same order.
X = df1.drop(columns = 'DEATH_EVENT')
y = df1['DEATH_EVENT']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y — confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [None]:
# Feature scaling: learn mean and variance from the training rows only,
# then apply that same scaling to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Shape (rows, features) of the scaled training matrix.
x_train_scaled.shape

In [None]:
# Shape (rows, features) of the scaled test matrix.
x_test_scaled.shape

# Model Selection

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score,recall_score,precision_score


In [None]:
# Baseline: logistic regression with default settings on the scaled features.
lr = LogisticRegression()
lr = lr.fit(x_train_scaled, y_train)

# Predict the held-out rows and report plain accuracy as the cell output.
y_pred = lr.predict(x_test_scaled)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Second model: a decision tree. Fit and predict with the tree itself (`lr2`);
# calling `lr` here would just re-run the logistic regression from the previous cell.
# The fixed seed makes the fitted tree reproducible across runs.
lr2 = DecisionTreeClassifier(random_state=12)
lr2.fit(x_train_scaled,y_train)
y_pred = lr2.predict(x_test_scaled)

accuracy_score(y_test,y_pred)

In [None]:
# Metrics for the most recent predictions held in `y_pred`. Each value is
# labelled, and the per-class report (precision, recall, F1, support) replaces
# a second, identical precision print.
print("Accuracy :", accuracy_score(y_test,y_pred))
print("Recall   :", recall_score(y_test,y_pred))
print("Precision:", precision_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

The accuracy score comes out at around 83%.