In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records CSV from the Kaggle input directory
# into `data`, then preview the first rows.
data = pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
data.head()

In [None]:
# (rows, columns) of the raw table, before duplicates are dropped.
data.shape

In [None]:
# Discard exact duplicate rows (the first occurrence of each is kept, which is
# the pandas default), then re-check the table size.
data = data.drop_duplicates()
data.shape

### Correlation between all columns

In [None]:
# Compute the pairwise correlation matrix once and reuse it for both the
# heatmap values and the mask.
corr = data.corr()

# np.triu(corr) is non-zero on and above the diagonal, so those cells are
# masked and only the lower triangle is drawn (the matrix is symmetric).
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.1f', mask=np.triu(corr), ax=ax)
ax.set_title('Pairwise correlation (lower triangle)')

In [None]:
# Number of missing values in each column.
data.isna().sum()

### Feature distributions (histogram + KDE)

In [None]:
# Everything except the target column is treated as a feature.
feature = data.drop('DEATH_EVENT', axis=1)

# One panel per feature on a 4x3 grid: density-scaled histogram with a KDE
# overlay. `histplot` is the supported replacement for the deprecated
# `distplot`.
plt.figure(figsize=(15, 15))
for position, column in enumerate(feature.columns, start=1):
    ax = plt.subplot(4, 3, position)
    sns.histplot(x=feature[column], kde=True, stat='density', ax=ax)
plt.tight_layout()

### Visualization of Outliers

In [None]:
# Everything except the target column is treated as a feature.
feature = data.drop('DEATH_EVENT', axis=1)

# One horizontal boxplot per feature on a 4x3 grid. The series is passed as
# `x=` explicitly: a positional argument is read as `x` by older seaborn
# releases but as `data` by newer ones, which changes the plot orientation.
plt.figure(figsize=(15, 15))
for position, column in enumerate(feature.columns, start=1):
    ax = plt.subplot(4, 3, position)
    sns.boxplot(x=feature[column], ax=ax)
plt.tight_layout()

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

In [None]:
def remove(data, whisker=1.5):
    """Cap outliers at the Tukey fences instead of dropping them.

    Values above ``Q3 + whisker * IQR`` are replaced by that upper fence and
    values below ``Q1 - whisker * IQR`` by the lower fence; all other values
    are returned unchanged. The quartiles, the IQR and both fences are derived
    from the same pair of quantiles (pandas' default linear interpolation).

    Parameters
    ----------
    data : pandas.Series or 1-D array-like of numbers
        Values to cap. The input is not modified.
    whisker : float, default 1.5
        Multiple of the IQR that places the fences.

    Returns
    -------
    numpy.ndarray
        Float array with the same length and order as ``data``.
    """
    values = pd.Series(data, dtype=float)

    # Series.quantile skips NaN, so missing values cannot turn the fences into
    # NaN (which would silently disable the capping).
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1

    lower_limit = q1 - whisker * iqr
    upper_limit = q3 + whisker * iqr
    return np.clip(values.to_numpy(), lower_limit, upper_limit)

In [None]:
# List the feature column names (used to choose which columns to cap next).
feature.columns

In [None]:
# Columns whose extreme values are capped by `remove`.
li = [
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]
# Overwrite each listed column of `feature` with its capped version.
for column_name in li:
    feature[column_name] = remove(feature[column_name])

In [None]:
# `feature` is deliberately not rebuilt from `data` here, so the boxplots show
# the columns as they currently are in `feature`.
# The series is passed as `x=` explicitly: a positional argument is read as
# `x` by older seaborn releases but as `data` by newer ones, which changes the
# plot orientation.
plt.figure(figsize=(15, 15))
for position, column in enumerate(feature.columns, start=1):
    ax = plt.subplot(4, 3, position)
    sns.boxplot(x=feature[column], ax=ax)
plt.tight_layout()

In [None]:
# `feature` is deliberately not rebuilt from `data` here, so the panels show
# the columns as they currently are in `feature`.
# Density-scaled histogram with a KDE overlay; `histplot` is the supported
# replacement for the deprecated `distplot`.
plt.figure(figsize=(15, 15))
for position, column in enumerate(feature.columns, start=1):
    ax = plt.subplot(4, 3, position)
    sns.histplot(x=feature[column], kde=True, stat='density', ax=ax)
plt.tight_layout()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set with scikit-learn's default test fraction; the fixed
# seed makes the partition repeatable.
trainF, testF, trainL, testL = train_test_split(
    feature,
    data['DEATH_EVENT'],
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest with default hyperparameters. No random_state is
# passed, so its score can differ from run to run.
model = RandomForestClassifier()

In [None]:
# Train on the training split and report accuracy on the held-out split
# (`fit` returns the estimator itself, so the calls can be chained).
model.fit(trainF, trainL).score(testF, testL)

In [None]:
# Same model with a fixed seed so the score is repeatable; `fit` returns the
# estimator, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(trainF, trainL)
model.score(testF, testL)

In [None]:
# List the feature column names again before choosing which ones to transform.
feature.columns

### Transform Data using Quantile

In [None]:
from sklearn.preprocessing import quantile_transform

# Columns to map onto a normal distribution; every column not listed here is
# left untouched.
li = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'serum_creatinine', 'time']

# n_quantiles may not exceed the number of rows, so cap it explicitly instead
# of relying on the default of 1000.
n_quantiles = min(1000, len(feature))

# NOTE(review): the transform is fitted on all rows, i.e. before the
# train/test split further down — fit it on the training rows only if a model
# that is sensitive to feature scale is used.
for column in li:
    transformed = quantile_transform(
        feature[[column]],
        n_quantiles=n_quantiles,
        output_distribution='normal',
    )
    # The result has shape (n_rows, 1); flatten it before storing it as a column.
    feature[column] = np.ravel(transformed)

In [None]:
# One panel per column of `feature` on a 4x3 grid: density-scaled histogram
# with a KDE overlay. `histplot` is the supported replacement for the
# deprecated `distplot`.
plt.figure(figsize=(15, 15))
for position, column in enumerate(feature.columns, start=1):
    ax = plt.subplot(4, 3, position)
    sns.histplot(x=feature[column], kde=True, stat='density', ax=ax)
plt.tight_layout()

In [None]:
# Number of rows per value of the target column (class balance check).
data['DEATH_EVENT'].value_counts()

### Refit the model on the transformed features

In [None]:

# Re-split the current `feature` table, holding out 30% of the rows for
# testing; the fixed seed makes the partition repeatable.
trainF, testF, trainL, testL = train_test_split(
    feature,
    data['DEATH_EVENT'],
    test_size=.3,
    random_state=42,
)

In [None]:
# Number of rows per value of the target column. This is computed on the full
# `data` table, not on the train or test split.
data['DEATH_EVENT'].value_counts()

In [None]:
# Fresh random forest with a fixed seed for the new split.
model = RandomForestClassifier(random_state=42)

In [None]:
# Train on the training split.
model.fit(trainF,trainL)

In [None]:
# Score the fitted model on the held-out split.
model.score(testF,testL)

In [None]:
from sklearn.metrics import classification_report

# Predict the held-out rows and print the per-class report for them.
pred = model.predict(testF)
print(classification_report(y_true=testL, y_pred=pred))

### Parameter Tuning

In [None]:
def print_result(result):
    """Print the best configuration of a fitted search, then every candidate.

    ``result`` must expose ``best_params_``, ``best_score_`` and a
    ``cv_results_`` mapping with the keys ``mean_test_score``,
    ``std_test_score`` and ``params`` (as a fitted ``GridSearchCV`` does).

    One line is printed per parameter combination: the mean test score and a
    two-standard-deviation band, both expressed as percentages.
    """
    print('Best params : {} ---->{} \n'.format(result.best_params_, result.best_score_))

    cv_results = result.cv_results_
    rows = zip(
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
        cv_results['params'],
    )
    for mean, std, params in rows:
        # Scale the spread by 100 as well so it is in the same unit (percent)
        # as the mean next to it.
        print('{} (+/- {}) for {} '.format(round(mean * 100, 3), round(std * 2 * 100, 3), params))


from sklearn.model_selection import GridSearchCV

# Only settings that change the fitted forest belong in the grid. `n_jobs`
# merely controls parallelism and `random_state` is a seed rather than a
# tunable, so neither is searched; the seed stays whatever `model` was
# constructed with.
parameters = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [i**2 for i in range(3, 10)],  # 9, 16, 25, ..., 81
}

# 5-fold cross-validated search over the grid. `n_jobs=-1` runs the candidate
# fits in parallel without affecting the scores.
cv = GridSearchCV(model, parameters, cv=5, n_jobs=-1)
cv.fit(trainF, trainL)
print_result(cv)


In [None]:
# Take the estimator chosen by the grid search instead of re-typing its
# hyperparameters by hand. With GridSearchCV's default refit=True,
# `best_estimator_` has already been refitted on the whole training split.
finalmodel = cv.best_estimator_

# Evaluate the selected model on the held-out split.
pred = finalmodel.predict(testF)
print(classification_report(testL, pred))