In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Visualisation

Check whether the variables are approximately Gaussian.

In [None]:
# Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Getting the dataset: read the heart-failure clinical records CSV into `df`
# and print a summary of the resulting frame.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.info()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

In [None]:
# Target: the DEATH_EVENT column. Features: every column except the last one.
y = df.loc[:, 'DEATH_EVENT']
X = df.iloc[:, 0:-1]

In [None]:
# Pairwise correlation matrix of the columns of `df`.
df.corr()

In [None]:
import seaborn as sns
# Annotated correlation heatmap, drawn on an explicitly created 12x12-inch axes.
fig, ax = plt.subplots(figsize=(12, 12))
corr_matrix = df.corr()
sns.heatmap(
    data=corr_matrix,
    annot=True,
    cmap='coolwarm',
    square=True,
    cbar_kws={'aspect': 50},
    ax=ax,
)
plt.tight_layout()

In [None]:
# Class-conditional distribution of every feature, one subplot per feature,
# with the DEATH_EVENT == 0 and DEATH_EVENT == 1 groups overlaid.
#
# The grid is sized from the number of features. A fixed 3x3 grid zipped against
# all columns silently dropped every column after the ninth, so some features
# were never inspected. The target column itself is excluded: it is constant
# within each group, so it has no distribution to plot.
TARGET = 'DEATH_EVENT'
data = df  # reuse the frame loaded earlier instead of re-reading the CSV
data_grouped = data.groupby(by=TARGET)
titles = [col for col in data.columns if col != TARGET]

n_cols = 3
n_rows = -(-len(titles) // n_cols)  # ceiling division without importing math
# Keep the per-row height of the original 3-row, 15x8-inch figure.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 8 * n_rows / 3), squeeze=False)

survived = data_grouped.get_group(0)
died = data_grouped.get_group(1)
for ax, title in zip(axs.ravel(), titles):
    # NOTE(review): sns.distplot is deprecated since seaborn 0.11; it is kept here
    # so the cell still runs on the seaborn version this notebook was written for.
    sns.distplot(survived[title], bins=10, ax=ax, label='No')
    sns.distplot(died[title], bins=10, ax=ax, label='Yes')
    ax.legend(title='DEATH_EVENT')

# Hide the unused axes when the feature count is not a multiple of n_cols.
for ax in axs.ravel()[len(titles):]:
    ax.set_visible(False)

fig.tight_layout()

**The numerical features look Gaussian enough**

# Pre-processing

In [None]:
# Feature matrix (all columns but the last) and target vector (last column) as NumPy arrays
features_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
X = features_frame.values
y = target_series.values

# Hold out 20% of the rows as the test set; the shuffle is seeded so the split is repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0)

In [None]:
# Inspect the first feature column of the training split.
X_train[:,0]

# Standardisation
For Numerical Columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Positional indices of the numerical feature columns.
numericals = [0, 2, 4, 6, 7, 8, 9, 11]

# Standardise each numerical column in place. The scaler is fitted on the training
# split only and then applied to both splits, so no test statistics leak in.
for col in numericals:
    train_col = X_train[:, col].reshape(-1, 1)
    test_col = X_test[:, col].reshape(-1, 1)
    scaler = StandardScaler()
    scaler.fit(train_col)
    X_train[:, col] = scaler.transform(train_col).ravel()
    X_test[:, col] = scaler.transform(test_col).ravel()

In [None]:
# Inspect the first feature column of the training split again, now that the
# standardisation loop has rewritten the numerical columns in place.
X_train[:,0]

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression fitted on the training split.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict the test split and print predictions (left column) beside the true labels (right column).
y_pred = classifier.predict(X_test)
pred_vs_true = np.column_stack((y_pred, y_test))
print(pred_vs_true)


In [None]:
import sklearn.metrics as mt
# Test accuracy of the logistic-regression predictions. Arguments are given in the
# documented (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print(mt.accuracy_score(y_test, y_pred))
# `plot_confusion_matrix` was removed in scikit-learn 1.2. `ConfusionMatrixDisplay`
# exists in both older and newer releases and draws the same figure.
mt.ConfusionMatrixDisplay(
    confusion_matrix=mt.confusion_matrix(
        y_test, classifier.predict(X_test), labels=classifier.classes_),
    display_labels=classifier.classes_,
).plot()
plt.tight_layout()

In [None]:
# Shape of the training matrix before the kernel map is applied (12 feature columns expected).
np.shape(X_train)

In [None]:
from sklearn.kernel_approximation import RBFSampler
# Approximate RBF-kernel feature map (gamma=1). It is fitted on the training split
# only; both splits are then projected with the same fitted sampler.
rbf_feature = RBFSampler(random_state=1, gamma=1)
rbf_feature.fit(X_train)
X_train_kernel = rbf_feature.transform(X_train)
X_test_kernel = rbf_feature.transform(X_test)

In [None]:
# Shape of the training matrix after the kernel map (100 columns expected).
np.shape(X_train_kernel)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression re-fitted on the kernel-mapped training features
# (this rebinds `classifier`, replacing the earlier model).
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train_kernel, y=y_train)

In [None]:
# Predict the kernel-mapped test split and print predictions beside the true labels.
# The printed column must be `y_pred_k`: printing `y_pred` would show the predictions
# of the earlier, non-kernel model instead.
y_pred_k = classifier.predict(X_test_kernel)
print(np.concatenate((y_pred_k.reshape(len(y_pred_k),1), y_test.reshape(len(y_test),1)),1))

In [None]:
import sklearn.metrics as mt
# Test accuracy of the kernel-feature model. Arguments are given in the documented
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print(mt.accuracy_score(y_test, y_pred_k))
# `plot_confusion_matrix` was removed in scikit-learn 1.2. `ConfusionMatrixDisplay`
# exists in both older and newer releases and draws the same figure.
mt.ConfusionMatrixDisplay(
    confusion_matrix=mt.confusion_matrix(
        y_test, classifier.predict(X_test_kernel), labels=classifier.classes_),
    display_labels=classifier.classes_,
).plot()
plt.tight_layout()

# ://

# Naive Bayes

Since a single naive Bayes estimator cannot model different distributions for different features, we discretise the continuous features so that every column can be treated as categorical.

In [None]:
from sklearn.naive_bayes import CategoricalNB, GaussianNB 
from sklearn.preprocessing import KBinsDiscretizer, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Column positions in the feature matrix: continuous features versus the remaining ones.
numericals = [0, 2, 4, 6, 7, 8, 9, 11]
categoricals = sorted(set(range(12)) - set(numericals))

# Continuous columns: mean-impute, standardise, then cut into 5 ordinally encoded bins.
# A pipeline applies its steps in the listed order.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('kbn', KBinsDiscretizer(n_bins=5, encode='ordinal')),
]
numeric_trans_nb = Pipeline(steps=numeric_steps)

# Remaining columns: map each distinct value to an integer code.
categorical_steps = [('ordinal', OrdinalEncoder(dtype=np.int64))]
categorical_trans_nb = Pipeline(steps=categorical_steps)

# Route each transformer to its columns; any column not listed is passed through.
preprocessor_nb = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numeric_trans_nb, numericals),
        ('cat', categorical_trans_nb, categoricals),
    ],
)

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_nb = preprocessor_nb.fit_transform(X_train)
X_test_nb = preprocessor_nb.transform(X_test)

In [None]:
# Categorical naive Bayes classifier, constructed with its default settings.
nb_clf = CategoricalNB()


In [None]:
# Fit on the discretised training features, then predict the discretised test split.
# (`fit` returns the estimator itself, so the two calls can be chained.)
y_pred_nb = nb_clf.fit(X_train_nb, y_train).predict(X_test_nb)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of the naive Bayes predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_nb)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred_nb)

# Support Vector Machines

> Linear Kernel

In [None]:
from sklearn.svm import SVC
# Linear-kernel SVM: fit on the training split, then predict the test split.
svc = SVC(random_state=572, kernel='linear').fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of the linear SVM predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_svc)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred_svc)

> Changing the kernel from linear to rbf 

In [None]:
# Same SVM with an RBF kernel; print its test accuracy.
svc = SVC(random_state=572, kernel='rbf')
svc.fit(X_train, y_train)
y_pred_svc_kernel = svc.predict(X_test)
rbf_accuracy = accuracy_score(y_test, y_pred_svc_kernel)
print(rbf_accuracy)

> Changing the value of C :

In [None]:
# Refit the RBF SVM for C = 1..19 and report the test accuracy for each value.
for C_value in range(1, 20):
    svc = SVC(kernel='rbf', C=C_value, random_state=572)
    svc.fit(X_train, y_train)
    y_pred_svc = svc.predict(X_test)
    score = accuracy_score(y_test, y_pred_svc)
    print(f'accuracy score : {score}, for C : {C_value}')

**How the test accuracy changes with respect to C**

In [None]:
# Sweep C over 1..99 for the RBF SVM and plot test accuracy against C.
# NOTE(review): the accuracy is measured on the test split, so this curve
# describes sensitivity to C; it is not an unbiased way to select C.
C_val = list(range(1, 100))
performance = []
for i in C_val:
    svc = SVC(C=i, kernel='rbf', random_state=572)
    svc.fit(X_train, y_train)
    y_pred_svc = svc.predict(X_test)
    performance.append(accuracy_score(y_test, y_pred_svc))
plt.plot(C_val, performance)
plt.xlabel('C_values')
plt.ylabel('Performance')  # axis label was misspelled 'Perfromance'
plt.tight_layout()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-criterion decision tree fitted on the training split
# (this rebinds `classifier`).
classifier = DecisionTreeClassifier(random_state=0, criterion='entropy')
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of the decision tree on the test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Score the tree's own predictions. `y_pred_svc` holds stale SVM predictions from
# an earlier cell, so using it here would report the SVM's accuracy.
accuracy_score(y_test, y_pred)

> How increasing depth of the tree changes the performance over the training set vs. the test set.

In [None]:
# Sweep max_depth over 4..11 and compare train and test accuracy, to see where
# the tree starts to overfit.
depth = list(range(4, 12))
train_performance = []
test_performance = []
for i in depth:
    DT = DecisionTreeClassifier(max_depth=i, criterion='entropy', random_state=572)
    DT.fit(X_train, y_train)
    # accuracy on the data the tree was fitted on
    train_pred = DT.predict(X_train)
    train_performance.append(accuracy_score(y_train, train_pred))
    # accuracy on the held-out split
    test_pred = DT.predict(X_test)
    test_performance.append(accuracy_score(y_test, test_pred))

# Label the curves so the figure can be read without the code next to it.
plt.plot(depth, train_performance, color='blue', label='train')
plt.plot(depth, test_performance, color='red', label='test')
plt.xlabel('depth')
plt.ylabel('Performance')  # axis label was misspelled 'Perfromance'
plt.legend()
plt.tight_layout()

*A maximum depth of 5 seems to be the optimal value.*

# Random Forest
Using a number of decision trees and averaging their predictions to reduce variance.


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 entropy-criterion trees fitted on the training split
# (this rebinds `classifier`).
classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    random_state=572,
)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix and accuracy of the random forest on the test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Score the forest's own predictions. `y_pred_svc` holds stale SVM predictions from
# an earlier cell, so using it here would report the SVM's accuracy.
accuracy_score(y_test, y_pred)

> Performance variation due to changing the number of trees used :

In [None]:
# Sweep the number of trees (n_estimators) over 4..99 and compare train and test accuracy.
# The swept quantity is the forest size, not a tree depth, so the variable and the
# x-axis label are named accordingly.
n_trees = list(range(4, 100))
train_performance = []
test_performance = []
for n in n_trees:
    RF = RandomForestClassifier(n_estimators=n, criterion='entropy', random_state=572)
    RF.fit(X_train, y_train)
    # accuracy on the data the forest was fitted on
    train_pred = RF.predict(X_train)
    train_performance.append(accuracy_score(y_train, train_pred))
    # accuracy on the held-out split
    test_pred = RF.predict(X_test)
    test_performance.append(accuracy_score(y_test, test_pred))

# Label the curves so the figure can be read without the code next to it.
plt.plot(n_trees, train_performance, color='blue', label='train')
plt.plot(n_trees, test_performance, color='red', label='test')
plt.xlabel('n_estimators')
plt.ylabel('Performance')  # axis label was misspelled 'Perfromance'
plt.legend()
plt.tight_layout()