# 1. Imports & reading dataset <a name="1"></a>

In [1]:
# !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import pandas_profiling as pp
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score, classification_report, plot_roc_curve, plot_confusion_matrix
from mlxtend.plotting import plot_decision_regions

ModuleNotFoundError: No module named 'pandas_profiling'

In [None]:
# Load the dataset from 'heart.csv' into the working DataFrame `df`.
df = pd.read_csv('heart.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Peek at the first five records.
df.head(n=5)

In [None]:
# Peek at the last five records.
df.tail(n=5)

# 2. Data Description & Preprocessing <a name="2"></a>

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Visualise where (if anywhere) the frame contains missing values.
plt.figure(figsize=(22, 10))

# Tick styling is set on the current axes before the heatmap is drawn.
plt.xticks(color='grey', size=20)
plt.tick_params(color='grey', size=12)

plt.title('Finding Null Values Using Heatmap\n', size=30, color='grey')

sns.heatmap(data=df.isnull(), cmap='PuBu_r', cbar=False, yticklabels=False)

In [None]:
from scipy import stats

# Keep only the rows in which every column lies within 3 standard deviations
# of that column's mean, then renumber the surviving rows from 0.
abs_z_scores = np.abs(stats.zscore(df))
within_three_sigma = (abs_z_scores < 3).all(axis=1)
df = df[within_three_sigma].reset_index(drop=True)
df

In [None]:
# Plot every record's BLOOD_PRESSURE value against its row index to eyeball the
# spread of the column. (The variable used to be called `age_column`, which did
# not match the column it holds.)
blood_pressure = df['BLOOD_PRESSURE']
plt.figure(figsize=(14,8))
plt.plot(blood_pressure, 'ro')
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('Row index')
plt.ylabel('BLOOD_PRESSURE')
plt.title('BLOOD_PRESSURE per record')
plt.show()

In [None]:
# pp.ProfileReport(df)

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()

plt.figure(figsize=(14, 8))
sns.heatmap(data=corr_matrix, annot=True, cmap='Blues', linewidths=0.01)
plt.show()

In [None]:
# Correlation of each column with TARGET, strongest positive first.
corr_matrix.loc[:, 'TARGET'].sort_values(ascending=False)

# 3. Split data into train, test and validation datasets <a name="3"></a>

In [None]:
# Columns 0-12 are used as features and column 13 as the label.
# NOTE(review): presumably column 13 is 'TARGET' — confirm against the CSV layout.
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# First hold out 20% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# ... then carve 25% of the remainder off for validation (0.25 x 0.8 = 0.2 of all rows).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

print('Training data shape:', X_train.shape)
print('Test data shape: ', X_test.shape)
print('Validation data shape: ', X_val.shape)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and the
# same fitted transformation is then applied to the held-out splits.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Scale the validation split as well, so it lives on the same scale as train/test
# whenever it is used; previously it was left untransformed.
X_val = sc.transform(X_val)

In [None]:
# Fit a PCA with all components kept on the scaled training features and accumulate
# the explained-variance ratios component by component.
pca = PCA()
pca_data = pca.fit_transform(X_train)
percentage_var_explained = pca.explained_variance_ratio_
cum_var_explained = np.cumsum(percentage_var_explained)

# Plot the PCA spectrum. The x values start at 1 so the axis really reads
# "n_components"; plotting the bare array would place the first component at x=0.
component_counts = np.arange(1, len(cum_var_explained) + 1)
plt.figure(1, figsize=(14, 8))
plt.clf()
plt.plot(component_counts, cum_var_explained, linewidth=2)
plt.axis('tight')
plt.grid()
plt.xlabel('n_components')
plt.ylabel('Cumulative_Variance_explained')
plt.show()

In [None]:
# Reduce the scaled features to 5 principal components; the projection is fitted on
# the training split and then applied unchanged to the test split.
pca_5 = PCA(n_components=5)
X_train_pca_5 = pca_5.fit_transform(X_train)
X_test_pca_5 = pca_5.transform(X_test)

# Scree plot: percentage of variance (one decimal) carried by each component.
per_var = np.round(pca_5.explained_variance_ratio_ * 100, decimals=1)
labels = [f'PC{i}' for i in range(1, len(per_var) + 1)]

plt.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=labels)
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.title('Scree Plot')
plt.show()

In [None]:
# Put the five principal-component scores of the training rows into a labelled frame.
principalDf = pd.DataFrame(data=X_train_pca_5, columns=['PC1','PC2','PC3','PC4','PC5'])

# X_train_pca_5 holds only the shuffled training rows, so the matching labels are in
# y_train. Its index is reset so that it lines up positionally with principalDf.
# Concatenating df[['TARGET']] instead would align on the full frame's index, pairing
# each PCA row with an unrelated label and appending NaN-padded rows.
train_target = y_train.reset_index(drop=True).rename('TARGET')
finalDf = pd.concat([principalDf, train_target], axis=1)
finalDf.head()

In [None]:
# One dot per principal component showing its share of the total variance.
explained_variance = pca_5.explained_variance_ratio_
component_numbers = range(1, len(per_var) + 1)
plt.scatter(x=component_numbers, y=explained_variance)
plt.show()

# 4. KNN classification <a name="4"></a>

In [None]:
# Baseline kNN: train on the scaled training features without PCA.
data, data_test = X_train, X_test
target, target_test = y_train, y_test

n_neighbors = 3
kNN_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
kNN_classifier.fit(data, target)

# Decision-region visualisation of kNN — disabled because it does not work.
# NOTE(review): the classifier above is fitted on `data` (X_train) while the call
# below passes X_train_pca_5; presumably this feature mismatch is why it fails — confirm.

# plt.figure(figsize=(15,8))
# ax = plot_decision_regions(X_train_pca_5, y_train, clf=kNN_classifier)
# plt.xlabel('1st PCA component')
# plt.ylabel('2nd PCA component')
# plt.title('Visualisation of the kNN algorithm')
# plt.grid()
# handles, labels = ax.get_legend_handles_labels()
# ax.legend(handles, target_names, framealpha=0.3, scatterpoints=1)

In [None]:
# Evaluate the kNN classifier on the current `data` / `target` (training split).
print('----- K-NEAREST NEIGHBORS PERFORMANCE ON TRAINING DATASET -----')
knn_predictions = kNN_classifier.predict(data)
knn_accuracy = accuracy_score(y_true=target, y_pred=knn_predictions)
print('Accuracy: ', knn_accuracy)

# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print("Classification report:")
print(classification_report(y_true=target, y_pred=knn_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(estimator=kNN_classifier, X=data, y_true=target)

print("ROC curve")
plot_roc_curve(estimator=kNN_classifier, X=data, y=target)

In [None]:
# Evaluate the kNN classifier on the current `data_test` / `target_test` (test split).
print('----- K-NEAREST NEIGHBORS PERFORMANCE ON TEST DATASET -----')
knn_predictions = kNN_classifier.predict(data_test)
knn_accuracy = accuracy_score(y_true=target_test, y_pred=knn_predictions)
print('Accuracy: ', knn_accuracy)

# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print("Classification report:")
print(classification_report(y_true=target_test, y_pred=knn_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(estimator=kNN_classifier, X=data_test, y_true=target_test)

print("ROC curve")
plot_roc_curve(estimator=kNN_classifier, X=data_test, y=target_test)

In [None]:
# 5-fold cross-validation of the kNN classifier on the training split.
scores = cross_val_score(estimator=kNN_classifier, X=data, y=target, cv=5)
print("Scores: ", sorted(scores))

mean_score, score_std = scores.mean(), scores.std()
print(f"{mean_score} accuracy with a standard deviation of {score_std}")

# 5. KNN classification with PCA <a name="5"></a>

In [None]:
# kNN on the 5-component PCA projection of the features.
data, data_test = X_train_pca_5, X_test_pca_5
target, target_test = y_train, y_test

n_neighbors = 3
kNN_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
kNN_classifier.fit(data, target)

In [None]:
# Evaluate the PCA-based kNN classifier on the current `data` / `target` (training split).
print('----- K-NEAREST NEIGHBORS PERFORMANCE WITH PCA ON TRAINING DATASET -----')
knn_predictions = kNN_classifier.predict(data)
knn_accuracy = accuracy_score(y_true=target, y_pred=knn_predictions)
print('Accuracy: ', knn_accuracy)

# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print("Classification report:")
print(classification_report(y_true=target, y_pred=knn_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(estimator=kNN_classifier, X=data, y_true=target)

print("ROC curve")
plot_roc_curve(estimator=kNN_classifier, X=data, y=target)

In [None]:
# Evaluate the PCA-based kNN classifier on the current `data_test` / `target_test` (test split).
print('----- K-NEAREST NEIGHBORS PERFORMANCE WITH PCA ON TEST DATASET -----')
knn_predictions = kNN_classifier.predict(data_test)
knn_accuracy = accuracy_score(y_true=target_test, y_pred=knn_predictions)
print('Accuracy: ', knn_accuracy)

# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print("Classification report:")
print(classification_report(y_true=target_test, y_pred=knn_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(estimator=kNN_classifier, X=data_test, y_true=target_test)

print("ROC curve")
plot_roc_curve(estimator=kNN_classifier, X=data_test, y=target_test)

In [None]:
# 5-fold cross-validation of the PCA-based kNN classifier.
# It runs on the training split (`data`, `target`), as the other model sections do.
# Cross-validating on the test split would spend the hold-out data on model
# assessment folds and leave each fold with very few samples.
scores = cross_val_score(kNN_classifier, data, target, cv=5)
print("Scores: ", sorted(scores))

print(f"{scores.mean()} accuracy with a standard deviation of {scores.std()}")

# 6. Logistic Regression classification <a name="6"></a>

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt
# import matplotlib
# import keras
# import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
# from io import StringIO
# import requests
# from random import randint
# print(tf.__version__)
# print('keras:',keras.__version__)

In [None]:
# Logistic regression on the scaled training features without PCA.
data, data_test = X_train, X_test
target, target_test = y_train, y_test

lr_classifier = LogisticRegression(random_state=0, solver='newton-cg')
# fit() returns the estimator itself, so re-binding keeps the cell free of output.
lr_classifier = lr_classifier.fit(data, target)

In [None]:
# Evaluate the logistic-regression model on the current `data` / `target` (training split).
print('----- Logistic Regression PERFORMANCE ON TRAINING DATASET -----')
lr_predictions = lr_classifier.predict(data)
lr_accuracy = accuracy_score(y_true=target, y_pred=lr_predictions)
print('Accuracy: ', lr_accuracy)

# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print("Classification report:")
print(classification_report(y_true=target, y_pred=lr_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(estimator=lr_classifier, X=data, y_true=target)

print("ROC curve")
plot_roc_curve(estimator=lr_classifier, X=data, y=target)

In [None]:
# Evaluate the logistic-regression model on the test split (`data_test`, `target_test`).
# The banner now says TEST; it previously said TRAINING, mislabelling these results.
print('----- Logistic Regression PERFORMANCE ON TEST DATASET -----')
lr_predictions = lr_classifier.predict(data_test)
lr_accuracy = accuracy_score(target_test, lr_predictions)
print('Accuracy: ', lr_accuracy)

print("Classification report:")
# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print(classification_report(target_test, lr_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(lr_classifier, data_test, target_test)

print("ROC curve")
plot_roc_curve(lr_classifier, data_test, target_test)

In [None]:
# 5-fold cross-validation of the logistic-regression model on the training split.
scores = cross_val_score(estimator=lr_classifier, X=data, y=target, cv=5)
print("Scores: ", sorted(scores))

mean_score, score_std = scores.mean(), scores.std()
print(f"{mean_score} accuracy with a standard deviation of {score_std}")

# 7. Logistic Regression classification with PCA <a name="7"></a>

In [None]:
# Switch the working inputs to the 5-component PCA projection; labels are unchanged.
data, data_test = X_train_pca_5, X_test_pca_5
target, target_test = y_train, y_test



In [None]:
# Logistic regression trained on whatever `data` / `target` currently hold.
lr_classifier = LogisticRegression(random_state=0, solver='newton-cg')
# fit() returns the estimator itself, so re-binding keeps the cell free of output.
lr_classifier = lr_classifier.fit(data, target)

In [None]:
# Evaluate the PCA-based logistic-regression model on the training split.
# The banner now includes "WITH PCA" so this output can be told apart from the
# non-PCA logistic-regression section, matching the kNN sections' banners.
print('----- Logistic Regression PERFORMANCE WITH PCA ON TRAINING DATASET -----')
lr_predictions = lr_classifier.predict(data)
lr_accuracy = accuracy_score(target, lr_predictions)
print('Accuracy: ', lr_accuracy)

print("Classification report:")
# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print(classification_report(target, lr_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(lr_classifier, data, target)

print("ROC curve")
plot_roc_curve(lr_classifier, data, target)

In [None]:
# Evaluate the PCA-based logistic-regression model on the test split
# (`data_test`, `target_test`). The banner now says TEST and WITH PCA; it previously
# said TRAINING and was identical to the non-PCA section's banner.
print('----- Logistic Regression PERFORMANCE WITH PCA ON TEST DATASET -----')
lr_predictions = lr_classifier.predict(data_test)
lr_accuracy = accuracy_score(target_test, lr_predictions)
print('Accuracy: ', lr_accuracy)

print("Classification report:")
# Display names used for the two classes in the text report.
target_names = ['class 1', 'class 2']
print(classification_report(target_test, lr_predictions, target_names=target_names))

print("Confusion matrix:")
plot_confusion_matrix(lr_classifier, data_test, target_test)

print("ROC curve")
plot_roc_curve(lr_classifier, data_test, target_test)

In [None]:
# 5-fold cross-validation of the PCA-based logistic-regression model on the training split.
scores = cross_val_score(estimator=lr_classifier, X=data, y=target, cv=5)
print("Scores: ", sorted(scores))

mean_score, score_std = scores.mean(), scores.std()
print(f"{mean_score} accuracy with a standard deviation of {score_std}")