# READ DATASET & PRINT DATA TABLE

In [None]:
import pandas as pd
import sklearn.datasets
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Name of the column that the later cells treat as the prediction target.
label = 'HeartDisease'

# Load the comma-separated file into a DataFrame.
dataFrame = pd.read_csv(
    'heart_2020_cleaned.csv',
    sep=',',
)

# Column index of the raw table (the target column is included).
feature_names = dataFrame.columns

# Last expression of the cell: the notebook renders the final rows.
dataFrame.tail()

# DOWNSAMPLING 
To obtain a balanced dataset, the majority class has to be downsampled (or the minority class upsampled).

## Pie chart - Show the im/balance

In [None]:
# Number of rows per class, printed as text.
class_counts = dataFrame[label].value_counts()
print(class_counts)

# The same distribution as a pie chart with percentage labels on the wedges.
rows_per_class = dataFrame.groupby(label).size()
rows_per_class.plot(
    kind='pie',
    y=label,
    label="HeartDisease",
    autopct='%1.1f%%',
)

## Actual Downsampling

In [None]:
from sklearn.utils import resample

# Split the table by class so that one class can be reduced on its own.
noHeartDiseases = dataFrame[dataFrame[label] == "No"]
withHeartDiseases = dataFrame[dataFrame[label] == "Yes"]

# Draw as many "No" rows as there are "Yes" rows.
# Sampling WITHOUT replacement keeps every drawn row unique; with
# replace=True the same row could be drawn several times and later end up
# in both the training and the test split.
# NOTE(review): replace=False requires len(noHeartDiseases) >=
# len(withHeartDiseases), i.e. "No" must be the larger class - confirm.
noHeartDiseaseDownsample = resample(
    noHeartDiseases,
    replace=False,
    n_samples=len(withHeartDiseases),
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

# Balanced table: the reduced "No" rows followed by all "Yes" rows.
dataFrame = pd.concat([noHeartDiseaseDownsample, withHeartDiseases])

## Pie chart - Check the balance again

In [None]:
# Pie chart of the class distribution of the current table.
rows_per_class = dataFrame.groupby(label).size()
rows_per_class.plot(
    kind='pie',
    y=label,
    label="HeartDisease",
    autopct='%1.1f%%',
)

# Last expression of the cell: per-class row counts shown as cell output.
dataFrame[label].value_counts()

# DATA PREPARATION
## Transform Strings into Numbers

In [None]:
# Preview the first rows of the table (rendered as the cell output).
dataFrame.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Every column stored with dtype "object" gets replaced by integer codes.
obj_list = dataFrame.select_dtypes(include='object').columns

# A single encoder instance is re-fitted for each column, so after the loop
# it only holds the classes of the last column processed.
le = LabelEncoder()
for column in obj_list:
    as_text = dataFrame[column].astype(str)
    dataFrame[column] = le.fit_transform(as_text)

# Last expression of the cell: show the first rows after encoding.
dataFrame.head()

## Check the Data Quality again
### Empty Data Check

In [None]:
# Count missing entries per column, treating +/-infinity as missing as well.
# The global switch ``pd.options.mode.use_inf_as_na`` is deprecated since
# pandas 2.1 and scheduled for removal, so infinities are mapped to NaN
# explicitly instead.  ``replace`` returns a new frame; ``dataFrame`` itself
# is not modified and no global pandas option is changed.
dataFrame.replace([np.inf, -np.inf], np.nan).isnull().sum()

## Zero Values

In [None]:
# Report, column by column, how many entries are exactly zero.
for column_name in dataFrame.columns:
    is_zero = dataFrame[column_name] == 0
    zero_total = int(is_zero.sum())
    print('Number of 0-entries for "{field_name}" feature: {amount}'.format(
        field_name=column_name,
        amount=zero_total,
    ))

## Correlation Matrix

In [None]:
# All columns of the table (target included) enter the correlation matrix.
featureNames = dataFrame.columns
xd = dataFrame[featureNames]
yd = dataFrame[label]

# Pairwise correlations drawn as a heatmap, each cell annotated with two
# decimals on the red-yellow-green colour map.
correlations = xd.corr()
sns.heatmap(data=correlations, annot=True, fmt='.2f', cmap='RdYlGn')

# Resize the figure the heatmap was drawn on to 15 x 15 inches.
fig = plt.gcf()
fig.set_size_inches(15, 15)

plt.show()

# ALGORITHMS

## Splitting into Test & Training Data

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target, as a NumPy array.
X = dataFrame.drop(columns=label).values
# Target vector: the values of the target column.
y = dataFrame[label].values

# train_test_split is called without random_state, so it draws from NumPy's
# global generator; seeding that generator here fixes the 80/20 split.
np.random.seed(41)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [None]:
# Shapes of the four arrays, one per line: train X, train y, test X, test y.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

## KNN ~71%

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# Accuracy per neighbour count, collected for the plot at the end.
train_scores = []
test_scores = []
# Neighbour counts to try: 1 through 9.
iterations = list(range(1, 10))

for k in iterations:
    # Fit a fresh classifier for this neighbour count.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Accuracy on the data the model was fitted on.
    trainPrediction = knn.predict(X_train)
    trainAccuracy = accuracy_score(y_train, trainPrediction)
    train_scores.append(trainAccuracy)

    # Accuracy on the held-out data.
    testPrediction = knn.predict(X_test)
    testAccuracy = accuracy_score(y_test, testPrediction)
    test_scores.append(testAccuracy)

    print('>%d, train: %.3f, test: %.3f' % (k, trainAccuracy, testAccuracy))

# Train and test accuracy plotted against the neighbour count.
for scores, legend_name in ((train_scores, 'Train'), (test_scores, 'Test')):
    pyplot.plot(iterations, scores, '-o', label=legend_name)
pyplot.legend()
pyplot.show()

## Check if Overfitted
Compare the accuracy score on the training data with the accuracy score on the test data.
A wide gap between the two? -> Overfitted!

In [None]:
from sklearn.metrics import accuracy_score

# `knn` is whichever classifier was fitted last; after the KNN sweep cell that
# is the model from its final loop iteration.

# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# ground truth first, as the other cells of this notebook do.
trainPrediction = knn.predict(X_train)
trainAccuracy = accuracy_score(y_train, trainPrediction)

testPrediction = knn.predict(X_test)
testAccuracy = accuracy_score(y_test, testPrediction)

# A training accuracy far above the test accuracy hints at overfitting.
print(trainAccuracy)
print(testAccuracy)

## Logistic Regression ~75% (but with an error message)

In [None]:
from sklearn.linear_model import LogisticRegression

np.random.seed(41)

# The default limit of 100 solver iterations is often too low for unscaled
# features and then triggers a ConvergenceWarning; a higher limit gives the
# solver room to converge.
# NOTE(review): presumably this warning is the message the section heading
# refers to - confirm by re-running the cell.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Last expression of the cell: mean accuracy on the held-out test data.
lr.score(X_test, y_test)

## Random Forest Classifier ~75%

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The forest is created without random_state, so it draws from NumPy's
# global generator; seed that generator first.
np.random.seed(41)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Last expression of the cell: mean accuracy on the held-out test data.
rf.score(X_test, y_test)

## Decision Tree Classifier ~70%

In [None]:
# Imported here as well so the cell does not depend on the KNN cell having
# been executed first.
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# The trees are created without random_state and therefore draw from NumPy's
# global generator.  Seed it, as the other model cells do, so the sweep gives
# the same numbers on every run.
np.random.seed(41)

train_scores, test_scores = [], []
# Tree depths to evaluate: 1 through 20.
iterations = list(range(1, 21))

# Evaluate one decision tree per depth.
for depth in iterations:
    # Configure and fit the model on the training data.
    model = DecisionTreeClassifier(max_depth=depth)
    model.fit(X_train, y_train)

    # Accuracy on the training data.
    trainPrediction = model.predict(X_train)
    trainAccuracy = accuracy_score(y_train, trainPrediction)
    train_scores.append(trainAccuracy)

    # Accuracy on the test data.
    testPrediction = model.predict(X_test)
    testAccuracy = accuracy_score(y_test, testPrediction)
    test_scores.append(testAccuracy)

    # Summarize progress.
    print('>%d, train: %.3f, test: %.3f' % (depth, trainAccuracy, testAccuracy))

# Train and test accuracy plotted against the tree depth.
pyplot.plot(iterations, train_scores, '-o', label='Train')
pyplot.plot(iterations, test_scores, '-o', label='Test')
pyplot.legend()
pyplot.show()

# `model` is the tree from the last iteration (max_depth=20), so this is the
# test accuracy of the deepest tree, not of the best-performing depth.
print("Score:")
print(model.score(X_test, y_test))