# <mark>READ DATASET & PRINT DATA TABLE</mark>

In [None]:
import pandas as pd
import sklearn.datasets
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Name of the target column; later cells address it through `label`.
label = 'HeartDisease'

# Load the comma-separated dataset file.
dataFrame = pd.read_csv(
    'heart_2020_cleaned.csv',
    sep=',',
)

# NOTE: this Index holds every column of the loaded table, target included.
featureNames = dataFrame.columns

# Show the last rows as the cell output (dataFrame.head() shows the first).
dataFrame.tail()

# DOWNSAMPLING 
For a balanced dataset, the data has to be down- or upsampled.

## Pie chart - Show the im/balance

In [None]:
# Absolute number of rows per class ...
print(dataFrame[label].value_counts())

# ... and the same distribution as percentages in a pie chart.
classSizes = dataFrame.groupby(label).size()
classSizes.plot(kind='pie', y=label, label="HeartDisease", autopct='%1.1f%%')

## <mark>Actual Downsampling</mark>

In [None]:
from sklearn.utils import resample

# Split the rows by class so the "No" group can be shrunk to the size of the
# "Yes" group.
noHeartDiseases = dataFrame[dataFrame[label] == "No"]
withHeartDiseases = dataFrame[dataFrame[label] == "Yes"]

# Draw WITHOUT replacement: downsampling should pick distinct rows. Drawing
# with replacement (the previous behaviour) puts duplicate rows into the
# balanced set, and identical rows could then end up on both sides of a later
# train/test split.
# NOTE(review): this presumes "No" has at least as many rows as "Yes";
# resample is expected to raise otherwise -- confirm against the data.
noHeartDiseaseDownsample = resample(
    noHeartDiseases,
    replace=False,
    n_samples=len(withHeartDiseases),
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

# The balanced dataset replaces the original frame for all following cells.
dataFrame = pd.concat([noHeartDiseaseDownsample, withHeartDiseases])

## Pie chart - Check the balance again

In [None]:
# Pie chart of the class distribution after resampling.
classSizes = dataFrame.groupby(label).size()
classSizes.plot(kind='pie', y=label, label="HeartDisease", autopct='%1.1f%%')

# Absolute counts per class as the cell output.
dataFrame[label].value_counts()

# DATA PREPARATION

## <mark>Transform Strings into Numbers</mark>

In [None]:
from sklearn.preprocessing import LabelEncoder

# Every column stored with dtype 'object' gets replaced by integer codes.
obj_list = dataFrame.select_dtypes(include='object').columns

# One encoder instance is re-fitted for each column, so after the loop `le`
# only holds the mapping of the last column processed.
le = LabelEncoder()
for obj in obj_list:
    columnAsText = dataFrame[obj].astype(str)
    dataFrame[obj] = le.fit_transform(columnAsText)

dataFrame.tail()

## <mark>Normalization</mark> 
Scaling data in a range from 0.0 to 1.0

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

# Rescale all columns with a MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split further down in this file, so the held-out rows contribute to the
# scaling statistics.
scaler = preprocessing.MinMaxScaler()
dataFrameValues = dataFrame.values
valueScaled = scaler.fit_transform(dataFrameValues)

# Rebuild a DataFrame from the scaled array, reusing the column labels; no
# index is passed, so the new frame gets a default one.
dataFrame = pd.DataFrame(data=valueScaled, columns=dataFrame.columns)
dataFrame.tail()

The fit(data) method computes the statistics that are needed for scaling. For the MinMaxScaler used here these are the minimum and maximum of each feature (for a StandardScaler they would be the mean and standard deviation).

The transform(data) method performs the scaling using the statistics computed by the .fit() method.

The fit_transform() method does both the fit and the transform in one step.

## Reducing Features

**be aware which features you wanna drop**

In [None]:
#dataFrame = dataFrame.drop(['AlcoholDrinking', 'PhysicalActivity', 'GenHealth', 'SleepTime', 'MentalHealth',  'Race'], axis=1)
#dataFrame.tail()

## Check the Data Quality again
### Empty Data Check

In [None]:
# Count missing values per column, treating +/- infinity as missing too.
# The infinities are mapped to NaN on a temporary copy instead of switching on
# the global `pd.options.mode.use_inf_as_na` flag: that option is deprecated
# in pandas 2.1 and changes pandas' behaviour for the whole session.
dataFrame.replace([np.inf, -np.inf], np.nan).isnull().sum()

## Zero Values

In [None]:
# Per column: number of cells that are exactly 0.
zeroCounts = dataFrame.eq(0).sum()
print(zeroCounts)

## Correlation Matrix

In [None]:
# Take the leading `featureLength` columns by position, where `featureLength`
# is the number of column labels stored in `featureNames`.
featureLength = len(featureNames)
df_features_mean = dataFrame.iloc[:, :featureLength]
df_features_mean.tail()

=> Taken from the slide "Developing for AI (AIML) p.39". => It does not change the output, so what is it for?

In [None]:
# Column selections by label; neither is used again within this cell.
xd = dataFrame[featureNames]
yd = dataFrame[label]

# Pairwise correlations of the selected columns, drawn as an annotated heatmap.
correlations = df_features_mean.corr()
sns.heatmap(data=correlations, annot=True, fmt='.2f', cmap='coolwarm')

# Enlarge the current figure so the per-cell annotations get more room.
fig = plt.gcf()
fig.set_size_inches(15, 15)

plt.show()

# ALGORITHMS

## <mark>Splitting Into Test & Training Data</mark>

In [None]:
from sklearn.model_selection import train_test_split

# Separate the inputs from the target and hand both over as plain arrays:
# X holds every column except the target, y holds only the target column.
X = dataFrame.drop(label, axis=1).values #Feature Values
y = dataFrame[label].values #heartDiseaseValues

# Seed NumPy's global random generator right before splitting.
# train_test_split is called without a random_state, so this seed is
# presumably what keeps the split repeatable between runs -- TODO confirm.
np.random.seed(41)
# test_size=0.2 holds out 20 % of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Last expression of the cell: shows the training inputs as cell output.
X_train

In [None]:
# Print the shape of each split, in the order: train inputs, train targets,
# test inputs, test targets.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

## Algorithms with Overfitting Check

## KNN Overfitting Check failed

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# Sweep k from 1 to 14 and record the accuracy on the training and the test
# split for each setting.
neighbours = list(range(1, 15))
trainScores = []
testScores = []

for i in neighbours:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)

    # Predictions on both splits with the freshly fitted model.
    trainPrediction = knn.predict(X_train)
    testPrediction = knn.predict(X_test)

    trainAccuracy = accuracy_score(y_train, trainPrediction)
    testAccuracy = accuracy_score(y_test, testPrediction)
    trainScores.append(trainAccuracy)
    testScores.append(testAccuracy)

    print('>%d, train: %.3f, test: %.3f' % (i, trainAccuracy, testAccuracy))

# One curve per split over the tested values of k.
for curve, curveName in ((trainScores, 'Train'), (testScores, 'Test')):
    pyplot.plot(neighbours, curve, '-o', label=curveName)
pyplot.legend()
pyplot.show()

=> An Overfitting Analysis like this with KNN is inappropriate, according to 

https://machinelearningmastery.com/overfitting-machine-learning-models#attachment_11578

## KNN with Cross-Validation ~73%

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a default KNN classifier on the training split.
knn = KNeighborsClassifier()
scores = cross_val_score(knn, X_train, y_train, cv=5)  # cv = number of folds

# Report mean and spread (two standard deviations) on the same scale: both are
# converted to percent. Previously only the mean was multiplied by 100, so the
# "+/-" value was printed as a fraction next to a percentage.
meanAccuracy = scores.mean() * 100
accuracySpread = scores.std() * 2 * 100
print("Accuracy: %0.2f (+/- %0.2f)" % (meanAccuracy, accuracySpread))

### Confusion Matrix
More detailed Evaluation of the Accuracy

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Fit a default KNN classifier and evaluate it on the test split.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
testPrediction = knn.predict(X_test)

conf_matrix = metrics.confusion_matrix(y_test, testPrediction)

# Unpack the matrix row by row: first row -> (tn, fp), second row -> (fn, tp).
(tn, fp), (fn, tp) = conf_matrix
print(f"True negatives: {tn}, False positives: {fp}, False negatives: {fn}, True positives: {tp}")

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, testPrediction)
print(report)

Explanation of the report, copy pasted from the slides:

Precision: proportion of true positives among instances classified as positive, e.g.
the proportion of Iris virginica correctly identified as Iris virginica.

Recall: proportion of true positives among all positive instances in the data, e.g. the
number of patients correctly diagnosed as sick among all patients who are actually sick.

F-1: weighted harmonic mean of precision and recall.


## Logistic Regression ~75%

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# Sweep the solver's iteration cap from 1 to 20 and record train/test accuracy
# in percent. NOTE: the list keeps the name `treeDepths` although the values
# are passed as `max_iter`.
treeDepths = list(range(1, 21))
trainScores = []
testScores = []

for i in treeDepths:
    model = LogisticRegression(max_iter=i)
    model.fit(X_train, y_train)

    trainPrediction = model.predict(X_train)
    testPrediction = model.predict(X_test)

    trainAccuracy = accuracy_score(y_train, trainPrediction) * 100
    testAccuracy = accuracy_score(y_test, testPrediction) * 100
    trainScores.append(trainAccuracy)
    testScores.append(testAccuracy)

    # Only every tenth setting is logged to keep the output short.
    if i % 10 == 0:
        print('>%d, train: %.3f, test: %.3f' % (i, trainAccuracy, testAccuracy))

for curve, curveName in ((trainScores, 'Train'), (testScores, 'Test')):
    pyplot.plot(treeDepths, curve, '-o', label=curveName)
pyplot.legend()
pyplot.show()

print("The point of NO overfitting is not existent? Stop iteration anyway at 20")
# `model` is the classifier from the last loop pass (max_iter=20).
print("Score:")
print(model.score(X_test, y_test)*100)

## Random Forest Classifier ~76%

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# Sweep the maximum tree depth from 1 to 7 and record train/test accuracy in
# percent for each forest.
treeDepths = list(range(1, 8))
trainScores = []
testScores = []

for i in treeDepths:
    model = RandomForestClassifier(max_depth=i)
    model.fit(X_train, y_train)

    trainPrediction = model.predict(X_train)
    testPrediction = model.predict(X_test)

    trainAccuracy = accuracy_score(y_train, trainPrediction) * 100
    testAccuracy = accuracy_score(y_test, testPrediction) * 100
    trainScores.append(trainAccuracy)
    testScores.append(testAccuracy)

    print('>%d, train: %.3f, test: %.3f' % (i, trainAccuracy, testAccuracy))

for curve, curveName in ((trainScores, 'Train'), (testScores, 'Test')):
    pyplot.plot(treeDepths, curve, '-o', label=curveName)
pyplot.legend()
pyplot.show()

print("The point of NO overfitting is 7")
# `model` is the forest from the last loop pass, i.e. the deepest setting.
print("Score:")
print(model.score(X_test, y_test)*100)

## Decision Tree Classifier ~75%

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# Sweep the maximum depth of a single decision tree from 1 to 8 and record
# train/test accuracy in percent.
treeDepths = list(range(1, 9))
trainScores = []
testScores = []

for i in treeDepths:
    model = DecisionTreeClassifier(max_depth=i)
    model.fit(X_train, y_train)

    trainPrediction = model.predict(X_train)
    testPrediction = model.predict(X_test)

    trainAccuracy = accuracy_score(y_train, trainPrediction) * 100
    testAccuracy = accuracy_score(y_test, testPrediction) * 100
    trainScores.append(trainAccuracy)
    testScores.append(testAccuracy)

    print('>%d, train: %.3f, test: %.3f' % (i, trainAccuracy, testAccuracy))

for curve, curveName in ((trainScores, 'Train'), (testScores, 'Test')):
    pyplot.plot(treeDepths, curve, '-o', label=curveName)
pyplot.legend()
pyplot.show()

print("The point of NO overfitting is 8")
# `model` is the tree from the last loop pass (max_depth=8).
print("Score:")
print(model.score(X_test, y_test) * 100)

## Support Vector Machines ~76%

In [None]:
from sklearn import svm

# Fit a support vector classifier on the training split and report its test
# accuracy in percent.
svc = svm.SVC(gamma="scale", random_state=10)
svc.fit(X_train, y_train)

svcTestAccuracy = svc.score(X_test, y_test) * 100
print("Score:")
print(svcTestAccuracy)

## NN aka Multi-Layer Perceptrons ~76%

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# Sweep the iteration cap of the MLP from 1 to 50 and record train/test
# accuracy in percent. A new network is fitted from scratch for every value.
iterations = list(range(1, 51))  # 1500? takes a long time
trainScores = []
testScores = []

for i in iterations:
    model = MLPClassifier(max_iter=i, random_state=10)
    model.fit(X_train, y_train)

    trainPrediction = model.predict(X_train)
    testPrediction = model.predict(X_test)

    trainAccuracy = accuracy_score(y_train, trainPrediction) * 100
    testAccuracy = accuracy_score(y_test, testPrediction) * 100
    trainScores.append(trainAccuracy)
    testScores.append(testAccuracy)

    print('>%d, train: %.3f, test: %.3f' % (i, trainAccuracy, testAccuracy))

for curve, curveName in ((trainScores, 'Train'), (testScores, 'Test')):
    pyplot.plot(iterations, curve, '-o', label=curveName)
pyplot.legend()
pyplot.show()

print("The point of NO overfitting is XX")
# `model` is the network from the last loop pass (max_iter=50).
print("Score:")
print(model.score(X_test, y_test) * 100)

In [None]:
from sklearn.neural_network import MLPClassifier

# Fit one MLP with a generous iteration cap and report its accuracy on both
# splits (as fractions, two decimals).
mlp = MLPClassifier(max_iter=1000, random_state=10)
mlp.fit(X_train, y_train)

for template, inputs, targets in (
    ("Accuracy on training set:{:.2f}", X_train, y_train),
    ("Accuracy on test set:{:.2f}", X_test, y_test),
):
    print(template.format(mlp.score(inputs, targets)))
