# Setup

## Libs

In [None]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
 
sns.set_style("white") # changes the default style of seaborn plots
%matplotlib inline

## Load file

In [None]:
# Read the dataset from its project-relative CSV path and preview the first rows.
csv_path = "data/heart_2020_cleaned.csv"
df = pd.read_csv(csv_path)
df.head()

## Clean (i hope)

### Clean Yes/No to 1/0

In [None]:
# Columns that store Yes/No answers as strings and must become 0/1 integers.
column_to_change = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "Diabetic", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]

# The same mapping is applied to every listed column. The two longer answers are
# folded into 0 on purpose (kept from the original mapping).
# NOTE(review): presumably only "Diabetic" contains them -- confirm against the data.
yes_no_map = {"No": 0, "Yes": 1, "No, borderline diabetes": 0, "Yes (during pregnancy)": 0}


def _to_binary(column):
    """Return `column` with its Yes/No strings mapped to 0/1 and an integer dtype.

    Values that are not keys of `yes_no_map` pass through unchanged, so running
    the cell again on already-converted data is a no-op. The explicit
    `astype(int)` replaces the implicit object-to-int downcast that
    `DataFrame.replace` used to perform (deprecated in pandas 2.2), and raises
    if a column still holds an unexpected string or a missing value instead of
    silently leaving it as `object`.
    """
    return column.map(lambda value: yes_no_map.get(value, value)).astype(int)


# `assign` builds a new frame and keeps the original column order.
df = df.assign(**{name: _to_binary(df[name]) for name in column_to_change})
df.head()

### Convert categorical columns to dummy variables

In [None]:
# Keep a handle on the frame before its categorical columns are expanded into dummies.
# NOTE: this binds a second name to the same DataFrame object (no copy is made), so the
# in-place `df.drop(..., inplace=True)` in the next cell also removes those columns
# from `heart_orig`.
heart_orig = df

In [None]:
# The categorical features that are not yes/no still have dtype `object`;
# one-hot encode them into dummy (indicator) columns.
categoricals = df.select_dtypes(include=['object'])
cat_dummies = pd.get_dummies(categoricals)
# Remove the original string columns in place, then append their dummy encodings.
df.drop(columns=categoricals.columns.tolist(), inplace=True)
df = pd.concat([df, cat_dummies], axis=1)

In [None]:
# Show the first 100 rows of the encoded frame.
df.iloc[:100]

# Analyse

## Correlation

### With categorical data split into dummy columns

In [None]:
# Annotated correlation heatmap over every column of the dummy-encoded frame.
corr_with_dummies = df.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_with_dummies, annot=True)
plt.show()

### Without categorical data

In [None]:
# Annotated correlation heatmap over the numeric columns of `heart_orig` only.
# Selecting them explicitly means this cell no longer depends on the string columns
# having been removed from `heart_orig` as a side effect of the in-place drop on `df`
# (recent pandas versions raise when `corr()` meets a non-numeric column).
numeric_columns = heart_orig.select_dtypes(include="number")
plt.figure(figsize=(20, 10))
sns.heatmap(numeric_columns.corr(), annot=True)
plt.show()

# Setup for models

## Splitting dataset

In [None]:
# Goal: estimate the chance of heart disease from the other variables, so the
# HeartDisease column is the target and every remaining column is a feature.
# Note: `pop` removes the column from `df` itself and `X` is the same object as `df`,
# so running this cell a second time raises KeyError.
target_column = 'HeartDisease'
y = df.pop(target_column)
X = df
X.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = dict(train_size=0.7, test_size=0.3, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train.head()

In [None]:
# Same 70/30 split as the previous cell: identical inputs and seed, so the result is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)
X_train.head()

## Scale

In [None]:
# Standardize the features (zero mean, unit variance per column).
# The statistics are learned from the training split only and then reused for the test
# split. Scaling the test set with its own mean/std would put the two splits on
# different scales, and scaling before the split would leak test information into training.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Rebuild DataFrames with the original column names and row index so that
# X_train / X_test stay aligned with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Baseline models (not optimized)

## XGBoost Classifier with Bagging and Boosting

In [None]:
import xgboost as xgb
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score


# Base learner: an XGBoost classifier with its default hyper-parameters.
xgb_cl = xgb.XGBClassifier()

# Bagging ensemble of 5 copies of the base learner, each fitted on a bootstrap sample.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name removed in 1.4.
# `random_state` makes the bootstrap draws reproducible between runs.
# NOTE(review): an integer `max_samples=50` means 50 rows per estimator, not 50% --
# confirm that such a small sample is intended.
bagging = BaggingClassifier(xgb_cl, n_estimators=5, max_samples=50, bootstrap=True, random_state=23)

# Train the ensemble and report its accuracy on both splits.
bagging.fit(X_train, y_train)
print(f"Train score: {bagging.score(X_train, y_train)}")
print(f"Test score: {bagging.score(X_test, y_test)}")

# Reference point: the same classifier fitted on its own on the full training split.
xgb_cl.fit(X_train, y_train)
preds = xgb_cl.predict(X_test)

# Last expression of the cell, so the notebook displays the accuracy.
accuracy_score(y_test, preds)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics  # provides accuracy_score for the evaluation below


# Base learner: an XGBoost classifier with its default hyper-parameters.
xgb_cl = xgb.XGBClassifier()

# AdaBoost over 5 boosting rounds of the base learner.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name removed in 1.4.
adaboost = AdaBoostClassifier(xgb_cl, n_estimators=5, learning_rate=0.1, random_state=23)

# Train the ensemble and report its accuracy on both splits.
adaboost.fit(X_train, y_train)
print(f"Train score: {adaboost.score(X_train, y_train)}")
print(f"Test score: {adaboost.score(X_test, y_test)}")

# Reference point: the same classifier fitted on its own on the full training split.
xgb_cl.fit(X_train, y_train)
preds = xgb_cl.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, preds))

## Decision Tree Classifier with Bagging and Boosting

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics  # provides accuracy_score for the evaluation below


# Base learner: a depth-3 decision tree with a fixed seed.
tree = DecisionTreeClassifier(max_depth=3, random_state=23)

# Bagging ensemble of 5 copies of the tree, each fitted on a bootstrap sample.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name removed in 1.4.
# `random_state` makes the bootstrap draws reproducible between runs.
# NOTE(review): an integer `max_samples=50` means 50 rows per estimator, not 50% --
# confirm that such a small sample is intended.
bagging = BaggingClassifier(tree, n_estimators=5, max_samples=50, bootstrap=True, random_state=23)

# Train the ensemble and report its accuracy on both splits.
bagging.fit(X_train, y_train)
print(f"Train score: {bagging.score(X_train, y_train)}")
print(f"Test score: {bagging.score(X_test, y_test)}")

# Reference point: the single tree fitted on its own on the full training split.
tree = tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Base learner: a depth-2 decision tree with a fixed seed.
tree = DecisionTreeClassifier(max_depth=2, random_state=23)

# AdaBoost over 5 boosting rounds of the tree.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name removed in 1.4.
adaboost = AdaBoostClassifier(tree, n_estimators=5, learning_rate=0.1, random_state=23)

# Train the ensemble and report its accuracy on both splits.
adaboost.fit(X_train, y_train)
print(f"Train score: {adaboost.score(X_train, y_train)}")
print(f"Test score: {adaboost.score(X_test, y_test)}")

# Reference point: the single tree fitted on its own on the full training split.
tree = tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

## Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit on the training split, then score the test-set predictions.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Seeded logistic regression: fit on the training split, then score the test-set predictions.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy : ", test_accuracy)

## K Neighbours Classifier

In [None]:
# Disabled K-nearest-neighbours experiment: every statement below is commented out,
# so this cell currently does nothing.
# NOTE(review): presumably switched off because of its run time on this dataset --
# confirm, then either re-enable it or delete the cell.
# from sklearn.neighbors import KNeighborsClassifier
 
# knn = KNeighborsClassifier(n_neighbors=8)
# knn.fit(X_train, y_train)
# y_pred = knn.predict(X_test)
# print(accuracy_score(y_test, y_pred))

## Which models are promising?