# Assignment IF3170 - Artificial Intelligence
## Web Application for Classifying Heart Disease from Clinical Data


## Phase A - Find Best Model

#### Group 3 - Unexpected
    - Rizki Alif Salman Alfarisy / 13516005
    - Jonathan Alvaro / 13516023
    - Joseph Salimin / 13516037
    - Kevin Leonardo Limitius / 13516049
    - Kevin Basuki / 13516071

### First of All: Import All Required Libraries

In [4]:
import numpy as np 
import random
import pandas as pd 
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import itertools

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn import tree
from sklearn.impute import SimpleImputer

from sklearn import svm, datasets
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.externals import joblib

from pprint import pprint

%matplotlib inline

### Read Data From CSV

In [5]:
# Load the training and test CSV files into DataFrames
heart_train, heart_test = map(
    pd.read_csv,
    ('tubes2_HeartDisease_train.csv', 'tubes2_HeartDisease_test.csv'),
)

### Rename Column Names

In [6]:
from copy import deepcopy

# Map the generic Column1..Column13 names to descriptive feature names
test_columns = {
    'Column1': 'age',
    'Column2': 'sex',
    'Column3': 'chest_pain_type',
    'Column4': 'resting_blood_pressure',
    'Column5': 'serum_cholesterol',
    'Column6': 'fasting_blood_sugar',
    'Column7': 'resting_ecg',
    'Column8': 'max_heart_rate_achieved',
    'Column9': 'exercise_induced_angina',
    'Column10': 'st_depression',
    'Column11': 'peak_exercise_st_segment',
    'Column12': 'num_major_flourosopy',
    'Column13': 'thal'
}

# The training mapping is the test mapping plus the diagnosis column
train_columns = {**test_columns, 'Column14': 'heart_disease_diagnosis'}

# Apply the descriptive names to both frames
heart_test = heart_test.rename(columns=test_columns)
heart_train = heart_train.rename(columns=train_columns)

### Data Analysis

In [7]:
# Inspect the raw training frame: column dtypes, the first rows, and the
# per-column count of missing (NaN) values.
print('Column data heart train')
pprint(heart_train.dtypes)
print()
print('Show heart train head')
pprint(heart_train.head())
print()
print('Find sum value undefined in each column')
# Last expression of the cell, so the notebook displays the NaN counts
heart_train.isna().sum()

Column data heart train
age                          int64
sex                          int64
chest_pain_type              int64
resting_blood_pressure      object
serum_cholesterol           object
fasting_blood_sugar         object
resting_ecg                 object
max_heart_rate_achieved     object
exercise_induced_angina     object
st_depression               object
peak_exercise_st_segment    object
num_major_flourosopy        object
thal                        object
heart_disease_diagnosis      int64
dtype: object

Show heart train head
   age  sex  chest_pain_type resting_blood_pressure serum_cholesterol  \
0   54    1                4                    125               216   
1   55    1                4                    158               217   
2   54    0                3                    135               304   
3   48    0                3                    120               195   
4   50    1                4                    120                 0   

  fasting_

age                         0
sex                         0
chest_pain_type             0
resting_blood_pressure      0
serum_cholesterol           0
fasting_blood_sugar         0
resting_ecg                 1
max_heart_rate_achieved     0
exercise_induced_angina     0
st_depression               0
peak_exercise_st_segment    0
num_major_flourosopy        0
thal                        0
heart_disease_diagnosis     0
dtype: int64

From the results above, we can conclude the following about the data loaded from the CSV:
1. Not all of the data is numeric
2. Some values are '?'
3. Some values are undefined (NaN)

**Our conclusion**: 

The issues above can disturb the modeling process. Because of that, we need to do some pre-processing to ensure that there is little to no noise in the data used to train the model.

## Dataframe Conversion to Numeric 

In [9]:
# Convert string to numeric; errors='coerce' turns every value that cannot
# be parsed as a number into NaN instead of raising
heart_train = heart_train.apply(pd.to_numeric, errors = 'coerce')
# Print the resulting dtype of every column
print('Data type of columns after conversion')
print(heart_train.dtypes)
print()
# Show the per-column NaN count after the conversion
print('Total value NaN after heart_train converted to numeric value')
print(heart_train.isna().sum())

Data type of columns after conversion
age                           int64
sex                           int64
chest_pain_type               int64
resting_blood_pressure      float64
serum_cholesterol           float64
fasting_blood_sugar         float64
resting_ecg                 float64
max_heart_rate_achieved     float64
exercise_induced_angina     float64
st_depression               float64
peak_exercise_st_segment    float64
num_major_flourosopy        float64
thal                        float64
heart_disease_diagnosis       int64
dtype: object

Total value NaN after heart_train converted to numeric value
age                           0
sex                           0
chest_pain_type               0
resting_blood_pressure       47
serum_cholesterol            24
fasting_blood_sugar          78
resting_ecg                   2
max_heart_rate_achieved      44
exercise_induced_angina      44
st_depression                49
peak_exercise_st_segment    262
num_major_flourosopy        51

#### Removing NaN Values

In the process above, we first converted the data from object (string) to numeric values. This successfully removed the object values from the dataframe, but it introduced a new problem: every value that cannot be converted to a numeric data type becomes NaN, and NaN values make the dataframe hard to process.

One of the easiest ways to get rid of NaN values is to drop every row that contains one. That is not feasible here: column 12 (`num_major_flourosopy`) alone has 514 rows whose value is NaN, so dropping them would remove 514 rows and discard a large part of the training data. Another way is to replace NaN values with the median. We choose the median rather than the mean because the median is much more stable than the mean for irregular data (outliers).

For categorical data, however, missing values cannot be filled with the median, because that can produce a meaningless value (one that is not among the existing categories). Therefore, specifically for the categorical attributes, the mode is used to replace the missing values.

Finally, to reduce noise in the data, rows that have NaN values in more than 3 attributes are removed.

In [None]:
# Drop rows with too many NaNs: thresh=10 keeps only rows that have at least
# 10 non-NaN values.
# NOTE(review): the accompanying text says rows with NaN in more than 3
# attributes are removed; with 14 columns thresh=10 still keeps rows that
# have 4 NaNs -- confirm which is intended.
heart_train = heart_train.dropna(thresh=10)

# Fill NaN in the categorical columns with the column mode. The values are
# passed to DataFrame.fillna as a per-column dict instead of calling
# `heart_train[col].fillna(..., inplace=True)`: that chained in-place form
# operates on a temporary object and does not update the frame when pandas
# Copy-on-Write is active.
categorical_columns = [
    'thal',
    'peak_exercise_st_segment',
    'chest_pain_type',
    'resting_ecg',
    'fasting_blood_sugar',
]
mode_by_column = {
    column: heart_train[column].mode()[0] for column in categorical_columns
}
heart_train = heart_train.fillna(value=mode_by_column)

# Every NaN that is still left (the continuous columns) is replaced by the
# column median.
imp = SimpleImputer(missing_values=np.nan, strategy='median')

# fit_transform returns a plain NumPy array, so the column names are put
# back; the rebuilt frame gets a fresh 0..n-1 index.
c = heart_train.columns
heart_train = pd.DataFrame(imp.fit_transform(heart_train), columns=c)

# Count NaN value
print('Total NaN Value')
print(heart_train.isna().sum())

# Splitting of Categorical Data

For some categorical attributes, the possible categories are represented by the values 1 to X. However, such values imply an ordering between the categories (1 < 2 would mean that category 1 is better/worse than category 2). Therefore, certain attributes are split into X new binary attributes, where X is the number of possible categories for that attribute.

In [None]:
def upsloping(row):
    """Return 1 if the row's peak_exercise_st_segment equals 1, otherwise 0."""
    return 1 if row["peak_exercise_st_segment"] == 1 else 0

def flat(row):
    """Return 1 if the row's peak_exercise_st_segment equals 2, otherwise 0."""
    return 1 if row["peak_exercise_st_segment"] == 2 else 0
    
def downsloping(row):
    """Return 1 if the row's peak_exercise_st_segment equals 3, otherwise 0."""
    return 1 if row["peak_exercise_st_segment"] == 3 else 0
    
# One-hot encode peak_exercise_st_segment into three 0/1 columns (codes 1, 2
# and 3). The comparison is done on the whole column at once rather than
# through a row-wise DataFrame.apply, which calls a Python lambda per row.
heart_train['upsloping'] = (heart_train["peak_exercise_st_segment"] == 1).astype(int)
heart_train['flat'] = (heart_train["peak_exercise_st_segment"] == 2).astype(int)
heart_train['downsloping'] = (heart_train["peak_exercise_st_segment"] == 3).astype(int)

# The original coded column is no longer needed once it has been split
heart_train = heart_train.drop('peak_exercise_st_segment', axis=1)
# Last expression of the cell, so the notebook displays the resulting frame
heart_train

# Binarizing Data

For attributes such as resting blood pressure and resting ECG, the data is binarized (converted into the value 1 or 0). This is done because both attributes can be simplified into two categories, healthy or not (encoded as 0 and 1). It makes the data simpler than it was before binarization: for resting blood pressure, for example, without binarization the trained models would have to find weights that can separate the values in the range [120, 140] from the rest.

In [None]:
# Collapse resting_ecg to a flag: values below 1 become 0 (normal),
# values of 1 or more become 1
heart_train['resting_ecg'] = heart_train['resting_ecg'].ge(1).astype(int)

In [None]:
# Collapse resting blood pressure to a flag: 0 inside [120, 140] (healthy),
# 1 when the value is above 140 or below 120
heart_train['resting_blood_pressure'] = (
    heart_train['resting_blood_pressure'].gt(140)
    | heart_train['resting_blood_pressure'].lt(120)
).astype(int)

In [None]:
# Split the training frame into the target vector Y and the feature matrix X
Y = heart_train['heart_disease_diagnosis']
X = heart_train.drop('heart_disease_diagnosis', axis=1)

# Rescale every feature to [0, 1]. StandardScaler was also tried;
# MinMaxScaler is the best option found so far.
# NOTE(review): the scaler is fitted on the complete data set before
# cross-validation, so every fold's training data has seen the min/max of
# its test split. Consider fitting it per fold (e.g. inside a Pipeline).
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

# Shuffled 10-fold splitter. The seed is fixed so that the fold assignment,
# and therefore the reported averages, are reproducible between runs.
KF = KFold(10, shuffle=True, random_state=42)

## Gaussian Naive-Bayes

In [None]:
# Gaussian Naive Bayes scored with 10-fold cross-validation
gnb = GaussianNB()
results = cross_validate(gnb, X, Y, cv=10)
pprint(results)

# Mean of the per-fold test scores
fold_scores = results['test_score']
print(sum(fold_scores) / len(fold_scores))

In [None]:
# Manual K-fold evaluation of Gaussian Naive Bayes: accumulate accuracy,
# macro precision and macro recall over the folds produced by KF.
sum_acc = 0
sum_prec = 0
sum_rec = 0
for trainidx, testidx in KF.split(X):
    gnb = GaussianNB()

    X_train, X_test = X[trainidx], X[testidx]
    # KFold yields positional indices, so the labels are selected by
    # position (.iloc) rather than by index label.
    Y_train, Y_test = Y.iloc[trainidx], Y.iloc[testidx]
    gnb.fit(X_train, Y_train)

    # Predict once per fold and reuse the result for all three metrics
    Y_pred = gnb.predict(X_test)
    accuration = metrics.accuracy_score(Y_test, Y_pred)
    precision = metrics.precision_score(Y_test, Y_pred, average="macro")
    recall = metrics.recall_score(Y_test, Y_pred, average="macro")

    sum_acc += accuration
    sum_prec += precision
    sum_rec += recall

# Score the same estimator type with scikit-learn's own 10-fold helper
results = cross_validate(gnb, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

# Average over the number of folds actually produced by KF
n_folds = KF.get_n_splits()
print("Average Accuration : {0:.4f}".format(sum_acc/n_folds))
print("Average Precision : {0:.4f}".format(sum_prec/n_folds))
print("Average Recall : {0:.4f}".format(sum_rec/n_folds))

## Decision Tree

In [None]:
# Decision tree (default hyper-parameters) scored with 10-fold cross-validation
dt = tree.DecisionTreeClassifier()
results = cross_validate(dt, X, Y, cv=10)
pprint(results)

# Mean of the per-fold test scores
fold_scores = results['test_score']
print(sum(fold_scores) / len(fold_scores))

In [None]:
# Manual K-fold evaluation of the decision tree: accumulate accuracy, macro
# precision and macro recall over the folds produced by KF.
sum_acc = 0
sum_prec = 0
sum_rec = 0
for trainidx, testidx in KF.split(X):
    dt = tree.DecisionTreeClassifier()

    X_train, X_test = X[trainidx], X[testidx]
    # KFold yields positional indices, so the labels are selected by
    # position (.iloc) rather than by index label.
    Y_train, Y_test = Y.iloc[trainidx], Y.iloc[testidx]
    dt.fit(X_train, Y_train)

    # Predict once per fold and reuse the result for all three metrics
    Y_pred = dt.predict(X_test)
    accuration = metrics.accuracy_score(Y_test, Y_pred)
    precision = metrics.precision_score(Y_test, Y_pred, average="macro")
    recall = metrics.recall_score(Y_test, Y_pred, average="macro")

    sum_acc += accuration
    sum_prec += precision
    sum_rec += recall

# Score the same estimator type with scikit-learn's own 10-fold helper
results = cross_validate(dt, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

# Average over the number of folds actually produced by KF
n_folds = KF.get_n_splits()
print("Average Accuration : {0:.4f}".format(sum_acc/n_folds))
print("Average Precision : {0:.4f}".format(sum_prec/n_folds))
print("Average Recall : {0:.4f}".format(sum_rec/n_folds))

## KNN

In [None]:
# Distance-weighted KNN with 44 neighbours, scored with 5-fold cross-validation
knn = KNeighborsClassifier(n_neighbors=44, weights='distance')
results = cross_validate(knn, X, Y, cv=5)
pprint(results)

# Mean of the per-fold test scores
fold_scores = results['test_score']
print(sum(fold_scores) / len(fold_scores))

In [None]:
# Manual K-fold evaluation of KNN: accumulate accuracy, macro precision and
# macro recall over the folds produced by KF.
sum_acc = 0
sum_prec = 0
sum_rec = 0
for trainidx, testidx in KF.split(X):
    knn = KNeighborsClassifier(n_neighbors=44, weights='distance')

    X_train, X_test = X[trainidx], X[testidx]
    # KFold yields positional indices, so the labels are selected by
    # position (.iloc) rather than by index label.
    Y_train, Y_test = Y.iloc[trainidx], Y.iloc[testidx]
    knn.fit(X_train, Y_train)

    # Predict once per fold and reuse the result for all three metrics
    Y_pred = knn.predict(X_test)
    accuration = metrics.accuracy_score(Y_test, Y_pred)
    precision = metrics.precision_score(Y_test, Y_pred, average="macro")
    recall = metrics.recall_score(Y_test, Y_pred, average="macro")

    sum_acc += accuration
    sum_prec += precision
    sum_rec += recall

# Score the same estimator type with scikit-learn's own helper.
# NOTE(review): this uses 5 folds while the other models use 10 -- confirm
# that the difference is intended.
results = cross_validate(knn, X, Y, cv=5)
pprint(results)

print(sum(results['test_score'])/5)

# Average over the number of folds actually produced by KF
n_folds = KF.get_n_splits()
print("Average Accuration : {0:.4f}".format(sum_acc/n_folds))
print("Average Precision : {0:.4f}".format(sum_prec/n_folds))
print("Average Recall : {0:.4f}".format(sum_rec/n_folds))

## MLP Classifier

In [None]:
# MLP with one hidden layer of 2 units, trained with SGD and an adaptive
# learning rate, scored with 10-fold cross-validation.
# hidden_layer_sizes is written as the one-element tuple (2,): the previous
# `(2)` is just the integer 2 in parentheses, not a tuple.
mlp = MLPClassifier(hidden_layer_sizes=(2,), solver='sgd',
                    max_iter=1000, learning_rate_init=0.1, learning_rate='adaptive',
                    activation='identity')

results = cross_validate(mlp, X, Y, cv=10)
pprint(results)

# Mean of the per-fold test scores
print(sum(results['test_score'])/10)

In [None]:
# Manual K-fold evaluation of the MLP: accumulate accuracy, macro precision
# and macro recall over the folds produced by KF.
sum_acc = 0
sum_prec = 0
sum_rec = 0
for trainidx, testidx in KF.split(X):
    # (2,) is a one-element tuple: a single hidden layer with 2 units
    mlp = MLPClassifier(hidden_layer_sizes=(2,), solver='sgd',
                        max_iter=1000, learning_rate_init=0.1, learning_rate='adaptive',
                        activation='identity')

    X_train, X_test = X[trainidx], X[testidx]
    # KFold yields positional indices, so the labels are selected by
    # position (.iloc) rather than by index label.
    Y_train, Y_test = Y.iloc[trainidx], Y.iloc[testidx]
    mlp.fit(X_train, Y_train)

    # Predict once per fold and reuse the result for all three metrics
    Y_pred = mlp.predict(X_test)
    accuration = metrics.accuracy_score(Y_test, Y_pred)
    precision = metrics.precision_score(Y_test, Y_pred, average="macro")
    recall = metrics.recall_score(Y_test, Y_pred, average="macro")

    sum_acc += accuration
    sum_prec += precision
    sum_rec += recall

# Score the same estimator type with scikit-learn's own 10-fold helper
results = cross_validate(mlp, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

# Average over the number of folds actually produced by KF
n_folds = KF.get_n_splits()
print("Average Accuration : {0:.4f}".format(sum_acc/n_folds))
print("Average Precision : {0:.4f}".format(sum_prec/n_folds))
print("Average Recall : {0:.4f}".format(sum_rec/n_folds))

## Save model

In [None]:
# Save model using Joblib
# Pick whichever model performed best
# NOTE(review): `knn` was last assigned and fitted inside the K-fold loop, so
# presumably this is the estimator trained on the final fold's training
# split -- confirm whether it should be refit on all data before saving.
# The fitted `scaler` is not saved in this cell.

#joblib.dump(gnb, 'naivebayes.pkl')
#joblib.dump(dt, 'decisiontree.pkl')  # `tree` is the sklearn module; the classifier variable is `dt`
joblib.dump(knn, 'knn.pkl')
#joblib.dump(mlp, 'mlp.pkl')

## Load model

In [None]:
# Load model using Joblib
# Pick whichever model performed best

#gnb = joblib.load('naivebayes.pkl')
#dt = joblib.load('decisiontree.pkl')  # assigning to `tree` would shadow the sklearn `tree` module
knn = joblib.load('knn.pkl')
#mlp = joblib.load('mlp.pkl')

## Model Evaluation