# JONBUSUG

## Import All Library Needed

In [1]:
import numpy as np 
import random
import pandas as pd 
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import itertools

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn import tree
from sklearn.impute import SimpleImputer

from sklearn import svm, datasets
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from pprint import pprint

%matplotlib inline

## Read Data From CSV

In [2]:
heart_train = pd.read_csv('tubes2_HeartDisease_train.csv')
heart_test = pd.read_csv('tubes2_HeartDisease_test.csv')

## Rename Column Names

In [3]:
from copy import deepcopy

test_columns = {
    'Column1': 'age',
    'Column2': 'sex',
    'Column3': 'chest_pain_type',
    'Column4': 'resting_blood_pressure',
    'Column5': 'serum_cholesterol',
    'Column6': 'fasting_blood_sugar',
    'Column7': 'resting_ecg',
    'Column8': 'max_heart_rate_achieved',
    'Column9': 'exercise_induced_angina',
    'Column10': 'st_depression',
    'Column11': 'peak_exercise_st_segment',
    'Column12': 'num_major_flourosopy',
    'Column13': 'thal'
}
train_columns = test_columns.copy()
train_columns['Column14'] = 'heart_disease_diagnosis'
# Rename columns
heart_train = heart_train.rename(columns=train_columns)
heart_test = heart_test.rename(columns=test_columns)

## Data Analysis

In [4]:
print('Column data heart train')
pprint(heart_train.dtypes)
print()
print('Show heart train head')
pprint(heart_train.head())
print()
print('Find sum value undefined in each column')
heart_train.isna().sum()

Column data heart train
age                          int64
sex                          int64
chest_pain_type              int64
resting_blood_pressure      object
serum_cholesterol           object
fasting_blood_sugar         object
resting_ecg                 object
max_heart_rate_achieved     object
exercise_induced_angina     object
st_depression               object
peak_exercise_st_segment    object
num_major_flourosopy        object
thal                        object
heart_disease_diagnosis      int64
dtype: object

Show heart train head
   age  sex  chest_pain_type resting_blood_pressure serum_cholesterol  \
0   54    1                4                    125               216   
1   55    1                4                    158               217   
2   54    0                3                    135               304   
3   48    0                3                    120               195   
4   50    1                4                    120                 0   

  fasting_

age                         0
sex                         0
chest_pain_type             0
resting_blood_pressure      0
serum_cholesterol           0
fasting_blood_sugar         0
resting_ecg                 1
max_heart_rate_achieved     0
exercise_induced_angina     0
st_depression               0
peak_exercise_st_segment    0
num_major_flourosopy        0
thal                        0
heart_disease_diagnosis     0
dtype: int64

Berdasarkan dengan beberapa pengecekan di atas, dapat dilihat bahwa data pada csv yang diberikan:
1. Tidak semua data bertipe numerik
2. Ada beberapa data yang bernilai '?'
3. Ada data yang bernilai NaN (undefined)

Hal tersebut dapat mengganggu proses pemodelan. Oleh karena itu perlu dilakukan pre-processing sebagai berikut :

## Dataframe Conversion to Numeric 

In [5]:
# Convert string to numeric, convert non-number to NAN
heart_train = heart_train.apply(pd.to_numeric, errors = 'coerce')
# heart_train = heart_train.replace('?', np.NaN)

print('Data type of columns after conversion')
print(heart_train.dtypes)
print()

# NaN count
print('Total value NaN after heart_train converted to numeric value')
print(heart_train.isna().sum())

Data type of columns after conversion
age                           int64
sex                           int64
chest_pain_type               int64
resting_blood_pressure      float64
serum_cholesterol           float64
fasting_blood_sugar         float64
resting_ecg                 float64
max_heart_rate_achieved     float64
exercise_induced_angina     float64
st_depression               float64
peak_exercise_st_segment    float64
num_major_flourosopy        float64
thal                        float64
heart_disease_diagnosis       int64
dtype: object

Total value NaN after heart_train converted to numeric value
age                           0
sex                           0
chest_pain_type               0
resting_blood_pressure       47
serum_cholesterol            24
fasting_blood_sugar          78
resting_ecg                   2
max_heart_rate_achieved      44
exercise_induced_angina      44
st_depression                49
peak_exercise_st_segment    262
num_major_flourosopy        51

#### Menghilangkan nilai NaN

Pada pre-processingnya, konversi data dari object (string) menjadi numerik berhasil menghilagkan tipe objek dari dataframe. Namun, untuk value yang tidak dapat dikonversi menjadi angka akan bernilai NaN yang membuat dataframe tidak bisa diolah. 

Salah satu cara termudah untuk menghilangkan nilai NaN adalah dengan cara menghapus row yang mengandung nilai tersebut, namun melihat pada column 12 terdapat 514 row yang bernilai NaN, cara ini tidak feasible karena akan sangat mengurangi data training. Oleh karena itum, kami memutuskan untuk me-replace nilai NaN dengan median. Pemilihan median dilakukan karena nilai median lebih stabil dibanding mean terhadap data-data iregular (outliers).

Akan tetapi, pengisian data yang bersifat categorical tidak bisa dilakukan dengan nilai median karena akan menghasilkan nilai yang tidak bermakna (bukan termasuk kategori yang ada). Oleh karena itu, khusus untuk atribut-atribut yang bersifat kategori, digunakan modus untuk mengganti nilai-nilai yang ilang.

Terakhir, untuk mengurangi noise pada data, dilakukan penghapusan baris-baris yang memiliki nilai NaN pada lebih dari 3 atribut.

In [1]:
# Drop row with too many NaNs
heart_train = heart_train.dropna(thresh=10)

# # Fill NaN, median for continuous data, mode for categorical data
heart_train['thal'].fillna(heart_train['thal'].mode()[0], inplace=True)
heart_train['peak_exercise_st_segment'].fillna(heart_train['peak_exercise_st_segment'].mode()[0], inplace=True)
heart_train['chest_pain_type'].fillna(heart_train['chest_pain_type'].mode()[0], inplace=True)
heart_train['resting_ecg'].fillna(heart_train['resting_ecg'].mode()[0], inplace=True)
heart_train['fasting_blood_sugar'].fillna(heart_train['fasting_blood_sugar'].mode()[0], inplace=True)

imp = SimpleImputer(missing_values=np.nan, strategy='median')

c = heart_train.columns
heart_train = pd.DataFrame(imp.fit_transform(heart_train))
heart_train.columns = c

# Count NaN value
print('Total NaN Value')
print(heart_train.isna().sum())

NameError: name 'heart_train' is not defined

# Splitting of Categorical Data

Pada beberapa data yang bersifat kategori, kategori-kategori yang mungkin direpresentasikan dengan nilai 1-X. akan tetapi, nilai-nilai tersebut mengimplikasikan adanya ordering antara kategori-kategori tersebut (1 < 2, berarti kategori 1 lebih baik/buruk dari kategori 2). Oleh karena itu, pada atribut-atribut tertentu, dilakukan pemecahan atribut menjadi X atribut binary baru, dengan X berupa jumlah kategori yang mungkin untuk atribut tersebut.

In [8]:
def upsloping(row):
    if row["peak_exercise_st_segment"] == 1: return 1
    else: return 0

def flat(row):
    if row["peak_exercise_st_segment"] == 2: return 1
    else: return 0
    
def downsloping(row):
    if row["peak_exercise_st_segment"] == 3: return 1
    else: return 0
    
heart_train['upsloping'] = heart_train.apply(lambda row: row["peak_exercise_st_segment"] == 1, axis=1).astype(int)
heart_train['flat'] = heart_train.apply(lambda row: row["peak_exercise_st_segment"] == 2, axis=1).astype(int)
heart_train['downsloping'] = heart_train.apply(lambda row: row["peak_exercise_st_segment"] == 3, axis=1).astype(int)
heart_train = heart_train.drop('peak_exercise_st_segment', axis=1)
heart_train

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,num_major_flourosopy,thal,heart_disease_diagnosis,upsloping,flat,downsloping
0,54.0,1.0,4.0,125.0,216.0,0.0,0.0,140.0,0.0,0.0,0.0,3.0,1.0,0,1,0
1,55.0,1.0,4.0,158.0,217.0,0.0,0.0,110.0,1.0,2.5,0.0,3.0,1.0,0,1,0
2,54.0,0.0,3.0,135.0,304.0,1.0,0.0,170.0,0.0,0.0,0.0,3.0,0.0,1,0,0
3,48.0,0.0,3.0,120.0,195.0,0.0,0.0,125.0,0.0,0.0,0.0,3.0,0.0,0,1,0
4,50.0,1.0,4.0,120.0,0.0,0.0,1.0,156.0,1.0,0.0,0.0,6.0,3.0,1,0,0
5,64.0,0.0,4.0,130.0,303.0,0.0,0.0,122.0,0.0,2.0,2.0,3.0,0.0,0,1,0
6,63.0,1.0,4.0,130.0,308.0,0.0,0.0,138.0,1.0,2.0,0.0,3.0,2.0,0,1,0
7,58.0,1.0,2.0,130.0,251.0,0.0,0.0,110.0,0.0,0.0,0.0,3.0,0.0,0,1,0
8,42.0,1.0,2.0,150.0,268.0,0.0,0.0,136.0,0.0,0.0,0.0,3.0,0.0,0,1,0
9,54.0,1.0,3.0,120.0,258.0,0.0,2.0,147.0,0.0,4.0,0.0,7.0,0.0,0,1,0


# Binarizing Data

Untuk data-data seperti resting blood pressure dan resting ecg, dilakukan binarisasi data (mengubah data menjadi nilai 1 atau 0). Hal ini dilakukan karena kedua kategori tersebut bisa disimplifikasi menjadi kategori sehat atau tidak (1 atau 0). Hal ini membuat data lebih simpel ketimbang sebelum dilakukan binarisasi, misalkan, untuk resting blood pressure, tanpa binarisasi, model-model yang dilatih harus bisa menemukan weight yang bisa memisahkan antara nilai pada range [120, 140] dan sisanya.

In [2]:
# Binarize resting_ecg, normal as 0 and everything else as 1
heart_train['resting_ecg'] = (heart_train['resting_ecg'] >= 1).astype(int)

NameError: name 'heart_train' is not defined

In [3]:
# Binarize resting blood pressure to 0 for [120, 140] (healthy) and 1 otherwise
heart_train['resting_blood_pressure'] = ((heart_train['resting_blood_pressure'] > 140) | (heart_train['resting_blood_pressure'] < 120)).astype(int)

NameError: name 'heart_train' is not defined

In [12]:
# Split data train
# heart_train_copy = heart_train.copy()
Y = heart_train['heart_disease_diagnosis']
X = heart_train.drop('heart_disease_diagnosis', axis = 1)

# scaler = StandardScaler().fit(X)
# X = scaler.transform(X)

# Best so far
scaler = MinMaxScaler(feature_range=(0,1))
X = scaler.fit_transform(X)

KF = KFold(10, shuffle=True)

  return self.partial_fit(X, y)


## Gaussian Naive-Bayes

In [13]:
gnb = GaussianNB()

results = cross_validate(gnb, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

{'fit_time': array([0.00161839, 0.00110126, 0.00108171, 0.0009706 , 0.00095749,
       0.00097084, 0.00098014, 0.00096345, 0.00116539, 0.00093818]),
 'score_time': array([0.00052309, 0.0004673 , 0.00043941, 0.00041103, 0.00042486,
       0.00043273, 0.00047517, 0.00042558, 0.000458  , 0.00042772]),
 'test_score': array([0.51315789, 0.56      , 0.54666667, 0.59459459, 0.60273973,
       0.50684932, 0.63013699, 0.5890411 , 0.57746479, 0.6056338 ]),
 'train_score': array([0.61246201, 0.61456753, 0.60849772, 0.6030303 , 0.602118  ,
       0.62934947, 0.60816944, 0.61422088, 0.61085973, 0.60633484])}
0.572628487083507


## Decision Tree

In [14]:
dt = tree.DecisionTreeClassifier()

result = cross_validate(dt, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

{'fit_time': array([0.00161839, 0.00110126, 0.00108171, 0.0009706 , 0.00095749,
       0.00097084, 0.00098014, 0.00096345, 0.00116539, 0.00093818]),
 'score_time': array([0.00052309, 0.0004673 , 0.00043941, 0.00041103, 0.00042486,
       0.00043273, 0.00047517, 0.00042558, 0.000458  , 0.00042772]),
 'test_score': array([0.51315789, 0.56      , 0.54666667, 0.59459459, 0.60273973,
       0.50684932, 0.63013699, 0.5890411 , 0.57746479, 0.6056338 ]),
 'train_score': array([0.61246201, 0.61456753, 0.60849772, 0.6030303 , 0.602118  ,
       0.62934947, 0.60816944, 0.61422088, 0.61085973, 0.60633484])}
0.572628487083507


## KNN

In [15]:
knn = KNeighborsClassifier(n_neighbors=44, weights='distance')

results = cross_validate(knn, X, Y, cv=5)
pprint(results)

print(sum(results['test_score'])/5)

{'fit_time': array([0.00136137, 0.00083447, 0.00077844, 0.0007565 , 0.00121856]),
 'score_time': array([0.0043366 , 0.00436759, 0.0043087 , 0.00403333, 0.00603867]),
 'test_score': array([0.59060403, 0.59459459, 0.62162162, 0.62068966, 0.60416667]),
 'train_score': array([1., 1., 1., 1., 1.])}
0.6063353129801868


## MLP Classifier

In [16]:
mlp = MLPClassifier(hidden_layer_sizes=(2), solver='sgd', 
                    max_iter=1000, learning_rate_init=0.1, learning_rate='adaptive',
                    activation='identity')

results = cross_validate(mlp, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

{'fit_time': array([0.18589592, 0.22063279, 0.2123282 , 0.18828845, 0.20628476,
       0.21786213, 0.19041324, 0.21068096, 0.19419718, 0.20564437]),
 'score_time': array([0.00043368, 0.00041866, 0.00040603, 0.00040078, 0.00063062,
       0.00039911, 0.00033283, 0.00039577, 0.00040531, 0.0004015 ]),
 'test_score': array([0.63157895, 0.62666667, 0.58666667, 0.63513514, 0.57534247,
       0.56164384, 0.64383562, 0.54794521, 0.52112676, 0.63380282]),
 'train_score': array([0.62006079, 0.60394537, 0.61760243, 0.61212121, 0.60816944,
       0.6172466 , 0.61875946, 0.61875946, 0.61387632, 0.6199095 ])}
0.5963744116589348
