# JONBUSUG

## Import All Library Needed

In [1]:
import numpy as np 
import random
import pandas as pd 
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import itertools

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn import tree
from sklearn.impute import SimpleImputer

from sklearn import svm, datasets
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from pprint import pprint

%matplotlib inline

## Read Data From CSV

In [2]:
heart_train = pd.read_csv('tubes2_HeartDisease_train.csv')
heart_test = pd.read_csv('tubes2_HeartDisease_test.csv')

## Rename Column Names

In [3]:
from copy import deepcopy

test_columns = {
    'Column1': 'age',
    'Column2': 'sex',
    'Column3': 'chest_pain_type',
    'Column4': 'resting_blood_pressure',
    'Column5': 'serum_cholesterol',
    'Column6': 'fasting_blood_sugar',
    'Column7': 'resting_ecg',
    'Column8': 'max_heart_rate_achieved',
    'Column9': 'exercise_induced_angina',
    'Column10': 'st_depression',
    'Column11': 'peak_exercise_st_segment',
    'Column12': 'num_major_flourosopy',
    'Column13': 'thal'
}
train_columns = test_columns.copy()
train_columns['Column14'] = 'heart_disease_diagnosis'
# Rename columns
heart_train = heart_train.rename(columns=train_columns)
heart_test = heart_test.rename(columns=test_columns)

## Data Analysis

In [4]:
print('Column data heart train')
pprint(heart_train.dtypes)
print()
print('Show heart train head')
pprint(heart_train.head())
print()
print('Find sum value undefined in each column')
heart_train.isna().sum()

Column data heart train
age                          int64
sex                          int64
chest_pain_type              int64
resting_blood_pressure      object
serum_cholesterol           object
fasting_blood_sugar         object
resting_ecg                 object
max_heart_rate_achieved     object
exercise_induced_angina     object
st_depression               object
peak_exercise_st_segment    object
num_major_flourosopy        object
thal                        object
heart_disease_diagnosis      int64
dtype: object

Show heart train head
   age  sex  chest_pain_type resting_blood_pressure serum_cholesterol  \
0   54    1                4                    125               216   
1   55    1                4                    158               217   
2   54    0                3                    135               304   
3   48    0                3                    120               195   
4   50    1                4                    120                 0   

  fasting_

age                         0
sex                         0
chest_pain_type             0
resting_blood_pressure      0
serum_cholesterol           0
fasting_blood_sugar         0
resting_ecg                 1
max_heart_rate_achieved     0
exercise_induced_angina     0
st_depression               0
peak_exercise_st_segment    0
num_major_flourosopy        0
thal                        0
heart_disease_diagnosis     0
dtype: int64

Berdasarkan dengan beberapa pengecekan di atas, dapat dilihat bahwa data pada csv yang diberikan:
1. Tidak semua data bertipe numerik
2. Ada beberapa data yang bernilai '?'
3. Ada data yang bernilai NaN (undefined)

Hal tersebut dapat mengganggu proses pemodelan. Oleh karena itu perlu dilakukan pre-processing sebagai berikut :

## Dataframe Conversion to Numeric 

In [5]:
# Convert string to numeric, convert non-number to NAN
# heart_train = heart_train.apply(pd.to_numeric, errors = 'coerce')
heart_train = heart_train.replace('?', np.NaN)
heart_train = heart_train.dropna(thresh=9)

print('Data type of columns after conversion')
print(heart_train.dtypes)
print()

# NaN count
print('Total value NaN after heart_train converted to numeric value')
print(heart_train.isna().sum())

Data type of columns after conversion
age                          int64
sex                          int64
chest_pain_type              int64
resting_blood_pressure      object
serum_cholesterol           object
fasting_blood_sugar         object
resting_ecg                 object
max_heart_rate_achieved     object
exercise_induced_angina     object
st_depression               object
peak_exercise_st_segment    object
num_major_flourosopy        object
thal                        object
heart_disease_diagnosis      int64
dtype: object

Total value NaN after heart_train converted to numeric value
age                           0
sex                           0
chest_pain_type               0
resting_blood_pressure        4
serum_cholesterol            21
fasting_blood_sugar          78
resting_ecg                   2
max_heart_rate_achieved       1
exercise_induced_angina       1
st_depression                 6
peak_exercise_st_segment    219
num_major_flourosopy        471
thal        

#### Menghilangkan nilai NaN

Pada pre-processingnya, konversi data dari object (string) menjadi numerik berhasil menghilagkan tipe objek dari dataframe. Namun, untuk value yang tidak dapat dikonversi menjadi angka akan bernilai NaN yang membuat dataframe tidak bisa diolah. 

Salah satu cara termudah untuk menghilangkan nilai NaN adalah dengan cara menghapus row yang mengandung nilai tersebut, namun melihat pada column 12 terdapat 514 row yang bernilai NaN, cara ini tidak feasible karena akan sangat mengurangi data training. Oleh karena itum, kami memutuskan untuk me-replace nilai NaN dengan XXXXXX. Pemilihan XXXXXX dilakukan karena XXXXXXXXXXXXXXXXXXXXXXXXXXXX

In [6]:
# Drop num_major_flouroscopy, too many NaNs
heart_train = heart_train.drop('num_major_flourosopy', axis=1)
heart_train['thal'].fillna(heart_train['thal'].mode()[0], inplace=True)
heart_train['peak_exercise_st_segment'].fillna(heart_train['peak_exercise_st_segment'].mode()[0], inplace=True)
heart_train['chest_pain_type'].fillna(heart_train['chest_pain_type'].mode()[0], inplace=True)
heart_train['resting_ecg'].fillna(heart_train['resting_ecg'].mode()[0], inplace=True)
heart_train['fasting_blood_sugar'].fillna(heart_train['fasting_blood_sugar'].mode()[0], inplace=True)

imp = SimpleImputer(missing_values=np.nan, strategy='mean')

c = heart_train.columns
heart_train = pd.DataFrame(imp.fit_transform(heart_train))
heart_train.columns = c

# median = heart_train.median(axis=0)
# for idx, column in enumerate(heart_train.columns):
#     heart_train[column] = heart_train[column].replace(np.NaN, median[idx])

# Count NaN value
print('Total NaN Value')
print(heart_train.isna().sum())

Total NaN Value
age                         0
sex                         0
chest_pain_type             0
resting_blood_pressure      0
serum_cholesterol           0
fasting_blood_sugar         0
resting_ecg                 0
max_heart_rate_achieved     0
exercise_induced_angina     0
st_depression               0
peak_exercise_st_segment    0
thal                        0
heart_disease_diagnosis     0
dtype: int64


In [7]:
def label_chest_pain(row):
    if row['chest_pain_type'] == 1: return 1
    else: return 0

def label_chest_pain_2(row):
    if row['chest_pain_type'] == 2: return 1
    else: return 0
    
def label_chest_pain_3(row):
    if row['chest_pain_type'] == 3: return 1
    else: return 0
    
def label_chest_pain_4(row):
    if row['chest_pain_type'] == 4: return 1
    else: return 0

heart_train['typical_angina'] = heart_train.apply(lambda row: label_chest_pain(row), axis=1)
heart_train['atypical_angina'] = heart_train.apply(lambda row: label_chest_pain_2(row), axis=1)
heart_train['non_anginal_pain'] = heart_train.apply(lambda row: label_chest_pain_3(row), axis=1)
heart_train['asymptotic'] = heart_train.apply(lambda row: label_chest_pain_4(row), axis=1)
heart_train = heart_train.drop('chest_pain_type', axis=1)

In [8]:
def upsloping(row):
    if row["peak_exercise_st_segment"] == 1: return 1
    else: return 0

def flat(row):
    if row["peak_exercise_st_segment"] == 2: return 1
    else: return 0
    
def downsloping(row):
    if row["peak_exercise_st_segment"] == 3: return 1
    else: return 0
    
heart_train['upsloping'] = heart_train.apply(lambda row: upsloping(row), axis=1)
heart_train['flat'] = heart_train.apply(lambda row: flat(row), axis=1)
heart_train['downsloping'] = heart_train.apply(lambda row: downsloping(row), axis=1)
heart_train = heart_train.drop('peak_exercise_st_segment', axis=1)

In [9]:
# Binarize resting_ecg, normal as 0 and everything else as 1
# heart_train['resting_ecg'] = (heart_train['resting_ecg'] >= 1).astype(int)

In [10]:
# Binarize resting blood pressure to 0 for [120, 140] (healthy) and 1 otherwise
# heart_train['resting_blood_pressure'] = ((heart_train['resting_blood_pressure'] > 140) | (heart_train['resting_blood_pressure'] < 120)).astype(int)
# heart_train

In [11]:
# Split data train
# heart_train_copy = heart_train.copy()
Y = heart_train['heart_disease_diagnosis']
X = heart_train.drop('heart_disease_diagnosis', axis = 1)

# scaler = StandardScaler().fit(X)
# X = scaler.transform(X)

# Best so far
scaler = MinMaxScaler(feature_range=(0,1))
X = scaler.fit_transform(X)

KF = KFold(10, shuffle=True)

  return self.partial_fit(X, y)


## Algoritma Naive-Bayes

In [12]:
gnb = GaussianNB()

results = cross_validate(gnb, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

# i=1
# sum_acc = 0
# sum_prec = 0
# sum_rec = 0
# for trainidx, testidx in KF.split(X):
#     X_train, X_test = X.iloc[trainidx], X.iloc[testidx]
#     Y_train, Y_test = Y.iloc[trainidx], Y.iloc[testidx]
#     gnb.fit(X_train,Y_train)

#     accuration = metrics.accuracy_score(Y_test, gnb.predict(X_test))
#     precision = metrics.precision_score(Y_test, gnb.predict(X_test), average="macro")
#     recall = metrics.recall_score(Y_test, gnb.predict(X_test), average="macro")
    
#     i+=1
#     sum_acc += accuration
#     sum_prec += precision
#     sum_rec += recall
    
# print("Average Accuration : {0:.4f}".format(sum_acc/10))
# print("Average Precision : {0:.4f}".format(sum_prec/10))
# print("Average Recall : {0:.4f}".format(sum_rec/10))

{'fit_time': array([0.00144219, 0.00095463, 0.00114965, 0.00095272, 0.00094557,
       0.00097489, 0.00098896, 0.00125766, 0.00114202, 0.00099063]),
 'score_time': array([0.00049019, 0.00040913, 0.00040722, 0.00039148, 0.00037789,
       0.00052905, 0.00039315, 0.00046015, 0.00045228, 0.00037694]),
 'test_score': array([0.52631579, 0.53333333, 0.56      , 0.56756757, 0.58108108,
       0.50684932, 0.5890411 , 0.5890411 , 0.58333333, 0.57746479]),
 'train_score': array([0.6030303 , 0.602118  , 0.60514372, 0.60876133, 0.59516616,
       0.60935143, 0.59879336, 0.59276018, 0.59789157, 0.59398496])}
0.5614027400370709


## Algoritma KNN

In [13]:
knn = KNeighborsClassifier(n_neighbors=21, weights='distance')

results = cross_validate(knn, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)
# i=1
# sum_acc = 0
# sum_prec = 0
# sum_rec = 0
# for trainidx, testidx in KF.split(X):
#     X_train, X_test = X.iloc[trainidx], X.iloc[testidx]
#     Y_train, Y_test = Y.iloc[trainidx], Y.iloc[testidx]
#     knn.fit(X_train,Y_train)

#     accuration = metrics.accuracy_score(Y_test, knn.predict(X_test))
#     precision = metrics.precision_score(Y_test, knn.predict(X_test), average="macro")
#     recall = metrics.recall_score(Y_test, knn.predict(X_test), average="macro")

#     i+=1
#     sum_acc += accuration
#     sum_prec += precision
#     sum_rec += recall
    
# print("Average Accuration : {0:.4f}".format(sum_acc/10))
# print("Average Precision : {0:.4f}".format(sum_prec/10))
# print("Average Recall : {0:.4f}".format(sum_rec/10))

{'fit_time': array([0.00093961, 0.00108695, 0.00079322, 0.0007894 , 0.00073552,
       0.00069547, 0.00073433, 0.00070667, 0.00068331, 0.00070596]),
 'score_time': array([0.00197411, 0.00214195, 0.00205088, 0.00195265, 0.00188208,
       0.00183415, 0.00181556, 0.00178123, 0.00175333, 0.00189757]),
 'test_score': array([0.55263158, 0.61333333, 0.57333333, 0.62162162, 0.58108108,
       0.54794521, 0.57534247, 0.54794521, 0.55555556, 0.63380282]),
 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}
0.580259219748603


# Algoritma MLP 

In [17]:
mlp = MLPClassifier(hidden_layer_sizes=(2), solver='sgd', 
                    max_iter=500, learning_rate_init=0.1, learning_rate='adaptive',
                    activation='identity')

results = cross_validate(mlp, X, Y, cv=10)
pprint(results)

print(sum(results['test_score'])/10)

{'fit_time': array([0.21113229, 0.20123005, 0.21560025, 0.19209337, 0.17941642,
       0.22547412, 0.20017695, 0.1860261 , 0.18797183, 0.17953992]),
 'score_time': array([0.00039673, 0.00038338, 0.00033236, 0.00035906, 0.00032187,
       0.00038862, 0.00033355, 0.00031924, 0.00033188, 0.00031471]),
 'test_score': array([0.56578947, 0.65333333, 0.54666667, 0.59459459, 0.59459459,
       0.57534247, 0.61643836, 0.53424658, 0.54166667, 0.66197183]),
 'train_score': array([0.59393939, 0.59152799, 0.60816944, 0.60120846, 0.59365559,
       0.59426848, 0.60331825, 0.59125189, 0.59789157, 0.59548872])}
0.5884644557786256


In [15]:
# mlp.fit(X[100:], Y[100:])
# mlp.predict(X)