In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../dataset/stroke.csv", delimiter=';', decimal=',')
df = df.sample(frac = 1)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,smoking_status,stroke
4417,Female,48,0,0,Self-employed,1,33.3,Unknown,0
701,Male,38,0,0,Private,1,30.2,never smoked,0
4973,Male,65,0,0,Private,1,30.1,smokes,0
4013,Male,44,1,0,Private,1,26.5,never smoked,0
2229,Male,54,1,0,Self-employed,1,37.6,smokes,0


In [4]:
df["smoking_status"].unique()

array(['Self-employed', 'Private', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [1127]:
df['age'] = df['age'].astype(str)
df = df[~df['age'].str.contains(":")]
df = df[~df['age'].str.contains(",")]
df["age"].unique()

array(['7', '46', '11', '32', '82', '20', '23', '8', '71', '37', '64',
       '5', '18', '62', '80', '56', '50', '75', '42', '27', '61', '35',
       '28', '22', '66', '16', '3', '73', '53', '76', '57', '51', '33',
       '10', '55', '52', '81', '65', '14', '44', '54', '70', '9', '43',
       '25', '58', '69', '78', '29', '45', '79', '49', '63', '68', '67',
       '36', '40', '21', '4', '34', '17', '47', '59', '39', '72', '6',
       '48', '31', '38', '24', '13', '12', '77', '60', '41', '2', '74',
       '30', '15', '19', '26', '1'], dtype=object)

In [1128]:
df["hypertension"] = df['hypertension'].replace(['Yes', 'No'], ["1", "0"])
df["heart_disease"] = df['heart_disease'].replace(['Yes', 'No'], ["1", "0"])
df["gender"] = df['gender'].replace(['Male', 'Female'], ["0", "1"])
df

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,smoking_status,stroke
1619,1,7,0,0,children,1,20.8,Unknown,0
180,1,46,0,0,Private,1,30.8,never smoked,1
4439,0,11,0,0,children,1,27.6,never smoked,0
1181,1,32,0,0,Private,1,26.5,formerly smoked,0
3603,0,82,0,0,Self-employed,1,24.3,smokes,0
...,...,...,...,...,...,...,...,...,...
2031,1,32,0,0,Private,1,37.8,Unknown,0
456,0,75,0,1,Private,1,30.2,formerly smoked,0
438,0,53,0,0,Private,1,28.5,formerly smoked,0
2441,1,65,0,0,Govt_job,1,52.7,smokes,0


In [1129]:
from sklearn.preprocessing import LabelEncoder

le_work = LabelEncoder()
df['work_type'] = le_work.fit_transform(df['work_type'])

df

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,smoking_status,stroke
1619,1,7,0,0,4,1,20.8,Unknown,0
180,1,46,0,0,2,1,30.8,never smoked,1
4439,0,11,0,0,4,1,27.6,never smoked,0
1181,1,32,0,0,2,1,26.5,formerly smoked,0
3603,0,82,0,0,3,1,24.3,smokes,0
...,...,...,...,...,...,...,...,...,...
2031,1,32,0,0,2,1,37.8,Unknown,0
456,0,75,0,1,2,1,30.2,formerly smoked,0
438,0,53,0,0,2,1,28.5,formerly smoked,0
2441,1,65,0,0,0,1,52.7,smokes,0


In [1130]:
df = df.drop(df[df['bmi'] == "0"].index)
df = df.drop(df[df['smoking_status'] == "Unknown"].index)
df = df.drop(df[df['gender'] == "Other"].index)
df

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,smoking_status,stroke
180,1,46,0,0,2,1,30.8,never smoked,1
4439,0,11,0,0,4,1,27.6,never smoked,0
1181,1,32,0,0,2,1,26.5,formerly smoked,0
3603,0,82,0,0,3,1,24.3,smokes,0
1268,0,20,0,0,2,1,43.3,never smoked,0
...,...,...,...,...,...,...,...,...,...
5105,1,80,1,0,2,1,0.0,never smoked,0
3831,1,55,0,0,3,1,31.4,never smoked,0
456,0,75,0,1,2,1,30.2,formerly smoked,0
438,0,53,0,0,2,1,28.5,formerly smoked,0


In [1131]:
from sklearn.preprocessing import MinMaxScaler

scaler_bmi = MinMaxScaler()
df[['bmi']] = scaler_bmi.fit_transform(df[['bmi']])

scaler_age = MinMaxScaler()
df[['age']] = scaler_age.fit_transform(df[['age']])

df

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,smoking_status,stroke
180,1,0.500000,0,0,2,1,0.334783,never smoked,1
4439,0,0.013889,0,0,4,1,0.300000,never smoked,0
1181,1,0.305556,0,0,2,1,0.288043,formerly smoked,0
3603,0,1.000000,0,0,3,1,0.264130,smokes,0
1268,0,0.138889,0,0,2,1,0.470652,never smoked,0
...,...,...,...,...,...,...,...,...,...
5105,1,0.972222,1,0,2,1,0.000000,never smoked,0
3831,1,0.625000,0,0,3,1,0.341304,never smoked,0
456,0,0.902778,0,1,2,1,0.328261,formerly smoked,0
438,0,0.597222,0,0,2,1,0.309783,formerly smoked,0


In [1132]:
le_smoking = LabelEncoder()
df['smoking_status'] = le_smoking.fit_transform(df['smoking_status'])
df

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,smoking_status,stroke
180,1,0.500000,0,0,2,1,0.334783,1,1
4439,0,0.013889,0,0,4,1,0.300000,1,0
1181,1,0.305556,0,0,2,1,0.288043,0,0
3603,0,1.000000,0,0,3,1,0.264130,2,0
1268,0,0.138889,0,0,2,1,0.470652,1,0
...,...,...,...,...,...,...,...,...,...
5105,1,0.972222,1,0,2,1,0.000000,1,0
3831,1,0.625000,0,0,3,1,0.341304,1,0
456,0,0.902778,0,1,2,1,0.328261,0,0
438,0,0.597222,0,0,2,1,0.309783,0,0


In [1133]:
# Bagi data menjadi training, validation, dan testing sets
train, temp = train_test_split(df, train_size=0.8, random_state=42, shuffle=False)
validation, test = train_test_split(temp, train_size=0.5, random_state=42, shuffle=False)

In [1134]:
classColumn = "stroke"

train_feature = train.drop(classColumn, axis=1)
train_target = train[classColumn]

validation_feature = validation.drop(classColumn, axis=1)
validation_target = validation[classColumn]

test_feature = test.drop(classColumn, axis=1)
test_target = test[classColumn]

train_target

180     1
4439    0
1181    0
3603    0
1268    0
       ..
1574    0
4785    0
1992    0
4299    0
1473    0
Name: stroke, Length: 2852, dtype: int64

In [1135]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = DecisionTreeClassifier()
history = model.fit(train_feature, train_target)

# Evaluasi model pada data validation
y_pred_validation = model.predict(validation_feature)
print("Validation classification_report:\n" + str(classification_report(validation_target, y_pred_validation)))
print("Validation confusion_matrix:\n" + str(confusion_matrix(validation_target, y_pred_validation)))
print("Validation accuracy_score:\n" + str(accuracy_score(validation_target, y_pred_validation)))

Validation classification_report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       340
           1       0.12      0.19      0.15        16

    accuracy                           0.90       356
   macro avg       0.54      0.56      0.55       356
weighted avg       0.92      0.90      0.91       356

Validation confusion_matrix:
[[318  22]
 [ 13   3]]
Validation accuracy_score:
0.901685393258427


In [1136]:
# Evaluasi model pada data testing
y_pred_test = model.predict(test_feature)
print("Test classification_report:\n" + str(classification_report(test_target, y_pred_test)))
print("Test confusion_matrix:\n" + str(confusion_matrix(test_target, y_pred_test)))
print("Test accuracy_score:\n" + str(accuracy_score(test_target, y_pred_test)))

Test classification_report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92       335
           1       0.00      0.00      0.00        22

    accuracy                           0.86       357
   macro avg       0.47      0.46      0.46       357
weighted avg       0.88      0.86      0.87       357

Test confusion_matrix:
[[307  28]
 [ 22   0]]
Test accuracy_score:
0.8599439775910365


In [1137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(train_feature, train_target)
y_pred_validation = model.predict(validation_feature)

print("classification_report :\n" + str(classification_report(validation_target, y_pred_validation)))
print("confusion_matrix :\n" + str(confusion_matrix(validation_target, y_pred_validation)))
print("accuracy_score :\n" + str(accuracy_score(validation_target, y_pred_validation)))

print("\n\n")

# Evaluasi model pada data testing
y_pred_test = model.predict(test_feature)
print("Test classification_report:\n" + str(classification_report(test_target, y_pred_test)))
print("Test confusion_matrix:\n" + str(confusion_matrix(test_target, y_pred_test)))
print("Test accuracy_score:\n" + str(accuracy_score(test_target, y_pred_test)))

classification_report :
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       340
           1       0.33      0.06      0.11        16

    accuracy                           0.95       356
   macro avg       0.65      0.53      0.54       356
weighted avg       0.93      0.95      0.94       356

confusion_matrix :
[[338   2]
 [ 15   1]]
accuracy_score :
0.952247191011236



Test classification_report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       335
           1       0.33      0.05      0.08        22

    accuracy                           0.94       357
   macro avg       0.64      0.52      0.52       357
weighted avg       0.90      0.94      0.91       357

Test confusion_matrix:
[[333   2]
 [ 21   1]]
Test accuracy_score:
0.9355742296918768


In [1138]:
import pickle

pth = ""
# save the model to disk
filename = pth+'stroke.sav'
pickle.dump(model, open(filename, 'wb'))
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model.score(test_feature, test_target)
#y_pred2 = loaded_model.predict(test_feature)


with open('scaler_age.pkl', 'wb') as file:
    pickle.dump(scaler_age, file)

with open('scaler_bmi.pkl', 'wb') as file:
    pickle.dump(scaler_bmi, file)

with open('le_work.pkl', 'wb') as f:
    pickle.dump(le_work, f)

with open('le_smoking.pkl', 'wb') as f:
    pickle.dump(le_smoking, f)