# Tugas 3
Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
- Logistic Regression
- SVM kernel polynomial
- Decission Tree
Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

In [178]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.naive_bayes import GaussianNB # import Naive Bayes model Gaussian (asumsi data terdistribusi normal)
from sklearn.svm import SVC # import SVM classifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, StackingClassifier # import model Voting# import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [179]:
# Load data
df = pd.read_csv('data/diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [180]:
# Cek kolom null
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [181]:
feature_columns = df.iloc[:,:-1].columns
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(df.loc[df[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [182]:
# Impute nilai 0 dengan mean


fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

df[feature_columns] = fill_values.fit_transform(df[feature_columns])

In [183]:
# Seleksi fitur

# Split data X dan y (class)
X = df.iloc[:,:-1]
y = df['Outcome']

# Cek jumlah fitur dan instance
X.shape

(768, 8)

In [184]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42)

In [185]:
# Karena asumsi Gaussian NB adalah data terdistribusi secara normal,
# maka kita perlu melakukan standarisasi

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

## Decision Tree

In [186]:
# Secara default, DecisionTreeClassifier dari scikit-learn akan menggunakan nilai "Gini" untuk kriteria
# Terdapat beberapa "hyperparamater" yang dapat digunakan. Silahka baca dokumentasi
# Pada kasus ini kita akan menggunakan parameter default
dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train_std, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test_std)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 0.68
Test set accuracy: 0.6818181818181818


## SVM Polynomial

In [187]:
# Model SVM linier tanpa tunnning hyperparameter
svm_poly = SVC(kernel='poly')

# Fit ke model
svm_poly.fit(X_train_std, y_train)

# Prediksi
y_pred_svm_poly = svm_poly.predict(X_test_std)

# Evaluasi akurasi testing data
acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)

# Print hasil evaluasi
print("Test set accuracy: {:.2f}".format(acc_svm_poly))
print(f"Test set accuracy: {acc_svm_poly}")

Test set accuracy: 0.73
Test set accuracy: 0.7337662337662337


## Logistic Regression

In [188]:
# Secara default, DecisionTreeClassifier dari scikit-learn akan menggunakan nilai "Gini" untuk kriteria
# Terdapat beberapa "hyperparamater" yang dapat digunakan. Silahka baca dokumentasi
# Pada kasus ini kita akan menggunakan parameter default
lr = LogisticRegression(solver='liblinear', random_state=0)

# Sesuaikan lr ke set training
lr.fit(X_train_std, y_train)

# Memprediksi label set test
y_pred_lr = lr.predict(X_test_std)

#  menghitung set accuracy
acc_lr = accuracy_score(y_test, y_pred_lr)
print("Test set accuracy: {:.2f}".format(acc_lr))
print(f"Test set accuracy: {acc_lr}")

Test set accuracy: 0.70
Test set accuracy: 0.7012987012987013


## Voting Classifier

In [189]:
# Definisikan algoritma yang akan digunakan untuk voting

clf1 = LogisticRegression(solver='liblinear')
clf2 = SVC(kernel='poly')
clf3 = DecisionTreeClassifier()

# model hard voting
voting = VotingClassifier(estimators=[('Logistic Regression',clf1),('Decision Tree', clf2), ('SVM-POLY', clf3)],voting='hard')

# Fit model
voting.fit(X_train_std, y_train)

# Prediksi
y_pred_vt1 = voting.predict(X_test_std)

# Evaluasi akurasi testing data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)


# Print hasil evaluasi
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")



Voting Hard
Test set accuracy: 0.75
Test set accuracy: 0.7467532467532467


## Stacking

**One Layer**

- Base Models : Decision Tree and SVC-poly
- Meta Model : LogisticRegression

In [190]:
layer_one_estimators = [
                        ('dt_1', DecisionTreeClassifier()),
                        ('svm_1', SVC(kernel='poly'))             
                       ]
                      
clf= StackingClassifier(estimators=layer_one_estimators, final_estimator=LogisticRegression())

clf.fit(X_train_std, y_train).score(X_test_std, y_test)


0.7337662337662337