### Tugas 3
Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
- Logistic Regression
- SVM kernel polynomial
- Decission Tree

Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

In [19]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier 

In [20]:
# load dataset
df = pd.read_csv('data/diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [22]:
feature_colums = df.iloc[:,:-1].columns

for column in feature_colums:
    print("==============================================")
    print(f"{column} ==> Missing zeros : {len(df[df[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [23]:
# impute values 0 with mean
imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)

df[feature_colums] = imputer.fit_transform(df[feature_colums])


In [24]:
# seleksi fitur
X = df.iloc[:,:-1]
y = df['Outcome']

#cek jumlah fitur dan lab
X.shape

(768, 8)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
sc = StandardScaler()

# Standarize features di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

### Decision Tree

In [28]:
# decision tree

dtc = DecisionTreeClassifier()
dtc.fit(X_train_std, y_train)

y_pred = dtc.predict(X_test_std)

print("Accuracy Score : ", accuracy_score(y_test, y_pred))
print("Classification Report : \n", classification_report(y_test, y_pred))


Accuracy Score :  0.7142857142857143
Classification Report : 
               precision    recall  f1-score   support

           0       0.78      0.78      0.78        99
           1       0.60      0.60      0.60        55

    accuracy                           0.71       154
   macro avg       0.69      0.69      0.69       154
weighted avg       0.71      0.71      0.71       154



### SVM Polynomial

In [30]:
# model svm Linear tanpa tuning parameter

svm = SVC(kernel='linear')
svm.fit(X_train_std, y_train)

# prediksi
y_pred = svm.predict(X_test_std)

# evaluasi akurasi testing data
acc_svc = accuracy_score(y_test, y_pred)

print("Accuracy Score : ", acc_svc)
print("Classification Report : \n", classification_report(y_test, y_pred))


Accuracy Score :  0.7597402597402597
Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.83      0.82        99
           1       0.67      0.64      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154



### Logistic Regression

In [32]:
lr = LogisticRegression(solver='liblinear', random_state=0)

#lr ke set data training
lr.fit(X_train_std, y_train)

# prediksi label dan set data testing
y_pred = lr.predict(X_test_std)

# evaluasi akurasi testing data
acc_lr = accuracy_score(y_test, y_pred)

print("Accuracy Score : ", acc_lr)
print("Classification Report : \n", classification_report(y_test, y_pred))



Accuracy Score :  0.7597402597402597
Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.83      0.82        99
           1       0.67      0.64      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154



### Voting Classifier

In [33]:
clf = LogisticRegression(solver='liblinear')
clf2 = DecisionTreeClassifier()
clf3 = SVC(kernel='poly')

# model hard training 
voting = VotingClassifier(estimators=[('lr', clf), ('dt', clf2), ('svm', clf3)], voting='hard')

# fit model
voting.fit(X_train_std, y_train)

# prediksi
y_pred = voting.predict(X_test_std)

# evaluasi akurasi testing data
acc_voting = accuracy_score(y_test, y_pred)

print("Accuracy Score : ", acc_voting)
print("Classification Report : \n", classification_report(y_test, y_pred))

Accuracy Score :  0.7727272727272727
Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.87      0.83        99
           1       0.72      0.60      0.65        55

    accuracy                           0.77       154
   macro avg       0.76      0.73      0.74       154
weighted avg       0.77      0.77      0.77       154

