# TUGAS 3
## Brilyan Satria Wahyuda
### TI-3H 05 2241720019

In [1]:

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB # import Naive Bayes model Gaussian (asumsi data terdistribusi normal)
from sklearn.svm import SVC # import SVM classifier
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

### Persiapan data

In [2]:
dbt = pd.read_csv('diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Cek nama kolom
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
# Cek kolom null
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### Imputasi

In [5]:
# Pada kasus ini, agak tidak masuk akal jika beberapa parameter bernilai 0
# sebagai contoh adalah nilai 'Glucose', 'BloodPlessure' ataupun 'Insulin'.
# Sekecil apapun nilainya, setiap manusia yang hidup pasti miliki nilai-nilai tersebut

# Kita akan manipulasi nilai yang 0 dengan melakukan 'imputasi' atau mengganti nilainya dengan nilai sintetis
# Pada kasus ini, kita akan menggunakan nilai mean

# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [6]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

In [8]:

X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Inisialisasi scaler (StandardScaler atau MinMaxScaler)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Algoritma Logistic Regression

In [9]:

# Definisikan model Logistic Regression
logreg = LogisticRegression(max_iter=1000)

# Hyperparameter tuning menggunakan GridSearchCV untuk Logistic Regression
param_grid_logreg = {'C': [0.1, 1, 10, 100],
                    'solver': ['liblinear', 'lbfgs', 'newton-cg'],
                    }  # C adalah regularisasi untuk Logistic Regression
grid_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, scoring='accuracy')

# Fit GridSearch untuk menemukan hyperparameter terbaik
grid_logreg.fit(X_train, y_train)

# Model terbaik setelah tuning hyperparameter
best_logreg = grid_logreg.best_estimator_

# Prediksi
y_pred_logreg = best_logreg.predict(X_test)

# Evaluasi
print("Logistic Regression")
print("Best Parameters:", grid_logreg.best_params_)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_logreg) * 100:.2f} %")
print("\nClassification Report:\n", classification_report(y_test, y_pred_logreg))

Logistic Regression
Best Parameters: {'C': 1, 'solver': 'liblinear'}
Accuracy Score: 73.59 %

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.81      0.80       151
           1       0.63      0.59      0.61        80

    accuracy                           0.74       231
   macro avg       0.71      0.70      0.70       231
weighted avg       0.73      0.74      0.73       231



### SVM Kernel Polynomial

In [10]:

# Definisikan model SVM dengan kernel polynomial
svm = SVC(kernel='poly', probability=True)  # probability=True diperlukan untuk VotingClassifier

# Hyperparameter tuning menggunakan GridSearchCV untuk SVM
param_grid_svm = {
    'C': [0.1, 1, 10, 100]
}
grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='accuracy')

# Fit GridSearch untuk menemukan hyperparameter terbaik
grid_svm.fit(X_train, y_train)

# Model terbaik setelah tuning hyperparameter
best_svm = grid_svm.best_estimator_

# Prediksi
y_pred_svm = best_svm.predict(X_test)

# Evaluasi
print("SVM (Kernel Polynomial)")
print("Best Parameters:", grid_svm.best_params_)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_svm) * 100:.2f} %")
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

SVM (Kernel Polynomial)
Best Parameters: {'C': 1}
Accuracy Score: 69.70 %

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.88      0.79       151
           1       0.61      0.35      0.44        80

    accuracy                           0.70       231
   macro avg       0.66      0.62      0.62       231
weighted avg       0.68      0.70      0.67       231



### Decision Tree

In [11]:
# Definisikan model Decision Tree
tree = DecisionTreeClassifier()

# Hyperparameter tuning menggunakan GridSearchCV untuk Decision Tree
param_grid_tree = {
                'max_depth': [3, 5, 10],
                'min_samples_split': [2, 10, 20],
                'min_samples_leaf': [1, 5, 10, 15],
                'criterion': ['gini', 'entropy']
                } 
grid_tree = GridSearchCV(tree, param_grid_tree, cv=5, scoring='accuracy')

# Fit GridSearch untuk menemukan hyperparameter terbaik
grid_tree.fit(X_train, y_train)

# Model terbaik setelah tuning hyperparameter
best_tree = grid_tree.best_estimator_

# Prediksi
y_pred_tree = best_tree.predict(X_test)

# Evaluasi
print("Decision Tree")
print("Best Parameters:", grid_tree.best_params_)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_tree) * 100:.2f} %")
print("\nClassification Report:\n", classification_report(y_test, y_pred_tree))

Decision Tree
Best Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 15, 'min_samples_split': 10}
Accuracy Score: 77.06 %

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       151
           1       0.69      0.61      0.65        80

    accuracy                           0.77       231
   macro avg       0.75      0.73      0.74       231
weighted avg       0.77      0.77      0.77       231



### Penggabungan algoritma ke ensemble voting

In [17]:
# Membuat ensemble voting classifier tanpa hyperparameter tuning
voting_clf = VotingClassifier(
    estimators=[('logreg', logreg), ('svm', svm), ('tree', tree)],
    voting='soft'  # 'soft' untuk probabilistic voting
)

# Train voting classifier
voting_clf.fit(X_train, y_train)

# Prediksi
y_pred_voting = voting_clf.predict(X_test)

# Evaluasi Voting Classifier
print("\nVoting Classifier")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_voting) * 100:.2f} %")
print("\nClassification Report:\n", classification_report(y_test, y_pred_voting))


Voting Classifier
Accuracy Score: 74.03 %

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.80       151
           1       0.62      0.66      0.64        80

    accuracy                           0.74       231
   macro avg       0.72      0.72      0.72       231
weighted avg       0.75      0.74      0.74       231

