<a href="https://colab.research.google.com/github/Found-Bugs/Pembelajaran-Mesin/blob/Ensemble-Learning/Tugas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tugas 1

Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [123]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder # Kebutuhan encoding label

In [124]:
# Load data
df = pd.read_csv('/content/drive/MyDrive/ML/Dataset/Ensemble_Learning/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [125]:
# Cek kolom null
df.isnull().sum()

Unnamed: 0,0
class,0
cap-shape,0
cap-surface,0
cap-color,0
bruises,0
odor,0
gill-attachment,0
gill-spacing,0
gill-size,0
gill-color,0


In [126]:
# Seleksi fitur
X = df.drop(columns='class')
y = df['class']

# encode label
ec = LabelEncoder()

for col in X.columns:
    if X[col].dtype == 'object':  # Hanya terapkan pada kolom kategori (tipe data 'object')
        X[col] = ec.fit_transform(X[col])

y = ec.fit_transform(y)

# Cek jumlah fitur dan instance
print(X.shape)

(8124, 22)


In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [128]:
# Secara default, DecisionTreeClassifier dari scikit-learn akan menggunakan nilai "Gini" untuk kriteria
# Terdapat beberapa "hyperparamater" yang dapat digunakan. Silahka baca dokumentasi
# Pada kasus ini kita akan menggunakan parameter default

# Define the hyperparameter grid for Decision Tree
param_grid_dt = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

dt = DecisionTreeClassifier(random_state=42)

# Perform Grid Search with Cross Validation for Decision Tree
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

# Get the best models from the Grid Search
best_dt = grid_search_dt.best_estimator_

# Test the models on the test set
y_pred_dt = best_dt.predict(X_test)

# Calculate accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)

In [129]:
# Pada kasus kali ini kita akan menggunakan estimator pada RandomForest
# Untuk detail parameter (hyperparameter) silahkan cek dokumentasi

# Define the hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)

# Perform Grid Search with Cross Validation for Random Forest
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Get the best models from the Grid Search
best_rf = grid_search_rf.best_estimator_

# Test the models on the test set
y_pred_rf = best_rf.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)

In [130]:
print(f"Decision Tree accuracy: {accuracy_dt}")
print(f"Random Forest accuracy: {accuracy_rf}")

print(f"Decision Tree best models: {grid_search_dt.best_params_}")
print(f"Random Forest best models: {grid_search_rf.best_params_}")

Decision Tree accuracy: 1.0
Random Forest accuracy: 1.0
Decision Tree best models: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2}
Random Forest best models: {'bootstrap': True, 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}


# Tugas 2

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

Dikarenakan Algoritma Decision Tree sudah dilakukan pada Tugas 1, pada Tugas 2 kali ini kita akan berfokus untuk membuat algoritma AdaBoost-nya saja.

In [131]:
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost

In [132]:
# Pada kasus kali ini kita akan menggunakan estimator pada AdaBoost
# Untuk detail parameter (hyperparameter) silahkan cek dokumentasi

# Define the hyperparameter grid for AdaBoost
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1, 10]
}

ada = AdaBoostClassifier(random_state=42, algorithm='SAMME')

# Perform Grid Search with Cross Validation for AdaBoost
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, scoring='accuracy')
grid_search_ada.fit(X_train, y_train)

# Get the best AdaBoost model from the Grid Search
best_ada = grid_search_ada.best_estimator_

# Test the AdaBoost model on the test set
y_pred_ada = best_ada.predict(X_test)

# Calculate accuracy for AdaBoost
accuracy_ada = accuracy_score(y_test, y_pred_ada)

In [133]:
print(f"Decision Tree accuracy: {accuracy_dt}")
print(f"AdaBoost accuracy: {accuracy_ada}")

print(f"Decision Tree best models: {grid_search_dt.best_params_}")
print(f"AdaBoost best models: {grid_search_ada.best_params_}")

Decision Tree accuracy: 1.0
AdaBoost accuracy: 1.0
Decision Tree best models: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2}
AdaBoost best models: {'learning_rate': 1, 'n_estimators': 200}


# Tugas 3

Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma

1. Logistic Regression
2. SVM kernel polynomial
3. Decission Tree

Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

In [134]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # import SVM classifier
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

In [135]:
# Load Data

dbt = pd.read_csv('/content/drive/MyDrive/ML/Dataset/Ensemble_Learning/diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [136]:
# Cek nama kolom
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [137]:
# Cek kolom null
dbt.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [138]:
# Pada kasus ini, agak tidak masuk akal jika beberapa parameter bernilai 0
# sebagai contoh adalah nilai 'Glucose', 'BloodPlessure' ataupun 'Insulin'.
# Sekecil apapun nilainya, setiap manusia yang hidup pasti miliki nilai-nilai tersebut

# Kita akan manipulasi nilai yang 0 dengan melakukan 'imputasi' atau mengganti nilainya dengan nilai sintetis
# Pada kasus ini, kita akan menggunakan nilai mean

# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [139]:
# Impute nilai 0 dengan mean
fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

In [140]:
X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [141]:
# Define the hyperparameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs', 'newton-cg']
}

# Initialize the Logistic Regression model
lr = LogisticRegression(random_state=42, max_iter=1000)

# Perform Grid Search with Cross Validation for Logistic Regression
grid_search_lr = GridSearchCV(estimator=lr, param_grid=param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)

# Get the best Logistic Regression model from the Grid Search
best_lr = grid_search_lr.best_estimator_

# Test the Logistic Regression model on the test set
y_pred_lr = best_lr.predict(X_test)

# Calculate accuracy for Logistic Regression
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print(f"Logistic Regression accuracy: {accuracy_lr}")
print(f"Logistic Regression best models: {grid_search_lr.best_params_}")

Logistic Regression accuracy: 0.7402597402597403
Logistic Regression best models: {'C': 10, 'solver': 'liblinear'}


In [142]:
# Define the hyperparameter grid for SVM with polynomial kernel
param_grid_svm = {
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4]
}

# Initialize the SVM model with polynomial kernel
svm_poly = SVC(kernel='poly', random_state=42)

# Perform Grid Search with Cross Validation for SVM
grid_search_svm = GridSearchCV(estimator=svm_poly, param_grid=param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

# Get the best SVM model from the Grid Search
best_svm = grid_search_svm.best_estimator_

# Test the SVM model on the test set
y_pred_svm = best_svm.predict(X_test)

# Calculate accuracy for SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print(f"SVM accuracy: {accuracy_svm}")
print(f"SVM best models: {grid_search_svm.best_params_}")

SVM accuracy: 0.7489177489177489
SVM best models: {'C': 10, 'degree': 3}


In [143]:
# Define the hyperparameter grid for Decision Tree
param_grid_dt = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

dt = DecisionTreeClassifier(random_state=42)

# Perform Grid Search with Cross Validation for Decision Tree
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

# Get the best models from the Grid Search
best_dt = grid_search_dt.best_estimator_

# Test the models on the test set
y_pred_dt = best_dt.predict(X_test)

# Calculate accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print(f"Decision Tree accuracy: {accuracy_dt}")
print(f"Decision Tree best models: {grid_search_dt.best_params_}")

Decision Tree accuracy: 0.7359307359307359
Decision Tree best models: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 10}


In [144]:
# model hard voting
voting = VotingClassifier(estimators=[('Logistic Regression', best_lr), ('SVM', best_svm), ('Decision Tree', best_dt)], voting='hard')

# Fit model
voting.fit(X_train, y_train)

# Prediksi
y_pred_vt1 = voting.predict(X_test)

# Evaluasi akurasi testing data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)

# Print hasil evaluasi
print('Voting Hard')
print(f"Test set accuracy: {acc_vt1}")

Voting Hard
Test set accuracy: 0.7532467532467533
