## 1.Compare the performance between the Decision Tree and Random Forest algorithms

1. Import Library

In [37]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

2. Load Dataset

In [38]:
df = pd.read_csv('assets/mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [39]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [40]:
X = df.drop(['class'], axis=1)
y = df['class']

X = pd.get_dummies(X, dtype=int)
y = pd.get_dummies(y, dtype=int)

In [41]:
X

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8120,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
8121,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [42]:
y

Unnamed: 0,e,p
0,0,1
1,1,0
2,1,0
3,0,1
4,1,0
...,...,...
8119,1,0
8120,1,0
8121,1,0
8122,0,1


3. Split Dataset

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

4. Training Decision Tree Model

In [44]:
dt_classifier = DecisionTreeClassifier(random_state=42)

In [45]:
# Define hyperparameter grid for Decision Tree
param_grid_dt = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [46]:
# Grid Search for Decision Tree
grid_search_dt = GridSearchCV(dt_classifier, param_grid=param_grid_dt, cv=5)
grid_search_dt.fit(X_train, y_train)

best_dt_classifier = grid_search_dt.best_estimator_
y_pred_dt = best_dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

In [47]:
print("Decision Tree Accuracy: {:.2f}%".format(accuracy_dt * 100))

Decision Tree Accuracy: 100.00%


5. Training Random Forest Model

In [48]:
rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Sesuaikan dt ke set training
rf.fit(X_train, y_train)

# Memprediksi label set test
y_pred_rf = rf.predict(X_test)

#  menghitung set accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")

Test set accuracy: 1.00
Test set accuracy: 1.0


## 2. Compare the performance between the Decision Tree and AdaBoost algorithms

1. Import Library

In [49]:
# Import Library
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder # Kebutuhan encoding label

2. Load Dataset

In [50]:
df2 = pd.read_csv('assets/mushrooms.csv')
df2.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [51]:
df2.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [52]:
# Seleksi fitur
X2 = df2.drop(['class'], axis=1)
y2 = df2['class']

# encode label
ec = LabelEncoder()
y2 = ec.fit_transform(y2)
X2 = pd.get_dummies(X2, dtype=int)

3. Split Dataset

In [53]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=1)

4. Training Decision Tree Model

In [54]:
# Pada kasus ini kita akan menggunakan parameter default
dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train2, y_train2)

# Memprediksi label set test
y_test_pred_dt2 = dt.predict(X_test2)

#  menghitung set accuracy test
acc_test_dt2 = accuracy_score(y_test2, y_test_pred_dt2)
print(f"Test set accuracy: {acc_test_dt2*100:.2f}%")

# Memprediksi label set training
y_train_pred_dt2 = dt.predict(X_train2)

#  menghitung set accuracy training
acc_train_dt2 = accuracy_score(y_train2, y_train_pred_dt2)
print(f"Train set accuracy: {acc_train_dt2*100:.2f}%")

Test set accuracy: 100.00%
Train set accuracy: 100.00%


5. Training AdaBoost Model

In [55]:
# Pada kasus kali ini kita akan menggunakan estimator pada AdaBoost
# Untuk detail parameter (hyperparameter) silahkan cek dokumentasi

ada = AdaBoostClassifier(n_estimators=1)

# Sesuaikan dt ke set training
ada.fit(X_train2, y_train2)

# Memprediksi label set test
y_pred_test_ada = ada.predict(X_test2)
acc_ada_test = accuracy_score(y_test2, y_pred_test_ada)
print(f"Test set accuracy: {acc_ada_test *100:.2f}%")

# Memprediksi label set training
y_pred_train_ada = ada.predict(X_train2)

#  menghitung set accuracy training
acc_ada_train = accuracy_score(y_train2, y_pred_train_ada)
print(f"Train set accuracy: {acc_ada_train*100:.2f}%")

Test set accuracy: 88.00%
Train set accuracy: 88.84%


## Create ensemble voting with Logistic Regression, SVM Kernel polynominal, and Decision Tree,

1. Import Library

In [56]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression # import Logistic Regression
from sklearn.svm import SVC # import SVM classifier
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

2. Load Dataset

In [57]:
df3 = pd.read_csv('assets/diabetes.csv')
df3.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [58]:
# Cek kolom null
df3.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [59]:
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(df3.loc[df3[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [60]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

df3[feature_columns] = fill_values.fit_transform(df3[feature_columns])

3. Split Dataset

In [61]:
X3 = df3[feature_columns]
y3 = df3.Outcome

X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

4. Standardize Data

In [65]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train3)

# Transform the training and testing data using the scaler
X_train_scaled3 = scaler.transform(X_train3)
X_test_scaled3 = scaler.transform(X_test3)
     

5. Training with Ensemble Voting

In [67]:
# Definisikan algoritma yang akan digunakan untuk voting

clf1 = LogisticRegression(max_iter=5000)
clf2 = SVC(kernel='poly')
clf3 = DecisionTreeClassifier()

# model hard voting
voting = VotingClassifier(estimators=[('LogisticRegression', clf1), ('SVM-Polynomial', clf2), ('DecisionTree', clf3)], voting='hard')

# Fit model
voting.fit(X_train_scaled3, y_train3)

# Prediksi for test set
y_pred_test3 = voting.predict(X_test_scaled3)

# Prediksi for train set
y_pred_train3 = voting.predict(X_train_scaled3)

# Evaluasi akurasi for test set
acc_test3 = accuracy_score(y_test3, y_pred_test3)

# Evaluasi akurasi for train set
acc_train3 = accuracy_score(y_train3, y_pred_train3)

# Print hasil evaluasi
print('Voting Hard')
print(f"Test set accuracy: {acc_test3*100:.2f}%")
print(f"Train set accuracy: {acc_train3*100:.2f}%")

Voting Hard
Test set accuracy: 77.92%
Train set accuracy: 84.36%
