## **Tugas 1**

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan RandomForest. Gunakan tuning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

### Import Library

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Persiapan Data

In [18]:
# Read the mushroom dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="data/mushrooms.csv")
data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [19]:
# Inspect the column names of the mushroom dataset
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [20]:
# Count the missing values in each column
data.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [21]:
# Separate the label column ('class') from the predictor columns
y = data['class']
X = data.drop(columns='class')

In [22]:
# One-hot encode the categorical feature columns
X = pd.get_dummies(data=X)

### Split Data

In [23]:
# Hold out 30% of the rows as the test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Make model

In [24]:
# Base estimators for the Task 1 comparison (both use the same seed)

# RandomForest
rf_classifier = RandomForestClassifier(random_state=42)

# Decision Tree, split quality measured with entropy
dt_classifier = DecisionTreeClassifier(
    random_state=42,
    criterion='entropy',
)

### Hyperparameter

In [25]:
# Candidate hyperparameter values explored by GridSearchCV

# Decision Tree search space
param_grid_dt = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    ccp_alpha=[0.001, 0.002, 0.003, 0.004],
)

# RandomForest search space
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

### GridSearchCV

In [26]:
# Decision Tree: exhaustive grid search with 5-fold CV, scored by accuracy
dt_grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid_dt,
    scoring='accuracy',
    cv=5,
)
dt_grid_search.fit(X_train, y_train)


In [27]:
# RandomForest: exhaustive grid search with 5-fold CV, scored by accuracy
rf_grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
)
rf_grid_search.fit(X_train, y_train)

### Model Terbaik

In [28]:
# Decision Tree: take the refit best estimator and predict on the test set
best_dt_model = dt_grid_search.best_estimator_
dt_predictions = best_dt_model.predict(X_test)

# RandomForest: take the refit best estimator and predict on the test set
best_rf_model = rf_grid_search.best_estimator_
rf_predictions = best_rf_model.predict(X_test)

### Evaluasi Model

In [29]:
# Accuracy of each tuned model on the held-out test set
dt_accuracy = accuracy_score(y_test, dt_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Report the test-set scores: the task asks for a performance comparison, and
# the cross-validated scores printed further down are measured on training
# folds only, not on the held-out data.
print("Decision Tree Test Accuracy: {:.2f}%".format(dt_accuracy * 100))
print("RandomForest Test Accuracy: {:.2f}%".format(rf_accuracy * 100))

In [30]:
# Summarise the Decision Tree grid search: winning parameters and CV score
print("Decision Tree GridSearchCV Results:")
print("Best Parameters: ", dt_grid_search.best_params_)
print("Best Cross-validated Accuracy: {:.2f}%".format(dt_grid_search.best_score_ * 100))

# List the mean CV accuracy of every evaluated combination (optional)
print("\nAll Results:")
dt_cv_results = dt_grid_search.cv_results_
for idx in range(len(dt_cv_results['params'])):
    print("Mean Accuracy: {:.2f}% | Parameters: {}".format(
        dt_cv_results['mean_test_score'][idx] * 100,
        dt_cv_results['params'][idx],
    ))

Decision Tree GridSearchCV Results:
Best Parameters:  {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Cross-validated Accuracy: 100.00%

All Results:
Mean Accuracy: 99.96% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Mean Accuracy: 99.96% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
Mean Accuracy: 99.96% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Mean Accuracy: 100.00% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Mean Accuracy: 100.00% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5}
Mean Accuracy: 100.00% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Mean Accuracy: 99.91% | Parameters: {'ccp_alpha': 0.001, 'max_d

In [31]:
# Summarise the RandomForest grid search: winning parameters and CV score
print("\nRandomForest GridSearchCV Results:")
print("Best Parameters: ", rf_grid_search.best_params_)
print("Best Cross-validated Accuracy: {:.2f}%".format(rf_grid_search.best_score_ * 100))

# List the mean CV accuracy of every evaluated combination (optional)
print("\nAll Results:")
rf_cv_results = rf_grid_search.cv_results_
for idx in range(len(rf_cv_results['params'])):
    print("Mean Accuracy: {:.2f}% | Parameters: {}".format(
        rf_cv_results['mean_test_score'][idx] * 100,
        rf_cv_results['params'][idx],
    ))


RandomForest GridSearchCV Results:
Best Parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Cross-validated Accuracy: 100.00%

All Results:
Mean Accuracy: 100.00% | Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Mean Accuracy: 100.00% | Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Accuracy: 100.00% | Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Accuracy: 100.00% | Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Mean Accuracy: 100.00% | Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Mean Accuracy: 100.00% | Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Mean Accuracy: 100.00% | Parameters: {'max_depth': None, '

## **Tugas 2**

In [32]:
from sklearn.ensemble import AdaBoostClassifier

### Split Data

In [33]:
# Hold out 30% of the rows as the test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Make model

In [34]:
# Decision Tree baseline for Task 2
dt2_classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)

# AdaBoost built on this task's own tree. The original passed `dt_classifier`,
# a variable from a different section of the notebook that is later rebound to
# another model, so the result depended on cell execution order.
adaboost_classifier = AdaBoostClassifier(estimator=dt2_classifier, random_state=42)

### Hyperparameter

In [35]:
# Candidate hyperparameter values explored by GridSearchCV

# Decision Tree search space
param_grid_dt2 = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    ccp_alpha=[0.001, 0.002, 0.003, 0.004],
)

# AdaBoost search space
param_grid_adaboost = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.5, 1.0],
)

### GridSearchCV

In [36]:
# Decision Tree grid search for Task 2 (5-fold CV, scored by accuracy).
# Uses this task's own estimator and grid (`dt2_classifier`, `param_grid_dt2`).
# The original referenced `dt_classifier` / `param_grid_dt`, which belong to a
# different section and are rebound later in the notebook, so the search
# depended on cell execution order.
dt2_grid_search = GridSearchCV(dt2_classifier, param_grid_dt2, cv=5, scoring='accuracy')
dt2_grid_search.fit(X_train, y_train)

In [37]:
# AdaBoost: exhaustive grid search with 5-fold CV, scored by accuracy
adaboost_grid_search = GridSearchCV(
    estimator=adaboost_classifier,
    param_grid=param_grid_adaboost,
    scoring='accuracy',
    cv=5,
)
adaboost_grid_search.fit(X_train, y_train)

### Model Terbaik

In [38]:
# Decision Tree: take the refit best estimator and predict on the test set
best_dt2_model = dt2_grid_search.best_estimator_
dt2_predictions = best_dt2_model.predict(X_test)

# AdaBoost: take the refit best estimator and predict on the test set
best_adaboost_model = adaboost_grid_search.best_estimator_
adaboost_predictions = best_adaboost_model.predict(X_test)

### Evaluasi Model

In [39]:
# Accuracy of each tuned model on the held-out test set
dt2_accuracy = accuracy_score(y_test, dt2_predictions)
adaboost_accuracy = accuracy_score(y_test, adaboost_predictions)

# Report the test-set scores; the cross-validated scores printed further down
# are measured on training folds only, not on the held-out data.
print("Decision Tree Test Accuracy: {:.2f}%".format(dt2_accuracy * 100))
print("AdaBoost Test Accuracy: {:.2f}%".format(adaboost_accuracy * 100))

In [40]:
# Summarise the Decision Tree grid search: winning parameters and CV score
print("Decision Tree GridSearchCV Results:")
print("Best Parameters: ", dt2_grid_search.best_params_)
print("Best Cross-validated Accuracy: {:.2f}%".format(dt2_grid_search.best_score_ * 100))

# List the mean CV accuracy of every evaluated combination (optional)
print("\nAll Results:")
dt2_cv_results = dt2_grid_search.cv_results_
for idx in range(len(dt2_cv_results['params'])):
    print("Mean Accuracy: {:.2f}% | Parameters: {}".format(
        dt2_cv_results['mean_test_score'][idx] * 100,
        dt2_cv_results['params'][idx],
    ))

Decision Tree GridSearchCV Results:
Best Parameters:  {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Cross-validated Accuracy: 100.00%

All Results:
Mean Accuracy: 99.96% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Mean Accuracy: 99.96% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
Mean Accuracy: 99.96% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Mean Accuracy: 100.00% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Mean Accuracy: 100.00% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5}
Mean Accuracy: 100.00% | Parameters: {'ccp_alpha': 0.001, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Mean Accuracy: 99.91% | Parameters: {'ccp_alpha': 0.001, 'max_d

In [41]:
# Summarise the AdaBoost grid search: winning parameters and CV score
print("\nAdaBoost GridSearchCV Results:")
print("Best Parameters: ", adaboost_grid_search.best_params_)
print("Best Cross-validated Accuracy: {:.2f}%".format(adaboost_grid_search.best_score_ * 100))

# List the mean CV accuracy of every evaluated combination (optional)
print("\nAll Results:")
adaboost_cv_results = adaboost_grid_search.cv_results_
for idx in range(len(adaboost_cv_results['params'])):
    print("Mean Accuracy: {:.2f}% | Parameters: {}".format(
        adaboost_cv_results['mean_test_score'][idx] * 100,
        adaboost_cv_results['params'][idx],
    ))


AdaBoost GridSearchCV Results:
Best Parameters:  {'learning_rate': 0.01, 'n_estimators': 50}
Best Cross-validated Accuracy: 99.96%

All Results:
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.01, 'n_estimators': 50}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.01, 'n_estimators': 100}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.01, 'n_estimators': 200}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.1, 'n_estimators': 100}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.1, 'n_estimators': 200}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.5, 'n_estimators': 50}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.5, 'n_estimators': 100}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 0.5, 'n_estimators': 200}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 1.0, 'n_estimators': 50}
Mean Accuracy: 99.96% | Parameters: {'learning_rate': 1

## **Tugas 3**

### Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_diabetes
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

### Persiapan Data

In [2]:
# Read the diabetes dataset from disk and preview its first five rows.
data2 = pd.read_csv(filepath_or_buffer="data/diabetes.csv")
data2.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Inspect the column names of the diabetes dataset
data2.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
# Count the missing values in each column
data2.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
# Separate the label column ('Outcome') from the predictor columns
y = data2['Outcome']
X = data2.drop(columns='Outcome')

In [6]:
# Tally the NaN entries found in every column
nan_values_per_column = data2.isnull().sum()

# Show the per-column counts
print(nan_values_per_column)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
# Replace every NaN value with 0.
# NOTE(review): the NaN check in the previous cell reports zero missing values,
# so this fill leaves the data unchanged. `data2_filled` is also not referenced
# again in the visible notebook (X and y are taken from `data2`) — confirm
# whether this step is still needed.
data2_filled = data2.fillna(0.0)

# Show the result
print(data2_filled)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

### Data Split

In [8]:
# Hold out 30% of the rows as the test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Normalisasi

In [9]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Non-Hyperparameter Tuning

#### Make Model

In [17]:
# Individual classifiers that will be combined by the voting ensemble
dt_classifier = DecisionTreeClassifier(random_state=42)
svm_classifier = SVC(
    kernel='poly',
    degree=3,
    random_state=42,
)
logreg_classifier = LogisticRegression(random_state=42)

In [18]:
# Combine the three classifiers with hard (majority-label) voting
voting_classifier = VotingClassifier(
    voting='hard',
    estimators=[('logreg', logreg_classifier), ('svm', svm_classifier), ('dt', dt_classifier)],
)

#### Melatih Model

In [51]:
# Fit the voting ensemble on the scaled training data
voting_classifier.fit(X=X_train_scaled, y=y_train)

#### Evaluasi Model

In [52]:
# Predict labels for the scaled test set
y_pred = voting_classifier.predict(X_test_scaled)

# Compute the accuracy and report it as a percentage
akurasi = accuracy_score(y_true=y_test, y_pred=y_pred)
akurasi_persen = akurasi * 100
print("Akurasi Voting Ensemble: {:.2f}%".format(akurasi_persen))

Akurasi Voting Ensemble: 76.19%


### Hyperparameter Tuning

#### Parameters

In [10]:
# Membuat klasifikasi individu dengan hyperparameter tuning
param_grid_logreg = {
    'C': [1, 10, 100],
    'penalty': ['l2']
}

param_grid_svm = {
    'C': [1, 10, 100],
    'gamma': [1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4]
}

param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

#### Make model

In [11]:
# Membuat klasifikasi individu
logreg_classifier = LogisticRegression(random_state=42)
svm_classifier = SVC(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

#### GridSearchCV

In [12]:
# One grid search per classifier, in the order: logreg, svm, decision tree.
# Each uses 5-fold CV scored by accuracy.
grids = [
    GridSearchCV(base_estimator, search_space, cv=5, scoring='accuracy')
    for base_estimator, search_space in (
        (logreg_classifier, param_grid_logreg),
        (svm_classifier, param_grid_svm),
        (dt_classifier, param_grid_dt),
    )
]

# Fit every grid search on the scaled training data
for grid in grids:
    grid.fit(X_train_scaled, y_train)

#### Training dengan Voting

In [11]:
# Build the voting ensemble from the tuned models.
# Each member is the best estimator selected by its (already fitted) grid
# search. Passing the GridSearchCV objects themselves would make
# VotingClassifier clone them and re-run all three grid searches on every fit.
voting_classifier = VotingClassifier(
    estimators=[
        ('logreg', grids[0].best_estimator_),
        ('svm', grids[1].best_estimator_),
        ('dt', grids[2].best_estimator_)
    ],
    voting='hard'
)
# The ensemble is not fitted here: the dedicated training cell that follows
# fits it, so fitting in this cell as well only repeated the same work.

#### Melatih Model

In [12]:
# Fit the voting ensemble on the scaled training data
voting_classifier.fit(X=X_train_scaled, y=y_train)

#### Evaluasi Model

In [None]:
# Predict labels for the scaled test set with the fitted ensemble
y_pred = voting_classifier.predict(X=X_test_scaled)

In [None]:
# Print the best model found by each grid search.
# `best_estimator_` is the refit winner; printing `grids[i]` directly would
# show the GridSearchCV wrapper instead of the selected model.
print("Best Logistic Regression Model:", grids[0].best_estimator_)
print("Best SVM Model:", grids[1].best_estimator_)
print("Best Decision Tree Model:", grids[2].best_estimator_)

In [None]:
# Compute the test-set accuracy and report it as a percentage
akurasi = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    "Akurasi Voting Ensemble setelah Hyperparameter Tuning: {:.2f}%".format(
        akurasi * 100
    )
)

Akurasi Voting Ensemble: 76.19%
