# Assignment

In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

## Tugas 1

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan RandomForest. Gunakan hyperparameter tuning untuk mendapatkan parameter dan akurasi yang terbaik.

In [2]:
# ---------------------------------------------------------------------------
# Task 1: Decision Tree vs. RandomForest on the mushroom data
# ---------------------------------------------------------------------------

# Read the CSV. The 'class' column is the target; every other column is a feature.
df = pd.read_csv('data/mushrooms.csv')
y = df['class']

# Remove the target and expand the categorical columns into dummy/indicator columns.
X = pd.get_dummies(df.drop(columns='class'))

# Hold out 30% of the rows for the final evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Hyperparameter search space for the single tree.
param_grid_dt = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)

# Same tree-level settings for the forest, plus the number of trees.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)

# Seeded base estimators so repeated runs give the same models.
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid searches scored on accuracy.
# GridSearchCV.fit returns the fitted search object, so it can be bound directly.
grid_search_dt = GridSearchCV(
    estimator=dt, param_grid=param_grid_dt, scoring='accuracy', cv=5
).fit(X_train, y_train)
grid_search_rf = GridSearchCV(
    estimator=rf, param_grid=param_grid_rf, scoring='accuracy', cv=5
).fit(X_train, y_train)

# The winning configuration of each search, refit on the full training split.
best_dt = grid_search_dt.best_estimator_
best_rf = grid_search_rf.best_estimator_

# Score each tuned model on the held-out rows.
y_pred_dt = best_dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

y_pred_rf = best_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f'Best Decision Tree Accuracy: {accuracy_dt}')
print(f'Best RandomForest Accuracy: {accuracy_rf}')

Best Decision Tree Accuracy: 1.0
Best RandomForest Accuracy: 1.0


## Tugas 2

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan AdaBoost. Gunakan hyperparameter tuning untuk mendapatkan parameter dan akurasi yang terbaik.

In [3]:
# Task 2: AdaBoost on the mushroom data.
# Relies on X_train / X_test / y_train / y_test produced by the Task 1 cell.
#
# Why the base learner is a shallow tree:
# AdaBoost stops boosting early once a round fits the training data perfectly.
# An unrestricted-depth DecisionTreeClassifier can reach that perfect fit in the
# first round, which collapses the "ensemble" to a single tree and makes every
# n_estimators / learning_rate combination in the grid equivalent. Using a weak
# learner (a stump by default) lets the boosting rounds actually happen, and the
# base-tree depth is tuned alongside the boosting parameters.
param_grid_ab = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    # Nested parameter: forwarded to the wrapped DecisionTreeClassifier.
    'estimator__max_depth': [1, 2, 3],
}

# Train and tune the AdaBoost model.
# NOTE(review): recent scikit-learn releases deprecate the `algorithm` argument;
# confirm against the installed version before upgrading.
ab = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
    algorithm='SAMME',
    random_state=42,
)
grid_search_ab = GridSearchCV(ab, param_grid_ab, cv=5, scoring='accuracy')
grid_search_ab.fit(X_train, y_train)
best_ab = grid_search_ab.best_estimator_

# Evaluate the tuned AdaBoost model on the held-out split.
y_pred_ab = best_ab.predict(X_test)
accuracy_ab = accuracy_score(y_test, y_pred_ab)

print(f'Best AdaBoost Accuracy: {accuracy_ab}')

Best AdaBoost Accuracy: 1.0


## Tugas 3

Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma berikut:

1. Logistic Regression
2. SVM kernel polynomial
3. Decision Tree

In [4]:
# ---------------------------------------------------------------------------
# Task 3: soft-voting ensemble (Logistic Regression + polynomial SVM +
# Decision Tree) on the diabetes data
# ---------------------------------------------------------------------------

# Read the CSV. 'Outcome' is the target; the remaining columns are features.
df_diabetes = pd.read_csv('./data/diabetes.csv')
y_diabetes = df_diabetes['Outcome']
X_diabetes = df_diabetes.drop(columns='Outcome')

# Hold out 30% of the rows for the final evaluation; the seed keeps the split repeatable.
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
    X_diabetes, y_diabetes, test_size=0.3, random_state=42
)

# Standardize the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train_diabetes)
X_train_diabetes = scaler.transform(X_train_diabetes)
X_test_diabetes = scaler.transform(X_test_diabetes)

# Base classifiers. probability=True makes the SVC expose class probabilities,
# which the soft-voting ensemble below consumes.
log_clf = LogisticRegression(random_state=42)
svm_clf = SVC(kernel='poly', probability=True, random_state=42)
dt_clf = DecisionTreeClassifier(random_state=42)

# Hyperparameter search spaces, one per classifier.
param_grid_log = dict(C=[0.1, 1, 10, 100])

param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    degree=[2, 3, 4],
)

# NOTE: `param_grid_dt` and `grid_search_dt` are also assigned by the mushroom
# Decision Tree search in the Task 1 cell; after this cell runs they refer to the
# diabetes search instead.
param_grid_dt = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)

# 5-fold cross-validated grid search per classifier, scored on accuracy.
# GridSearchCV.fit returns the fitted search object, so it can be bound directly.
grid_search_log = GridSearchCV(
    estimator=log_clf, param_grid=param_grid_log, scoring='accuracy', cv=5
).fit(X_train_diabetes, y_train_diabetes)
grid_search_svm = GridSearchCV(
    estimator=svm_clf, param_grid=param_grid_svm, scoring='accuracy', cv=5
).fit(X_train_diabetes, y_train_diabetes)
grid_search_dt = GridSearchCV(
    estimator=dt_clf, param_grid=param_grid_dt, scoring='accuracy', cv=5
).fit(X_train_diabetes, y_train_diabetes)

# Best configuration found for each classifier.
best_log_clf = grid_search_log.best_estimator_
best_svm_clf = grid_search_svm.best_estimator_
best_dt_clf = grid_search_dt.best_estimator_

# Combine the three tuned models in a soft-voting ensemble and train it.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[('lr', best_log_clf), ('svc', best_svm_clf), ('dt', best_dt_clf)],
)
voting_clf.fit(X_train_diabetes, y_train_diabetes)

# Score the ensemble on the held-out rows.
y_pred_voting = voting_clf.predict(X_test_diabetes)
accuracy_voting = accuracy_score(y_test_diabetes, y_pred_voting)

print(f'Voting Classifier Accuracy: {accuracy_voting}')

Voting Classifier Accuracy: 0.7186147186147186
