<a href="https://colab.research.google.com/github/AaronK99/git_test/blob/main/Ensemble_Methods_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Excercise 1

Construct the following models on the same dataset:
- Bagging
- Random Forest
- Adaboost

Compare their performance and write a short paragraph on which one is the best. You are free to change the hyperparameters.


In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Construct and train Bagging model
bagging_model = BaggingClassifier(n_estimators=50, random_state=42)
bagging_model.fit(X_train, y_train)
bagging_pred = bagging_model.predict(X_test)

# Construct and train Random Forest model
rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Construct and train AdaBoost model
adaboost_model = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost_model.fit(X_train, y_train)
adaboost_pred = adaboost_model.predict(X_test)

# Evaluate the performance
bagging_acc = accuracy_score(y_test, bagging_pred)
rf_acc = accuracy_score(y_test, rf_pred)
adaboost_acc = accuracy_score(y_test, adaboost_pred)

# Display the results with more precision
results = pd.DataFrame({
    'Model': ['Bagging', 'Random Forest', 'AdaBoost'],
    'Accuracy': [bagging_acc, rf_acc, adaboost_acc]
})

print("Accuracy Scores:")
print(results)

print("\nClassification Reports:")
print("Bagging Classifier:")
print(classification_report(y_test, bagging_pred, digits=4))
print("Random Forest Classifier:")
print(classification_report(y_test, rf_pred, digits=4))
print("AdaBoost Classifier:")
print(classification_report(y_test, adaboost_pred, digits=4))

Accuracy Scores:
           Model  Accuracy
0        Bagging  0.959064
1  Random Forest  0.970760
2       AdaBoost  0.976608

Classification Reports:
Bagging Classifier:
              precision    recall  f1-score   support

           0     0.9516    0.9365    0.9440        63
           1     0.9633    0.9722    0.9677       108

    accuracy                         0.9591       171
   macro avg     0.9575    0.9544    0.9559       171
weighted avg     0.9590    0.9591    0.9590       171

Random Forest Classifier:
              precision    recall  f1-score   support

           0     0.9833    0.9365    0.9593        63
           1     0.9640    0.9907    0.9772       108

    accuracy                         0.9708       171
   macro avg     0.9736    0.9636    0.9683       171
weighted avg     0.9711    0.9708    0.9706       171

AdaBoost Classifier:
              precision    recall  f1-score   support

           0     0.9683    0.9683    0.9683        63
           1     0.9

The AdaBoost classifier performed the best on the Breast Cancer dataset with an accuracy of 97.66%, followed closely by the Random Forest classifier at 97.08%. The Bagging classifier had the lowest accuracy at 95.91%. Overall, AdaBoost's ability to handle misclassified instances adaptively gave it a slight edge over the others. While all three models performed well, AdaBoost is the best choice for this dataset.

# Exercise 2

The accuracy for this dataset is quite low. Can you try any other method that increases the accuracy. You can try either Random Forest or Adaboost. What do you notice?

In [5]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
cuisines_df = pd.read_csv("https://an-utd-python.s3.us-west-1.amazonaws.com/cuisines.csv")

# Separate features and labels
cuisines_label_df = cuisines_df['cuisine']
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for Gradient Boosting
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

gb_grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=gb_param_grid, cv=3, n_jobs=-1, verbose=2)
gb_grid_search.fit(X_train_scaled, y_train)
best_gb = gb_grid_search.best_estimator_

# Evaluate the best Gradient Boosting model
y_pred_gb = best_gb.predict(X_test_scaled)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Best Gradient Boosting Model Accuracy:", accuracy_gb)
print(classification_report(y_test, y_pred_gb))


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Gradient Boosting Model Accuracy: 0.8165137614678899
              precision    recall  f1-score   support

     chinese       0.76      0.73      0.75       236
      indian       0.90      0.94      0.92       245
    japanese       0.78      0.78      0.78       231
      korean       0.83      0.77      0.80       242
        thai       0.80      0.86      0.83       245

    accuracy                           0.82      1199
   macro avg       0.82      0.82      0.81      1199
weighted avg       0.82      0.82      0.82      1199



The optimized Random Forest model performed the best on the cuisines dataset with an accuracy of 84.07%, making it the most effective choice. The Gradient Boosting model also showed good performance with an accuracy of 81.65%, but it did not surpass Random Forest. AdaBoost had the lowest accuracy at 67.14%, indicating it is less suitable for this dataset. Overall, Random Forest is recommended due to its highest accuracy and balanced performance across different cuisine classes. Further hyperparameter tuning or exploring other models could potentially enhance results even more.


# Exercise 3

Try other combination of hyperparameters for Random Forest and AdaBoost models and check how good of an accuracy you can obtain.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the dataset
cuisines_df = pd.read_csv("https://an-utd-python.s3.us-west-1.amazonaws.com/cuisines.csv")

# Separate features and labels
cuisines_label_df = cuisines_df['cuisine']
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3, random_state=42)

# Define pipelines with standard scaling
rf_pipe = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier(random_state=42))])
ab_pipe = Pipeline([('scaler', StandardScaler()), ('ab', AdaBoostClassifier(random_state=42))])

# Define extended hyperparameter grids
rf_param_grid = {
    'rf__n_estimators': [100, 200, 300, 500],
    'rf__max_depth': [7, 9, 11, 13],
    'rf__max_features': ['auto', 'sqrt', 'log2'],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

ab_param_grid = {
    'ab__n_estimators': [50, 100, 200, 300],
    'ab__learning_rate': [0.01, 0.1, 0.5, 1.0]
}

# Perform grid search for Random Forest
rf_grid_search = GridSearchCV(estimator=rf_pipe, param_grid=rf_param_grid, cv=3, n_jobs=-1, verbose=2)
rf_grid_search.fit(X_train, y_train)
best_rf = rf_grid_search.best_estimator_

# Evaluate the best Random Forest model
y_pred_rf = best_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Best Random Forest Model Accuracy:", accuracy_rf)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print(classification_report(y_test, y_pred_rf))

# Perform grid search for AdaBoost
ab_grid_search = GridSearchCV(estimator=ab_pipe, param_grid=ab_param_grid, cv=3, n_jobs=-1, verbose=2)
ab_grid_search.fit(X_train, y_train)
best_ab = ab_grid_search.best_estimator_

# Evaluate the best AdaBoost model
y_pred_ab = best_ab.predict(X_test)
accuracy_ab = accuracy_score(y_test, y_pred_ab)
print("Best AdaBoost Model Accuracy:", accuracy_ab)
print("Best AdaBoost Parameters:", ab_grid_search.best_params_)
print(classification_report(y_test, y_pred_ab))

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
