# Ensemble Method: Bagging
## Using optimized models from earlier experiments

In [1]:
from sklearn.ensemble import BaggingClassifier
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans

In [2]:
from common import load_image, show_image, load_data, split_data, validation_scores, merge_results

# Load the full dataset through the project helper.
df = load_data()

# Split into train/test partitions (the split policy lives in common.split_data).
X_train, X_test, y_train, y_test = split_data(df)
# Preview the shapes of the raw split, i.e. before the cluster feature is appended.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Add K-Means cluster labels as an extra feature column.
# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto', so leaving it unset makes the clustering (and everything trained on
# it) depend on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)

# The clustering is fitted on the training features only; the test features
# are merely assigned to the learned clusters.
# .assign() builds new frames instead of writing into the objects returned by
# split_data, which may be slices of `df` (SettingWithCopyWarning / aliasing).
# Assumes X_train and X_test are pandas DataFrames -- TODO confirm in common.py.
X_train = X_train.assign(cluster=kmeans.fit_predict(X_train))
X_test = X_test.assign(cluster=kmeans.predict(X_test))

(5335, 256) (5335,) (1334, 256) (1334,)


In [3]:
# Base models. The hyperparameters are the tuned values carried over from the
# earlier experiments mentioned in the notebook title.
mlp = MLPClassifier(
    activation='tanh',
    alpha=0.01,
    hidden_layer_sizes=(50,),
    solver='adam',
    max_iter=500,
)

svm = SVC(
    kernel='rbf',
    C=10,
    gamma=0.001,
    probability=True,  # enables predict_proba, required for probability-based voting
)

gnb = GaussianNB()

# NOTE(review): a random forest is already an ensemble of trees, so wrapping it
# in BaggingClassifier below trains 10 forests of 100 trees each. Confirm that
# this extra layer is intended and worth the training cost.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
)

# Bagged versions of each base model (10 resampled copies each, fixed seed).
# The base model is passed positionally on purpose: the keyword was named
# `base_estimator` before scikit-learn 1.2 and `estimator` from 1.2 onwards
# (the old name was removed in 1.4), but it is the first parameter in both
# signatures, so the positional form runs on every version.
bagged_mlp = BaggingClassifier(mlp, n_estimators=10, random_state=42)
bagged_svm = BaggingClassifier(svm, n_estimators=10, random_state=42)
bagged_nb = BaggingClassifier(gnb, n_estimators=10, random_state=42)
bagged_rf = BaggingClassifier(rf, n_estimators=10, random_state=42)

In [4]:
# Blend the four bagged models into a single VotingClassifier.
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs in the order the ensemble will hold them.
bagged_members = [
    ('mlp', bagged_mlp),
    ('svm', bagged_svm),
    ('nb', bagged_nb),
    ('rf', bagged_rf),
]

# 'soft' voting combines the members' predicted class probabilities rather
# than counting their hard label votes.
bagging_ensemble = VotingClassifier(estimators=bagged_members, voting='soft')


In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import pandas as pd

# Train the voting ensemble on the training split.
bagging_ensemble.fit(X_train, y_train)

# Hard labels feed accuracy and F1; class probabilities feed ROC AUC.
y_pred = bagging_ensemble.predict(X_test)
y_pred_proba = bagging_ensemble.predict_proba(X_test)

best_acc = accuracy_score(y_test, y_pred)
best_f1 = f1_score(y_test, y_pred, average='weighted')
best_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')

# One-row summary table of the held-out test metrics.
best_scores = pd.DataFrame(
    [[best_acc, best_f1, best_roc_auc]],
    columns=['Accuracy', 'f1 Score', 'ROC AUC'],
)

best_scores


Unnamed: 0,Accuracy,f1 Score,ROC AUC
0,0.898051,0.898338,0.988435


In [8]:
# Score the competition test features and write the submission file.
# This cell relies on `kmeans` and `bagging_ensemble` having been fitted by the
# earlier cells, so run the notebook top to bottom.
test_feature_data = pd.read_csv('data/test_feature_data.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

# Append the K-Means cluster label, mirroring the extra feature column that
# was added to the training and test splits.
test_feature_data = test_feature_data.assign(
    cluster=kmeans.predict(test_feature_data)
)
predictions = bagging_ensemble.predict(test_feature_data)

# Same layout as sample_submission.csv, with the model output stored in the
# 'prediction' column.
submission = sample_submission.assign(prediction=predictions)

# Persist the submission without the pandas index column.
submission.to_csv('final_submission.csv', index=False)