In [1]:
# Q6: RandomForest on Breast Cancer - top 5 features
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Pull the feature matrix, labels and column names out of the dataset bundle.
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Stratified 80/20 hold-out split with a fixed seed so the run is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit a 200-tree forest on the training portion only.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# Order feature positions from highest to lowest importance.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
top5 = indices[:5]

print("Top 5 features (feature : importance)")
for name, score in zip(feature_names[top5], importances[top5]):
    print(f"{name} : {score:.4f}")

# Hold-out accuracy of the same fitted forest.
test_pred = rf.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, test_pred))


Top 5 features (feature : importance)
worst perimeter : 0.1331
worst area : 0.1281
worst concave points : 0.1081
mean concave points : 0.0944
worst radius : 0.0906
Test accuracy: 0.956140350877193


In [3]:
# Q7: Train a Bagging Classifier using Decision Trees on the Iris dataset and compare its accuracy with a single Decision Tree.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

# Iris data: feature matrix and class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Stratified 75/25 train/test split with a fixed seed.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Baseline: one decision tree.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_acc = accuracy_score(y_test, dt_pred)

# Ensemble: 50 bagged decision trees built from the same kind of base estimator.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
bag_pred = bag.predict(X_test)
bag_acc = accuracy_score(y_test, bag_pred)

# Report both hold-out accuracies side by side.
print(f"Single Decision Tree Accuracy: {dt_acc:.4f}")
print(f"Bagging Classifier Accuracy: {bag_acc:.4f}")


Single Decision Tree Accuracy: 0.8947
Bagging Classifier Accuracy: 0.9211


In [4]:
# Q8: RandomForest + GridSearchCV (tune max_depth and n_estimators)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Breast-cancer data with a stratified 80/20 hold-out split.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Search space: 3 forest sizes x 4 depth limits (None leaves depth unlimited).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
)

# 5-fold grid search scored by accuracy, fitted on the training split only.
rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

# Score the best estimator found by the search on the untouched test split.
best_model = grid.best_estimator_
best_test_pred = best_model.predict(X_test)
print("Test accuracy (best model):", accuracy_score(y_test, best_test_pred))


Best parameters: {'max_depth': None, 'n_estimators': 200}
Best CV accuracy: 0.9604395604395606
Test accuracy (best model): 0.956140350877193


In [6]:
# Q9: Train a Bagging Regressor and Random Forest Regressor on the California Housing dataset and compare their MSEs.

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error

# --- Data: California Housing features and target ---
housing = fetch_california_housing()
X = housing.data
y = housing.target

# --- 80/20 train/test split with a fixed seed ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Standardise features (optional for tree models, kept for consistency) ---
# The scaler is fitted on the training split and then applied to the test split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Settings shared by both ensembles.
ensemble_options = {"random_state": 42, "n_jobs": -1}

# NOTE(review): the two ensembles below use different n_estimators (50 vs 100),
# so the MSE gap is not attributable to the algorithm alone.

# --- Bagging: 50 bagged regression trees ---
bag_reg = BaggingRegressor(
    estimator=DecisionTreeRegressor(random_state=42),
    n_estimators=50,
    **ensemble_options,
).fit(X_train_s, y_train)
y_pred_bag = bag_reg.predict(X_test_s)
mse_bag = mean_squared_error(y_test, y_pred_bag)

# --- Random forest: 100 trees ---
rf_reg = RandomForestRegressor(n_estimators=100, **ensemble_options).fit(
    X_train_s, y_train
)
y_pred_rf = rf_reg.predict(X_test_s)
mse_rf = mean_squared_error(y_test, y_pred_rf)

# --- Report test-set mean squared errors ---
print(f"Bagging Regressor MSE: {mse_bag:.4f}")
print(f"Random Forest Regressor MSE: {mse_rf:.4f}")


Bagging Regressor MSE: 0.2570
Random Forest Regressor MSE: 0.2552


In [7]:
# Q10: Loan Default Prediction using Ensemble Learning

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import make_classification

# --- Synthetic stand-in for customer data (replace with real customer data) ---
# 2000 rows, 15 features, class weights 0.7 / 0.3.
X, y = make_classification(
    n_samples=2000,
    n_features=15,
    n_informative=8,
    n_redundant=2,
    n_clusters_per_class=2,
    weights=[0.7, 0.3],
    flip_y=0.02,
    random_state=42,
)

# --- Stratified 80/20 hold-out split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Random Forest (Bagging) ---
rf = RandomForestClassifier(
    n_estimators=200, max_depth=8, min_samples_leaf=5, random_state=42
).fit(X_train, y_train)
# Hard labels are kept but not used further in this cell; the AUC is computed
# from the predicted probability of class 1.
rf_pred = rf.predict(X_test)
rf_proba = rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_proba)

# --- Gradient Boosting (Boosting) ---
gb = GradientBoostingClassifier(
    n_estimators=150, learning_rate=0.1, max_depth=3, random_state=42
).fit(X_train, y_train)
gb_pred = gb.predict(X_test)
gb_proba = gb.predict_proba(X_test)[:, 1]
gb_auc = roc_auc_score(y_test, gb_proba)

# --- 5-fold cross-validated ROC AUC on the training split ---
cv_options = {"cv": 5, "scoring": "roc_auc"}
rf_cv = cross_val_score(rf, X_train, y_train, **cv_options)
gb_cv = cross_val_score(gb, X_train, y_train, **cv_options)

# --- Hold-out AUC next to the mean cross-validated AUC for each model ---
print("Random Forest  AUC:", rf_auc, "|  Mean CV:", rf_cv.mean())
print("Gradient Boost AUC:", gb_auc, "|  Mean CV:", gb_cv.mean())


Random Forest  AUC: 0.8869042329452888 |  Mean CV: 0.912274051130322
Gradient Boost AUC: 0.891229005598507 |  Mean CV: 0.9087050991632379
