# Mini Project: Trees and Forests

Working with decision trees, random forests, and AdaBoost on the breast cancer dataset.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

## Load the data

First, let's load the breast cancer dataset and split it into training and test sets.

In [2]:
# Fetch the bundled dataset and build one frame: a column per feature,
# plus the class label in a trailing `target` column.
cancer_data = load_breast_cancer()
df = pd.DataFrame(
    data=cancer_data.data,
    columns=cancer_data.feature_names,
).assign(target=cancer_data.target)

In [3]:
# Separate the label from the predictors, then hold out 20% of the rows
# for testing (the seed keeps the split identical across reruns).
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preview the first five rows (features plus the `target` column)
df.head(n=5)

## Decision Tree

Let's try a decision tree classifier first

In [5]:
# Baseline: a single tree with default settings and a fixed seed,
# fitted on the training split only.
tree_model = DecisionTreeClassifier(random_state=99)
tree_model.fit(X=X_train, y=y_train)

In [6]:
# Draw the fitted baseline tree.
# feature_names is passed as a plain list rather than a pandas Index, because
# some scikit-learn releases validate this argument strictly and reject an Index.
# class_names must follow ascending label order.
# NOTE(review): assumes label 0 is malignant and 1 is benign, i.e. the order of
# cancer_data.target_names - confirm.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree_model,
    feature_names=list(X.columns),
    class_names=['Malignant', 'Benign'],
    filled=True,
    rounded=True,
    ax=ax,
)
plt.show()

In [7]:
# Sweep the depth cap and report train vs. test accuracy for each setting;
# a widening gap between the two is the sign of overfitting to look for.
depths = [3, 5, 10, None]
for depth in depths:
    tree_model_tuned = DecisionTreeClassifier(
        max_depth=depth,
        random_state=99,
    ).fit(X_train, y_train)

    train_score = tree_model_tuned.score(X_train, y_train)
    test_score = tree_model_tuned.score(X_test, y_test)

    print(
        f"Max Depth: {depth}, Train Score: {train_score:.3f}, Test Score: {test_score:.3f}"
    )

## Random Forest

Now let's try a random forest classifier

In [8]:
# Fit a forest of 100 depth-limited trees on the training split
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=99,
)
rf_model.fit(X=X_train, y=y_train)

In [9]:
# Pair every feature name with the forest's importance score, then build a
# second dict ordered from most to least important.
feature_importances = {
    name: score
    for name, score in zip(X.columns, rf_model.feature_importances_)
}
ranked_pairs = sorted(
    feature_importances.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_importances = dict(ranked_pairs)

In [10]:
# Show only the ten highest-ranked features instead of dumping the whole dict;
# sorted_importances is already ordered from most to least important, and a
# Series renders as an aligned table.
pd.Series(sorted_importances, name='importance').head(10)

In [11]:
# Horizontal bar chart of the strongest features.
# top_n is the single knob: both slices follow it, and the title reports the
# number of bars actually drawn (which is smaller if there are fewer features).
top_n = 15
features = list(sorted_importances.keys())[:top_n]
importances = list(sorted_importances.values())[:top_n]

fig, ax = plt.subplots(figsize=(12, 8))
positions = range(len(features))
ax.barh(positions, importances)
ax.set_yticks(positions)
ax.set_yticklabels(features)
ax.set_xlabel('Feature Importance')
ax.set_title(f'Top {len(features)} Feature Importances')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
plt.show()

## AdaBoost

Now let's try AdaBoost, using a decision stump (a tree of depth 1) as the weak learner.

In [12]:
# Boost 100 rounds of a depth-1 tree (the weak learner)
stump = DecisionTreeClassifier(max_depth=1, random_state=99)
ada_model = AdaBoostClassifier(stump, n_estimators=100, random_state=99)
ada_model.fit(X_train, y_train)

In [13]:
# Compare all models on the held-out test set
def report_test_accuracy(label, model):
    """Score a fitted model on the test split and print the result.

    Parameters
    ----------
    label : str
        Name printed in front of the accuracy.
    model : fitted estimator
        Any object exposing ``predict``.

    Returns
    -------
    tuple
        ``(predictions, accuracy)``: the predicted labels for ``X_test`` and
        the fraction of them that match ``y_test``.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{label} Accuracy: {accuracy * 100:.2f}%")
    return predictions, accuracy


# One call per model replaces three copies of the predict/score/print sequence;
# the printed lines and the resulting variables are the same as before.
y_pred_ada, accuracy_ada = report_test_accuracy("AdaBoost", ada_model)

y_pred_tree, accuracy_tree = report_test_accuracy("Tree", tree_model)

y_pred_rf, accuracy_rf = report_test_accuracy("RF", rf_model)


AdaBoost Accuracy: 97.37%
Tree Accuracy: 94.74%
RF Accuracy: 96.49%


## Results

AdaBoost performed best with 97.37% accuracy, followed by Random Forest at 96.49%, and the single decision tree at 94.74%.

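The ensemble methods (Random Forest and AdaBoost) performed better than the single decision tree, which makes sense since they combine the predictions of many trees rather than relying on one. Keep in mind that these numbers come from a single 80/20 train/test split and the gaps between models are small; the cross-validation below gives a more reliable comparison.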

In [14]:
# Try cross-validation
# Each model is cross-validated once and the same fold scores feed both the
# mean and the spread, so the work is not repeated per printed statistic.
# cross_val_score is imported in the notebook's import cell at the top.
cv_models = {
    "Decision Tree": tree_model,
    "Random Forest": rf_model,
    "AdaBoost": ada_model,
}

print("Cross-validation scores (5-fold):")
for label, model in cv_models.items():
    fold_scores = cross_val_score(model, X, y, cv=5)
    # +/- is reported as two standard deviations of the fold scores
    print(f"{label}: {fold_scores.mean():.3f} (+/- {fold_scores.std() * 2:.3f})")