<a href="https://colab.research.google.com/github/Don-Ho25/Colab_Git_Assignment2/blob/main/Lesson11/Assignment11_AI_ImageClassification_Using_RandomFores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Assignment11

DONG

[GITHUBLINK](https://github.com/Don-Ho25/Colab_Git_Assignment2/blob/main/Lesson11/Assignment11_AI_ImageClassification_Using_RandomFores.ipynb)



We have chosen the MNIST dataset. It is a subset of a larger set available from NIST (http://yann.lecun.com/exdb/mnist/)

The MNIST database of handwritten digits has a training set of 60,000 examples, and a test set of 10,000 examples.


In [None]:
# Dataset Selection and Preprocessing:
#     Choose a publicly available image dataset suitable for classification. Examples include datasets from Kaggle, UCI Machine Learning Repository, or Google Dataset Search.
#     Perform necessary preprocessing steps including:
#     Loading the image files.
#     Resizing images to a uniform size.
#     Normalizing pixel values.
#     Splitting the dataset into training and testing sets.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.datasets import mnist
import warnings
warnings.filterwarnings('ignore')

In [None]:
# 1. DATA LOADING AND PREPROCESSING
print("\n1. LOADING AND PREPROCESSING DATA", "-" * 40, sep="\n")

# mnist.load_data() hands back a (train, test) pair, each an (images, labels) pair.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# One-line-per-fact overview of what was just loaded.
for label, value in (
    ('Number of training images', len(x_train)),
    ('Number of testing images', len(x_test)),
    ('Image shape', x_train.shape[1:]),
    ('Number of classes', np.unique(y_train).size),
):
    print(f'{label}: {value}')

# Preprocessing steps
def preprocess_data(x_train, x_test, y_train, y_test):
    """
    Flatten and rescale image arrays so they can be fed to the classifiers.

    Every image array is reshaped to one row per sample, cast to float32
    and divided by 255.0. A short summary of the training data is printed.
    The label arrays are passed through untouched.

    Parameters:
    x_train, x_test: image arrays whose first axis indexes samples
    y_train, y_test: label arrays (returned as-is)

    Returns:
    (x_train_norm, x_test_norm, y_train, y_test)
    """
    def _flatten_and_scale(images):
        # One row per image, then float32 pixel values divided by 255.0
        flat = images.reshape(images.shape[0], -1)
        return flat, flat.astype('float32') / 255.0

    train_flat, train_scaled = _flatten_and_scale(x_train)
    _, test_scaled = _flatten_and_scale(x_test)

    # Summary is reported for the training split only
    lowest, highest = train_scaled.min(), train_scaled.max()
    print(f"Original image shape: {x_train.shape[1:]}")
    print(f"Flattened shape: {train_flat.shape[1]}")
    print(f"Pixel value range after normalization: [{lowest:.1f}, {highest:.1f}]")

    return train_scaled, test_scaled, y_train, y_test

# Run the preprocessing step on the raw MNIST arrays
(x_train_processed,
 x_test_processed,
 y_train_processed,
 y_test_processed) = preprocess_data(x_train, x_test, y_train, y_test)

# Work on a reduced slice of the data to keep the grid searches fast.
# Raise these sizes for better accuracy at the cost of longer training time.
TRAIN_SIZE = 10_000
TEST_SIZE = 2_000

# Keep only the leading TRAIN_SIZE / TEST_SIZE rows of each split
x_train_subset, y_train_subset = x_train_processed[:TRAIN_SIZE], y_train_processed[:TRAIN_SIZE]
x_test_subset, y_test_subset = x_test_processed[:TEST_SIZE], y_test_processed[:TEST_SIZE]

print(
    "\nUsing subset for faster computation:",
    f"Training samples: {len(x_train_subset)}",
    f"Testing samples: {len(x_test_subset)}",
    sep="\n",
)


1. LOADING AND PREPROCESSING DATA
----------------------------------------
Number of training images: 60000
Number of testing images: 10000
Image shape: (28, 28)
Number of classes: 10
Original image shape: (28, 28)
Flattened shape: 784
Pixel value range after normalization: [0.0, 1.0]

Using subset for faster computation:
Training samples: 10000
Testing samples: 2000


In [None]:

# Set random seed for reproducibility.
# This seeds NumPy's global RNG, which the later np.random.choice call (used to
# pick the sample predictions shown in the visualization cell) draws from.
# The estimators are given their own random_state=42 where they are created.
np.random.seed(42)

In [None]:
# 2. RANDOM FOREST CLASSIFIER
# Section banner: title line followed by a 40-character rule.
print("\n\n2. RANDOM FOREST CLASSIFIER", "-" * 40, sep="\n")



2. RANDOM FOREST CLASSIFIER
----------------------------------------


In [None]:
# Hyper-parameter candidates explored by GridSearchCV for the Random Forest
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Echo the grid, one hyper-parameter per line
print("Parameter grid for Random Forest:")
print("\n".join(f"  {name}: {options}" for name, options in rf_param_grid.items()))


Parameter grid for Random Forest:
  n_estimators: [50, 100, 200]
  max_depth: [10, 20, None]
  min_samples_split: [2, 5, 10]
  min_samples_leaf: [1, 2, 4]


In [None]:
# Base estimator; the searched hyper-parameters are supplied by rf_param_grid
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)

print(
    "\nPerforming Grid Search for Random Forest...",
    "This may take a few minutes...",
    sep="\n",
)

# Exhaustive search over the grid, scored by accuracy with 3-fold cross-validation
rf_grid_search = GridSearchCV(
    rf_classifier,
    rf_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training subset
rf_grid_search.fit(x_train_subset, y_train_subset)

# Keep the winning configuration and the corresponding fitted model
rf_best_model = rf_grid_search.best_estimator_
rf_best_params = rf_grid_search.best_params_

print("\nBest Random Forest Parameters:")
for name, chosen in rf_best_params.items():
    print(f"  {name}: {chosen}")
print(f"Best Cross-Validation Score: {rf_grid_search.best_score_:.4f}")


Performing Grid Search for Random Forest...
This may take a few minutes...
Fitting 3 folds for each of 81 candidates, totalling 243 fits

Best Random Forest Parameters:
  max_depth: 20
  min_samples_leaf: 1
  min_samples_split: 2
  n_estimators: 200
Best Cross-Validation Score: 0.9450


In [None]:
# 3. RANDOM FOREST MODEL EVALUATION
# Section banner: title line followed by a 40-character rule.
print("\n\n3. RANDOM FOREST MODEL EVALUATION", "-" * 40, sep="\n")



3. RANDOM FOREST MODEL EVALUATION
----------------------------------------


In [None]:
# Predict labels for the test subset with the best model from the grid search
rf_predictions = rf_best_model.predict(x_test_subset)

# Accuracy, plus precision / recall / F1 computed with average='weighted'
rf_accuracy = accuracy_score(y_test_subset, rf_predictions)
rf_precision, rf_recall, rf_f1 = (
    scorer(y_test_subset, rf_predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Random Forest Performance Metrics:")
for label, score in (
    ("Accuracy:", rf_accuracy),
    ("Precision:", rf_precision),
    ("Recall:", rf_recall),
    ("F1-Score:", rf_f1),
):
    # Labels are left-padded to 11 characters so the scores line up
    print(f"  {label:<11}{score:.4f}")

# Confusion matrix (true labels vs. predictions); only its shape is printed here
rf_cm = confusion_matrix(y_test_subset, rf_predictions)
print(f"\nConfusion Matrix Shape: {rf_cm.shape}")

# Per-class breakdown
print("\nClassification Report:")
print(classification_report(y_test_subset, rf_predictions))

In [None]:
# 4. VISUALIZATIONS
# Section banner: title line followed by a 40-character rule.
print("\n\n4. CREATING VISUALIZATIONS", "-" * 40, sep="\n")



4. CREATING VISUALIZATIONS
----------------------------------------


In [None]:
# Create a 2x2 grid of diagnostic panels for the tuned Random Forest
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Random Forest Model Analysis', fontsize=16, fontweight='bold')

# Plot 1: Confusion Matrix
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=range(10), yticklabels=range(10), ax=axes[0,0])
axes[0,0].set_title('Confusion Matrix - Random Forest')
axes[0,0].set_xlabel('Predicted Label')
axes[0,0].set_ylabel('True Label')

# Plot 2: Feature Importance (top 100 features)
rf_feature_importance = rf_best_model.feature_importances_
# argsort is ascending, so the last 100 entries are the 100 most important features
top_features_idx = np.argsort(rf_feature_importance)[-100:]
top_features_importance = rf_feature_importance[top_features_idx]

axes[0,1].barh(range(len(top_features_importance)), top_features_importance)
axes[0,1].set_title('Top 100 Feature Importances - Random Forest')
axes[0,1].set_xlabel('Importance')
# Bars are positioned by rank (0..99), not by the pixel index held in
# top_features_idx, so the axis is labelled as a rank.
axes[0,1].set_ylabel('Rank within top 100 (ascending importance)')

# Plot 3: Feature Importance Heatmap (reshaped to 28x28)
importance_image = rf_feature_importance.reshape(28, 28)
im = axes[1,0].imshow(importance_image, cmap='hot', interpolation='nearest')
axes[1,0].set_title('Feature Importance Heatmap (28x28)')
axes[1,0].set_xlabel('Pixel Column')
axes[1,0].set_ylabel('Pixel Row')
plt.colorbar(im, ax=axes[1,0])

# Plot 4: Sample predictions, listed as text (true vs. predicted label)
# for six test samples drawn without replacement from NumPy's global RNG.
sample_indices = np.random.choice(len(x_test_subset), 6, replace=False)
for i, idx in enumerate(sample_indices):
    true_label = y_test_subset[idx]
    pred_label = rf_predictions[idx]

    # One line per sample, stacked downwards in axes coordinates
    prediction_text = f"Sample {i+1}: True={true_label}, Pred={pred_label}"
    axes[1,1].text(0.05, 0.8 - i*0.12, prediction_text,
                   transform=axes[1,1].transAxes, fontsize=10)

# Hide the empty axes frame; the panel heading comes from the title alone
axes[1,1].axis('off')
axes[1,1].set_title('Sample Predictions')

plt.tight_layout()
plt.show()

In [None]:
# 5. PREDICTION FUNCTION
# Section banner: title line followed by a 40-character rule.
print("\n\n5. PREDICTION FUNCTION", "-" * 40, sep="\n")

In [None]:
def predict_new_image(model, image_array):
    """
    Predict the class of a new image using the trained model.

    Parameters:
    model: Trained classifier exposing predict() and, optionally,
        predict_proba()
    image_array: array-like holding a single image, e.g. shape (28, 28)
        or (784,). It is flattened to one row before prediction.
        Integer-typed input (e.g. uint8) is always treated as raw pixel
        data and divided by 255. Other input is divided by 255 only when
        its maximum exceeds 1; otherwise it is assumed to be normalized
        already and is passed through unchanged.

    Returns:
    predicted_class: first element of model.predict(...)
    prediction_probability: first row of model.predict_proba(...), or None
        when the model has no predict_proba

    Raises:
    ValueError: if image_array contains no pixels
    """
    # Accept lists / nested lists as well as ndarrays
    pixels = np.asarray(image_array)
    if pixels.size == 0:
        raise ValueError("image_array must contain at least one pixel")

    # A single sample as one row, whatever the input rank was
    image_flat = pixels.reshape(1, -1)

    # Integer dtypes are always raw pixel values, even for very dark images
    # whose brightest pixel is <= 1; the max() check alone would skip those.
    is_raw_pixels = np.issubdtype(image_flat.dtype, np.integer)
    if is_raw_pixels or image_flat.max() > 1:
        image_flat = image_flat.astype('float32') / 255.0

    # Make prediction
    predicted_class = model.predict(image_flat)[0]

    # Get prediction probabilities if available
    if hasattr(model, 'predict_proba'):
        return predicted_class, model.predict_proba(image_flat)[0]
    return predicted_class, None

# Smoke-test the prediction helper on the first raw test image
test_image_idx = 0
test_image, true_label = x_test[test_image_idx], y_test[test_image_idx]

predicted_class, pred_proba = predict_new_image(rf_best_model, test_image)

print(
    "Test Image Prediction:",
    f"  True Label: {true_label}",
    f"  Predicted Label: {predicted_class}",
    f"  Prediction Correct: {predicted_class == true_label}",
    sep="\n",
)

# Per-class probabilities are only reported when the helper returned them
if pred_proba is not None:
    print("  Prediction Probabilities:")
    for class_id, probability in enumerate(pred_proba):
        print(f"    Class {class_id}: {probability:.4f}")

In [None]:
# 6. SVM CLASSIFIER (BONUS)
# Section banner: title line followed by a 40-character rule.
print("\n\n6. SVM CLASSIFIER (BONUS)", "-" * 40, sep="\n")

In [None]:
# Define parameter grid for SVM.
# The grid is split per kernel: gamma is a kernel coefficient that the linear
# kernel does not use, so crossing it with 'linear' would only refit identical
# models (3 duplicate candidates x 3 folds). GridSearchCV accepts a list of
# dicts and searches each sub-grid in turn.
svm_param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

print("Parameter grid for SVM:")
for sub_grid in svm_param_grid:
    # One line per sub-grid
    print("  " + ", ".join(f"{param}: {values}" for param, values in sub_grid.items()))

# Create SVM classifier.
# probability=True is kept so the fitted model exposes predict_proba, which
# predict_new_image uses when available.
svm_classifier = SVC(random_state=42, probability=True)

# Perform Grid Search for SVM
print("\nPerforming Grid Search for SVM...")
print("This may take longer than Random Forest...")

svm_grid_search = GridSearchCV(
    estimator=svm_classifier,
    param_grid=svm_param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
svm_grid_search.fit(x_train_subset, y_train_subset)

# Get best parameters and model
svm_best_params = svm_grid_search.best_params_
svm_best_model = svm_grid_search.best_estimator_

print(f"\nBest SVM Parameters:")
for param, value in svm_best_params.items():
    print(f"  {param}: {value}")
print(f"Best Cross-Validation Score: {svm_grid_search.best_score_:.4f}")

In [None]:

# 7. SVM MODEL EVALUATION
# Section banner: title line followed by a 40-character rule.
print("\n\n7. SVM MODEL EVALUATION", "-" * 40, sep="\n")

In [None]:
# Predict labels for the test subset with the best SVM from the grid search
svm_predictions = svm_best_model.predict(x_test_subset)

# Accuracy, plus precision / recall / F1 computed with average='weighted'
svm_accuracy = accuracy_score(y_test_subset, svm_predictions)
svm_precision, svm_recall, svm_f1 = [
    scorer(y_test_subset, svm_predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
]

# Report all four scores as one aligned block
print(
    "SVM Performance Metrics:",
    f"  Accuracy:  {svm_accuracy:.4f}",
    f"  Precision: {svm_precision:.4f}",
    f"  Recall:    {svm_recall:.4f}",
    f"  F1-Score:  {svm_f1:.4f}",
    sep="\n",
)

In [None]:
# 8. MODEL COMPARISON
print("\n\n8. MODEL COMPARISON", "-" * 40, sep="\n")

# Column-oriented table: metric names alongside each model's scores
comparison_data = dict([
    ('Metric', ['Accuracy', 'Precision', 'Recall', 'F1-Score']),
    ('Random Forest', [rf_accuracy, rf_precision, rf_recall, rf_f1]),
    ('SVM', [svm_accuracy, svm_precision, svm_recall, svm_f1]),
])

print("Performance Comparison:")
print(f"{'Metric':<12} {'Random Forest':<15} {'SVM':<15} {'Winner':<10}")
print("-" * 55)

# Walk the three columns in lockstep, one table row per metric
for metric, rf_score, svm_score in zip(
    comparison_data['Metric'],
    comparison_data['Random Forest'],
    comparison_data['SVM'],
):
    if rf_score > svm_score:
        winner = 'Random Forest'
    elif svm_score > rf_score:
        winner = 'SVM'
    else:
        winner = 'Tie'
    print(f"{metric:<12} {rf_score:<15.4f} {svm_score:<15.4f} {winner:<10}")

# Side-by-side figure: grouped score bars (left) and the SVM confusion matrix (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Scores to plot, in the same metric order for both models
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
rf_scores = [rf_accuracy, rf_precision, rf_recall, rf_f1]
svm_scores = [svm_accuracy, svm_precision, svm_recall, svm_f1]

x = np.arange(len(metrics))
width = 0.35

# Each model's bars sit half a bar-width to either side of the tick
for offset, scores, model_name in (
    (-width/2, rf_scores, 'Random Forest'),
    (width/2, svm_scores, 'SVM'),
):
    ax1.bar(x + offset, scores, width, label=model_name, alpha=0.8)
ax1.set(xlabel='Metrics', ylabel='Score', title='Model Performance Comparison')
ax1.set_xticks(x)
ax1.set_xticklabels(metrics)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Confusion matrix for the SVM predictions on the test subset
svm_cm = confusion_matrix(y_test_subset, svm_predictions)
digit_ticks = range(10)
sns.heatmap(svm_cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=digit_ticks, yticklabels=digit_ticks, ax=ax2)
ax2.set(title='Confusion Matrix - SVM', xlabel='Predicted Label', ylabel='True Label')

plt.tight_layout()
plt.show()



In [None]:
# 9. FINAL SUMMARY
# Overall winner is decided on test-subset accuracy alone
if rf_accuracy > svm_accuracy:
    overall_winner = 'Random Forest'
elif svm_accuracy > rf_accuracy:
    overall_winner = 'SVM'
else:
    overall_winner = 'Tie'

summary_rule = "=" * 60
summary_lines = [
    "\n\n9. FINAL SUMMARY",
    summary_rule,
    "PROJECT COMPLETED SUCCESSFULLY!",
    f"✓ Dataset: MNIST (Training: {TRAIN_SIZE}, Testing: {TEST_SIZE})",
    "✓ Preprocessing: Normalization and flattening completed",
    f"✓ Random Forest: Best accuracy = {rf_accuracy:.4f}",
    f"✓ SVM: Best accuracy = {svm_accuracy:.4f}",
    f"✓ Winner: {overall_winner}",
    "✓ Visualizations: Confusion matrices, feature importance, and comparisons",
    "✓ Prediction function: Ready for new image classification",
    summary_rule,
]
print("\n".join(summary_lines))