## RF Model

In [3]:
%load_ext autoreload
%autoreload 2

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import json

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'evaluation')))
import data_preprocessing

#import importlib
#importlib.reload(data_preprocessing)
#print("[DEBUG] Loaded from:", data_preprocessing.__file__)

import pandas as pd
from PIL import Image, ImageEnhance
from skimage.feature import hog
from sklearn.decomposition import PCA
from IPython.display import display
from sklearn.metrics import classification_report

# suppress warnings for results
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Unpack the ten arrays returned by load_split_images(): image matrices
# (Xmat_*) and encoded labels (y_*_enc) for the train split, the validation
# split and the three test splits (suffixes _0, _1, _2) used further down.
(
    Xmat_train,
    Xmat_val,
    Xmat_tests_0,
    Xmat_tests_1,
    Xmat_tests_2,
    y_train_enc,
    y_val_enc,
    y_tests_enc_0,
    y_tests_enc_1,
    y_tests_enc_2,
) = data_preprocessing.load_split_images()

Total image counts:
  Tumour: 38763
  Immune: 42598
  Stromal: 42878
  Other: 43519
After label_and_split:
  Tumour split: 2500 500 3000
  Immune split: 2500 500 3000
  Stromal split: 2500 500 3000
  Other split: 2500 500 3000
[('Immune', 0), ('Other', 1), ('Stromal', 2), ('Tumour', 3)]


In [4]:
def store_results(results, y_true, y_pred, probs=None, blur=0, noise=0, hog=False, pca=False, **extra_metrics):
    """Compute metrics for one evaluation run and append them to ``results``.

    Args:
        results: List that receives one dict per call (mutated in place).
        y_true: True class ids (0..3).
        y_pred: Predicted class ids (0..3); any array-like is accepted.
        probs: Optional 2-D array of per-class probabilities, one row per
            sample. When given, confidence statistics (row-wise maximum) are
            recorded overall and per predicted class.
        blur: Value recorded under ``"blur_size"``.
        noise: Value recorded under ``"noise_level"``.
        hog: Flag recorded under ``"HOG"``. Note that inside this function the
            name shadows the imported ``hog`` feature extractor; it is only
            used as a flag here.
        pca: Flag recorded under ``"PCA"``.
        **extra_metrics: Extra key/value pairs merged into the entry last, so
            they override computed keys of the same name.

    Returns:
        None. The new entry is appended to ``results``.
    """
    label_map = {
        0: "immune",
        1: "other",
        2: "stromal",
        3: "tumour"
    }
    class_ids = list(label_map)

    # Boolean masks such as ``y_pred == cls`` only work element-wise on arrays.
    y_pred = np.asarray(y_pred)

    entry = {
        "blur_size": blur,
        "noise_level": noise,
        "HOG": hog,
        "PCA": pca,
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='macro', zero_division=0),
        "precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='macro', zero_division=0),
    }

    # Passing ``labels`` pins the per-class arrays to exactly four entries in
    # class-id order. Without it the arrays only cover classes present in the
    # data, so indexing by class id could raise or attribute a score to the
    # wrong class whenever a class is absent from both y_true and y_pred.
    f1_classes = f1_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0)
    precision_classes = precision_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0)
    recall_classes = recall_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0)

    # The loops below are kept separate on purpose: dict insertion order
    # determines the column order of the DataFrame built from ``results``.
    for position, cls in enumerate(class_ids):
        label = label_map[cls]
        entry[f"f1_{label}"] = f1_classes[position]
        entry[f"precision_{label}"] = precision_classes[position]
        entry[f"recall_{label}"] = recall_classes[position]

    if probs is not None:
        # Confidence of a prediction = highest class probability in its row.
        confidences = np.max(probs, axis=1)
        entry["confidence_overall"] = np.mean(confidences)
        for cls in class_ids:
            label = label_map[cls]
            cls_conf = confidences[y_pred == cls]
            # NaN (not 0) marks "class never predicted" so it is not mistaken
            # for a genuinely low confidence.
            entry[f"confidence_{label}_avg"] = np.mean(cls_conf) if len(cls_conf) > 0 else np.nan
            entry[f"confidence_{label}_std"] = np.std(cls_conf) if len(cls_conf) > 0 else np.nan

    for cls in class_ids:
        label = label_map[cls]
        entry[f"count_pred_{label}"] = np.sum(y_pred == cls)

    # The confusion matrix is stored as the string form of a nested list so it
    # fits into a single tabular cell.
    entry["confusion_matrix"] = str(confusion_matrix(y_true, y_pred, labels=class_ids).tolist())

    entry.update(extra_metrics)
    results.append(entry)

def extract_hog_features(images):
    """Return one HOG descriptor per image, stacked into a NumPy array.

    Images whose last axis has length 3 are first converted to single-channel
    greyscale via PIL; all other images are passed to ``hog`` unchanged.

    NOTE(review): the ``* 255`` scaling before the uint8 cast presumably
    expects float pixel values in [0, 1] -- confirm against the loader.

    Args:
        images: Iterable of image arrays.

    Returns:
        ``np.ndarray`` built from the list of per-image feature vectors.
    """
    def _as_greyscale(picture):
        # Anything that is not 3-channel is used as-is.
        if picture.shape[-1] != 3:
            return picture
        as_bytes = (picture * 255).astype(np.uint8)
        return np.array(Image.fromarray(as_bytes).convert("L"))

    descriptors = [
        hog(
            _as_greyscale(picture),
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            feature_vector=True,
        )
        for picture in images
    ]
    return np.array(descriptors)

# preprocessing
# Single seed shared by every stochastic estimator fitted in this cell.
RANDOM_STATE = 3888

# Flatten each image into one feature row (samples x everything-else).
X_train_flat = Xmat_train.reshape(Xmat_train.shape[0], -1)
X_val_flat = Xmat_val.reshape(Xmat_val.shape[0], -1)

# PCA is seeded as well: scikit-learn may choose a randomized SVD solver for
# inputs of this size, in which case an unseeded PCA would give a slightly
# different projection on every run and the forest trained on it (and the
# saved PCA artefact) would not be reproducible.
pca = PCA(n_components=100, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_flat)
X_train_hog = extract_hog_features(Xmat_train)

# fitting models
# One random forest per feature representation: raw pixels, PCA components,
# and HOG descriptors. All share the same hyper-parameters and seed.
model_base = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_base.fit(X_train_flat, y_train_enc)

model_pca = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_pca.fit(X_train_pca, y_train_enc)

model_hog = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_hog.fit(X_train_hog, y_train_enc)

# helper functions, model map
def evaluate_and_store(model, X, y_true, model_label, test_set, probs=True, **kwargs):
    """Predict with ``model`` on ``X`` and record the metrics in the global ``results``.

    Args:
        model: Fitted estimator exposing ``predict`` (and ``predict_proba``
            when ``probs`` is truthy).
        X: Feature matrix already transformed for this model.
        y_true: True labels for ``X``.
        model_label: Stored in the result entry under ``"model_label"``.
        test_set: Stored in the result entry under ``"test_set"``.
        probs: When truthy, class probabilities are computed and forwarded so
            confidence statistics get recorded; otherwise ``None`` is passed.
        **kwargs: Forwarded unchanged to ``store_results``.
    """
    predictions = model.predict(X)
    if probs:
        probabilities = model.predict_proba(X)
    else:
        probabilities = None
    store_results(
        results,
        y_true,
        predictions,
        probs=probabilities,
        model_label=model_label,
        test_set=test_set,
        **kwargs,
    )

def _flatten_images(X):
    """Reshape an image batch to 2-D: one row per image."""
    return X.reshape(X.shape[0], -1)


def _flatten_then_project(X):
    """Flatten an image batch and project it with the fitted global ``pca``."""
    return pca.transform(_flatten_images(X))


# Registry: model name -> (fitted model, images-to-features transform,
# flag kwargs forwarded to store_results).
models = dict(
    base=(model_base, _flatten_images, {}),
    pca=(model_pca, _flatten_then_project, {"pca": True}),
    hog=(model_hog, extract_hog_features, {"hog": True}),
)

# Accumulator filled by evaluate_and_store(); one dict per evaluation.
results = []

def _evaluate_all_models(X, y_true, test_set_name, blur, noise):
    """Evaluate every model in ``models`` on one image batch and store the results."""
    for name, (model, transform, flags) in models.items():
        print(f"Model: {name}")
        evaluate_and_store(
            model,
            transform(X),
            y_true,
            model_label=name.upper(),
            test_set=test_set_name,
            blur=blur,
            noise=noise,
            **flags
        )


def run_all_variants(X_base, y_base, test_set_name):
    """Evaluate all models on a test set under blur, noise, and blur+noise.

    Four passes are run in this order: unmodified input, blur only, noise
    only, and every blur/noise combination. Each pass evaluates every entry
    of the global ``models`` registry and appends to the global ``results``.

    NOTE(review): the passes overlap. Level 0 is part of both level lists, so
    blur=0/noise=0 is recorded four times, and each blur-only / noise-only
    setting is recorded again by the combination pass (with a different noise
    seed). This is preserved as-is so the stored metrics keep their current
    shape; de-duplicate downstream or drop the redundant passes once confirmed.

    Args:
        X_base: Unmodified test images.
        y_base: True labels for ``X_base``.
        test_set_name: Identifier stored with every result row.
    """
    blur_levels = [0, 1, 3, 5, 7, 9, 19]
    noise_levels = [0, 1, 3, 5, 10, 20, 30]

    # Unmodified input (baseline)
    print("Evaluating baseline (no blur, no noise)...")
    _evaluate_all_models(X_base, y_base, test_set_name, blur=0, noise=0)

    # Blur only
    for blur in blur_levels:
        print(f"Evaluating blur={blur}, noise=0...")
        X_blur = data_preprocessing.apply_blur(X_base, blur)
        _evaluate_all_models(X_blur, y_base, test_set_name, blur=blur, noise=0)

    # Noise only
    for noise in noise_levels:
        print(f"Evaluating blur=0, noise={noise}...")
        # Re-seed per level so each noise realisation is reproducible.
        np.random.seed(3888 + noise)
        X_noisy = data_preprocessing.apply_noise(X_base, std=noise)
        _evaluate_all_models(X_noisy, y_base, test_set_name, blur=0, noise=noise)

    # Blur + noise
    for blur in blur_levels:
        for noise in noise_levels:
            print(f"Evaluating blur={blur}, noise={noise}...")
            # Distinct seed per (blur, noise) pair.
            np.random.seed(10000 + blur * 100 + noise)
            # The blur is recomputed per pair (not hoisted) because it is not
            # visible here whether apply_noise modifies its input in place.
            X_blur = data_preprocessing.apply_blur(X_base, blur)
            X_combo = data_preprocessing.apply_noise(X_blur, std=noise)
            _evaluate_all_models(X_combo, y_base, test_set_name, blur=blur, noise=noise)

# (test-set id, images, labels) triples; the id is the position in the list.
test_sets = [
    (set_id, images, labels)
    for set_id, (images, labels) in enumerate([
        (Xmat_tests_0, y_tests_enc_0),
        (Xmat_tests_1, y_tests_enc_1),
        (Xmat_tests_2, y_tests_enc_2),
    ])
]

# Run the full evaluation grid on each test set in turn.
for name, X, y in test_sets:
    print(f"\nRunning evaluations on test set: {name}")
    run_all_variants(X, y, test_set_name=name)


Running evaluations on test set: 0
Evaluating baseline (no blur, no noise)...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=1, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=3, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=5, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=7, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=9, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=19, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=0...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=1...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=3...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=5...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=10...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=20...
Model: base
Model: pca
Model: hog
Evaluating blur=0, noise=30...
Model: base
Model: pc

NameError: name '__file__' is not defined

In [6]:
# Paths are resolved against the kernel's working directory, so the notebook
# is expected to be run from its own folder.
BASE_DIR = os.getcwd()
METRICS_PATH = os.path.join(BASE_DIR, "..", "metrics", "rf_augmented_metrics.csv")

# to_csv does not create missing parent directories; make sure the sibling
# metrics/ folder exists so the export does not fail after a long evaluation.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)

# One row per evaluation recorded in `results`.
res = pd.DataFrame(results)
res.to_csv(METRICS_PATH, index=False)

## Saving PCA and RF PCA Model for Shiny

In [9]:
import joblib

# `model_pca` is the random forest already fitted on X_train_pca in the
# model-fitting cell (same hyper-parameters, seed and data), so it is saved
# directly instead of being refitted here.

# Kernel working directory; assumed to be the notebook's own models/ folder
# (a notebook has no __file__, so the cwd is used instead).
script_dir = os.getcwd()

# Define save paths in models/ (the working directory) and the sibling app/ directory
model_filename = "rf_pca_model.joblib"
pca_filename = "Base_pca.joblib"

# save in current models/ directory
models_path = os.path.join(script_dir, model_filename)
pca_path = os.path.join(script_dir, pca_filename)

# save in sibling app/ directory
app_dir = os.path.abspath(os.path.join(script_dir, "..", "app"))
app_model_path = os.path.join(app_dir, model_filename)
app_pca_path = os.path.join(app_dir, pca_filename)

# joblib.dump does not create missing directories.
os.makedirs(app_dir, exist_ok=True)

# dumping files to both locations; the fitted PCA must ship with the model so
# new inputs can be projected the same way before prediction.
joblib.dump(model_pca, models_path)
joblib.dump(pca, pca_path)
joblib.dump(model_pca, app_model_path)
joblib.dump(pca, app_pca_path)

['/Users/Elise/Documents/GitHub/data3888-14/app/Base_pca.joblib']