# RF PCA Model

## Preprocessing

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import json
import data_preprocessing
import pandas as pd
from PIL import Image, ImageEnhance
from skimage.feature import hog
from sklearn.decomposition import PCA
from IPython.display import display
from sklearn.metrics import classification_report
import joblib

import warnings
from sklearn.exceptions import UndefinedMetricWarning
# Silence sklearn's UndefinedMetricWarning only; every other warning
# category is still reported.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Integer class encoding: 0 Immune, 1 Other, 2 Stromal, 3 Tumour
(
    Xmat_train,
    Xmat_val,
    Xmat_tests_0,
    Xmat_tests_1,
    Xmat_tests_2,
    y_train_enc,
    y_val_enc,
    y_tests_enc_0,
    y_tests_enc_1,
    y_tests_enc_2,
) = data_preprocessing.load_split_images()

# The loader returns three test splits; only the first one is used here
# (the Shiny app needs a single test set).
Xmat_test, y_test_enc = Xmat_tests_0, y_tests_enc_0

def store_results(results, y_true, y_pred, probs=None, blur=0, noise=0, hog=False, pca=False, **extra_metrics):
    """Compute metrics for one evaluation run and append them to ``results``.

    Parameters
    ----------
    results : list
        Accumulator; one dict per run is appended in place.
    y_true, y_pred : array-like of int
        True and predicted class indices (0 immune, 1 other, 2 stromal,
        3 tumour).
    probs : array-like of shape (n_samples, n_classes), optional
        Predicted class probabilities. When given, the per-sample maximum is
        used as the prediction confidence and summarised overall and per
        predicted class.
    blur, noise :
        Perturbation settings of the run, stored as ``blur_size`` and
        ``noise_level``.
    hog, pca :
        Accepted for backward compatibility; currently not written to the
        entry.
    **extra_metrics :
        Extra key/value pairs merged into the entry last, so they override
        any computed key with the same name.

    Returns
    -------
    None
        The entry is appended to ``results``.
    """
    class_ids = [0, 1, 2, 3]
    label_map = {
        0: "immune",
        1: "other",
        2: "stromal",
        3: "tumour",
    }

    # Arrays are required for the element-wise ``y_pred == cls`` masks below;
    # with a plain list that comparison collapses to a single ``False``.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Macro averages keep sklearn's default label set (labels present in the
    # data). zero_division=0 yields the same values as the default while not
    # depending on a global warning filter.
    entry = {
        "blur_size": blur,
        "noise_level": noise,
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='macro', zero_division=0),
        "precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='macro', zero_division=0),
    }

    # Pin the label set so position i always refers to class i, even when a
    # class is missing from both y_true and y_pred. Without ``labels`` the
    # returned array can be shorter than four entries.
    per_class = {
        "f1": f1_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0),
        "precision": precision_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0),
        "recall": recall_score(y_true, y_pred, labels=class_ids, average=None, zero_division=0),
    }
    for cls in class_ids:
        label = label_map[cls]
        for metric_name, values in per_class.items():
            entry[f"{metric_name}_{label}"] = values[cls]

    if probs is not None:
        # Confidence of a prediction = highest class probability of the sample.
        confidences = np.max(np.asarray(probs), axis=1)
        entry["confidence_overall"] = np.mean(confidences)
        for cls in class_ids:
            label = label_map[cls]
            cls_conf = confidences[y_pred == cls]
            # NaN (not 0) when the class was never predicted: no data to average.
            entry[f"confidence_{label}_avg"] = np.mean(cls_conf) if len(cls_conf) > 0 else np.nan
            entry[f"confidence_{label}_std"] = np.std(cls_conf) if len(cls_conf) > 0 else np.nan

    for cls in class_ids:
        entry[f"count_pred_{label_map[cls]}"] = int(np.count_nonzero(y_pred == cls))

    # Stored as a string so the 4x4 matrix fits in a single CSV cell.
    entry["confusion_matrix"] = str(confusion_matrix(y_true, y_pred, labels=class_ids).tolist())

    entry.update(extra_metrics)
    results.append(entry)

def extract_hog_features(images):
    """Return one HOG descriptor per image, stacked into a single array.

    Images whose last axis has length 3 are first converted to 8-bit
    greyscale via PIL; anything else is handed to ``hog`` unchanged.
    NOTE(review): the ``* 255`` scaling presumes pixel values in [0, 1] —
    confirm against the data loader.
    """
    def _as_gray(img):
        if img.shape[-1] != 3:
            return img
        eight_bit = (img * 255).astype(np.uint8)
        return np.array(Image.fromarray(eight_bit).convert("L"))

    descriptors = [
        hog(_as_gray(img), pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        for img in images
    ]
    return np.array(descriptors)

# TODO: switch to the Shiny app's version of this function (same logic, but cleaner), and do the same for the HOG extraction if possible.
def apply_noise(images, mean=0, std=10, seed=3888):
    """Add seeded Gaussian noise to a batch of images.

    Parameters
    ----------
    images : numpy.ndarray
        Image batch; any shape is accepted, noise is drawn element-wise.
    mean, std : float
        Mean and standard deviation of the Gaussian noise.
    seed : int
        Seed of the private generator, so the same call always yields the
        same noise.

    Returns
    -------
    numpy.ndarray
        ``images`` itself (unchanged, original dtype) when ``std`` is not
        positive; otherwise a new ``uint8`` array clipped to [0, 255].
        NOTE(review): the clipping range presumes pixel values on a 0-255
        scale — confirm against the data loader.
    """
    if not std > 0:
        return images

    # A private RandomState yields exactly the same draws as
    # ``np.random.seed(seed)`` followed by ``np.random.normal(...)``, but does
    # not reseed the process-wide generator behind the caller's back.
    rng = np.random.RandomState(seed)
    noise = rng.normal(mean, std, images.shape)
    return np.clip(images + noise, 0, 255).astype(np.uint8)

# Flatten every image into one feature row: (n_samples, everything else).
X_train_flat = Xmat_train.reshape(Xmat_train.shape[0], -1)
X_val_flat   = Xmat_val.reshape(Xmat_val.shape[0], -1)
X_test_flat  = Xmat_test.reshape(Xmat_test.shape[0], -1)

# Fit the projection on the training split only; other splits are later
# passed through ``pca.transform``. The fixed random_state keeps the
# components reproducible if sklearn picks a randomized SVD solver, and
# matches the 3888 seed used for the forest.
pca = PCA(n_components=100, random_state=3888)
X_train_pca = pca.fit_transform(X_train_flat)

[('Immune', 0), ('Other', 1), ('Stromal', 2), ('Tumour', 3)]


## RF PCA Model Training

In [22]:
# Start from an empty accumulator: the cell then runs on a fresh kernel and
# re-running it cannot append duplicate rows to the CSV.
results = []

model_pca = RandomForestClassifier(n_estimators=100, random_state=3888)
model_pca.fit(X_train_pca, y_train_enc)

BLUR_RADII = [1, 3, 5, 7, 9, 19]
NOISE_STDS = [1, 3, 5, 10, 20, 30]


def _evaluate(images, **conditions):
    """Flatten ``images``, project with the fitted PCA, predict and record metrics.

    ``conditions`` (e.g. ``blur=3``, ``noise=10``) are forwarded to
    ``store_results`` to label the row.
    """
    features = pca.transform(images.reshape(images.shape[0], -1))
    predictions = model_pca.predict(features)
    probabilities = model_pca.predict_proba(features)
    store_results(results, y_test_enc, predictions, probs=probabilities, **conditions)


# Baseline: unperturbed test images.
_evaluate(Xmat_test)

# Blur only.
for radius in BLUR_RADII:
    _evaluate(data_preprocessing.apply_blur(Xmat_test, radius), blur=radius)

# Noise only.
for noise_std in NOISE_STDS:
    seed = 3888 + noise_std  # same seed logic as baseline
    _evaluate(apply_noise(Xmat_test, std=noise_std, seed=seed), noise=noise_std)

# Blur followed by noise.
for radius in BLUR_RADII:
    # The blur does not depend on the noise level: compute it once per radius.
    X_blur = data_preprocessing.apply_blur(Xmat_test, radius)
    for noise_std in NOISE_STDS:
        combo_seed = 10000 + (radius * 100) + noise_std
        X_combo = apply_noise(X_blur, std=noise_std, seed=combo_seed)
        _evaluate(X_combo, blur=radius, noise=noise_std)

df = pd.DataFrame(results)
df.to_csv("rf_augmented_metrics.csv", index=False)

## Saving PCA and RF PCA Model for Shiny

In [35]:
# Refit the forest on the PCA-projected training data so this cell can be run
# without the evaluation cell above, then persist both artefacts for the
# Shiny app.
model_pca = RandomForestClassifier(n_estimators=100, random_state=3888).fit(X_train_pca, y_train_enc)

for artefact, filename in ((pca, "Base_pca.joblib"), (model_pca, "rf_pca_model.joblib")):
    joblib.dump(artefact, filename)

['rf_pca_model.joblib']