# Libraries

In [2]:
from PIL import Image, ImageFilter
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import matplotlib.pyplot as plt
from pathlib import Path
import random
import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xg
import cv2
import seaborn as sns
import matplotlib.pyplot as plt

#Pre-Processing

## Ingest Data

In [None]:
base_path = "data/100"

# Get tumour file paths and shuffle
tumour_files = []
tumour_dirs = [
    "Invasive_Tumor",
    "Prolif_Invasive_Tumor",
    "T_Cell_and_Tumor_Hybrid"
]

for dir_name in tumour_dirs:
    dir_path = os.path.join(base_path, dir_name)
    if os.path.isdir(dir_path):
        files = [os.path.join(dir_path, f) for f in os.listdir(dir_path)]
        tumour_files.extend(files)

random.shuffle(tumour_files)

# Get immune file paths and shuffle
immune_files = []
immune_dirs = [
    "CD4+_T_Cells", "CD4+_T_Cells",
    "CD8+_T_Cells",
    "B_Cells",
    "Mast_Cells",
    "Macrophages_1",
    "Macrophages_2",
    "LAMP3+_DCs",
    "IRF7+_DCs"
]

for dir_name in immune_dirs:
    dir_path = os.path.join(base_path, dir_name)
    if os.path.isdir(dir_path):
        files = [os.path.join(dir_path, f) for f in os.listdir(dir_path)]
        immune_files.extend(files)

random.shuffle(immune_files)


# Get stromal file paths and shuffle
stromal_files = []
stromal_dirs = [
    "Stromal",
    "Stromal_and_T_Cell_Hybrid",
    "Perivascular-Like"
]

for dir_name in stromal_dirs:
    dir_path = os.path.join(base_path, dir_name)
    if os.path.isdir(dir_path):
        files = [os.path.join(dir_path, f) for f in os.listdir(dir_path)]
        stromal_files.extend(files)

random.shuffle(stromal_files)

# Get other file paths and shuffle
other_files = []
other_dirs = [
    "Endothelial",
    "Myoepi_ACTA2+",
    "Myoepi_KRT15+",
    "DCIS_1",
    "DCIS_2",
    "Unlabeled"
]

for dir_name in stromal_dirs:
    dir_path = os.path.join(base_path, dir_name)
    if os.path.isdir(dir_path):
        files = [os.path.join(dir_path, f) for f in os.listdir(dir_path)]
        other_files.extend(files)

random.shuffle(other_files)

## Resizing

In [None]:
def load_resize(img_path, size=(224,224)):
    img = Image.open(img_path).convert('RGB')
    img = img.resize(size)
    return np.array(img)

tumour_imgs = [load_resize(f) for f in tumour_files]
print("tumour loaded")

immune_imgs = [load_resize(f) for f in immune_files]
print("immune loaded")

stromal_imgs = [load_resize(f) for f in stromal_files]
print("stromal loaded")

other_imgs = [load_resize(f) for f in other_files]
print("other loaded")

## Test Train Split

In [None]:
# Train using 80% of data from each group
'''
tumour_train_ind = int(0.8 * len(tumour_imgs))
tumour_test_ind = int(0.2 * len(tumour_imgs))

immune_train_ind = int(0.8 * len(immune_imgs))
immune_test_ind = int(0.2 * len(immune_imgs))

stromal_train_ind = int(0.8 * len(stromal_imgs))
stromal_test_ind = int(0.2 * len(stromal_imgs))

other_train_ind = int(0.8 * len(other_imgs))
other_test_ind = int(0.2 * len(other_imgs))
'''

tumour_train_ind = 1000
tumour_test_ind = 1200

immune_train_ind = 1000
immune_test_ind = 1200

stromal_train_ind = 1000
stromal_test_ind = 1200

other_train_ind = 1000
other_test_ind = 1200

imgs_train = immune_imgs[:immune_train_ind] + tumour_imgs[:tumour_train_ind] + stromal_imgs[:stromal_train_ind] + other_imgs[:other_train_ind]
imgs_test = immune_imgs[immune_train_ind:immune_test_ind] + tumour_imgs[tumour_train_ind:tumour_test_ind] + stromal_imgs[stromal_train_ind:stromal_test_ind] + other_imgs[other_train_ind:other_test_ind]

Xmat_train = np.stack(imgs_train, axis=0)
Xmat_test = np.stack(imgs_test, axis=0)

y_train = ['Immune'] * immune_train_ind + ['Tumour'] * tumour_train_ind + ['Stromal'] * stromal_train_ind + ['Other'] * other_train_ind
y_test = ['Immune'] * immune_test_ind + ['Tumour'] * tumour_test_ind + ['Stromal'] * stromal_test_ind + ['Other'] * other_test_ind

## Transformations

In [None]:
def apply_blur(images):
    return np.array([cv2.GaussianBlur(img, (5, 5), 0) for img in images])

def apply_stretch(images):
    stretched = []
    for img in images:
        h, w, c = img.shape
        new_w = int(w * 1.2)  # stretch width by 20%
        img_stretched = cv2.resize(img, (new_w, h))
        img_cropped = img_stretched[:, :w, :]  # crop back to original width
        stretched.append(img_cropped)
    return np.array(stretched)

def apply_greyscale(images):
    greyscale = []
    for img in images:
        grey = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        grey_3ch = cv2.cvtColor(grey, cv2.COLOR_GRAY2RGB)
        greyscale.append(grey_3ch)
    return np.array(greyscale)

def apply_rotation(images):
    rotated = []
    for img in images:
        (h, w) = img.shape[:2]
        center = (w // 2, h // 2)
        matrix = cv2.getRotationMatrix2D(center, 15, 1.0)  # rotate 15 degrees
        rotated_img = cv2.warpAffine(img, matrix, (w, h), borderMode=cv2.BORDER_REFLECT)
        rotated.append(rotated_img)
    return np.array(rotated)

Xmat_train_original = Xmat_train
Xmat_train_blur = apply_blur(Xmat_train)
Xmat_train_stretch = apply_stretch(Xmat_train)
Xmat_train_greyscale = apply_greyscale(Xmat_train)
Xmat_train_rotate = apply_rotation(Xmat_train)

In [None]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Flatten images for XGBoost (224x224x3 -> 150528 features)
def flatten_images(images):
  return images.reshape(images.shape[0], -1)
X_train_flat = flatten_images(Xmat_train_original)
X_test_flat = flatten_images(Xmat_test)

# For augmented data
X_train_blur_flat = flatten_images(Xmat_train_blur)
X_train_stretch_flat = flatten_images(Xmat_train_stretch)
X_train_greyscale_flat = flatten_images(Xmat_train_greyscale)
X_train_rotate_flat = flatten_images(Xmat_train_rotate)

#XGBoost Model

## Standard

In [5]:
#Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
    )

# Train model
xgb_model.fit(X_train_flat, y_train_enc)

# Evaluate
y_pred = xgb_model.predict(X_test_flat)

accuracy = accuracy_score(y_test_enc, y_pred)
f1 = f1_score(y_test_enc, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test_enc, y_pred)

print(f"Original Accuracy: {accuracy:.2%}")
print(f"Original F1: {f1:.2%}")

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'], yticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

NameError: name 'xgb' is not defined

## Blur

In [6]:
# Train with blur augmentation
xgb_model_blur = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
    )

xgb_model_blur.fit(X_train_blur_flat, y_train_enc)

# Evaluate
y_pred_blur = xgb_model_blur.predict(X_test_flat)

accuracy_blur = accuracy_score(y_test_enc, y_pred_blur)
f1_blur = f1_score(y_test_enc, y_pred_blur, average='weighted')
conf_matrix_blur = conf_matrix(y_test_enc, y_pred_blur)

print(f"Blur Accuracy: {accuracy_blur:.2%}")
print(f"Blur F1: {f1_blur:.2%}")

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_blur, annot=True, fmt='g', cmap='Blues', xticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'], yticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

NameError: name 'xgb' is not defined

## Stretch

In [3]:
# Train with stretch augmentation
xgb_model_stretch = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
    )

xgb_model_stretch.fit(X_train_stretch_flat, y_train_enc)

# Evaluate
y_pred_stretch = xgb_model_stretch.predict(X_test_flat)

accuracy_stretch = accuracy_score(y_test_enc, y_pred_stretch)
f1_stretch = f1_score(y_test_enc, y_pred_stretch, average='weighted')
conf_matrix_stretch = conf_matrix(y_test_enc, y_pred_stretch)

print(f"Stretch Accuracy: {accuracy_stretch:.2%}")
print(f"Stretch F1: {f1_stretch:.2%}")

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_stretch, annot=True, fmt='g', cmap='Blues', xticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'], yticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

'print(f"Stretch Accuracy: {accuracy_stretch:.2%}")'

## Greyscale

In [4]:
# Train with greyscale augmentation
xgb_model_greyscale = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
    )

xgb_model_greyscale.fit(X_train_greyscale_flat, y_train_enc)\n

# Evaluate
y_pred_greyscale = xgb_model_greyscale.predict(X_test_flat)

accuracy_greyscale = accuracy_score(y_test_enc, y_pred_greyscale)
f1_greyscale = f1_score(y_test_enc, y_pred_greyscale, average='weighted')
conf_matrix_greyscale = conf_matrix(y_test_enc, y_pred_greyscale)

print(f"Greyscale Accuracy: {accuracy_greyscale:.2%}")
print(f"Greyscale F1: {f1_greyscale:.2%}")

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_greyscale, annot=True, fmt='g', cmap='Blues', xticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'], yticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

'print(f"Greyscale Accuracy: {accuracy_greyscale:.2%}")'

## Rotation

In [None]:
# Train with rotation augmentation
xgb_model_rotate = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
    )

xgb_model_rotate.fit(X_train_rotate_flat, y_train_enc)

# Evaluate
y_pred_rotate = xgb_model_rotate.predict(X_test_flat)

accuracy_rotate = accuracy_score(y_test_enc, y_pred_rotate)
f1_rotate = f1_score(y_test_enc, y_pred_rotate, average='weighted')
conf_matrix_rotation = conf_matrix(y_test_enc, y_pred_rotate)

print(f"Rotation Accuracy: {accuracy_rotate:.2%}")
print(f"Rotation F1: {f1_rotate:.2%}")

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_rotation, annot=True, fmt='g', cmap='Blues', xticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'], yticklabels=['Class 0', 'Class 1', 'Class 2', 'Class 3'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()