# Assignment 2 Project A: Colon Cancer Image Classification

## Setup & Imports

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Relative locations of the two label CSVs and the folder of image patches
main, extra, img_dir = (
    '../Image_classification_data/data_labels_mainData.csv',
    '../Image_classification_data/data_labels_extraData.csv',
    '../Image_classification_data/patch_images',
)


## 1. Load & Inspect Labels
### 1.1 Data Exploration and Understanding

#### Class Imbalance Identification

In [None]:
# Load both label tables; only the main table is plotted in this cell
df_main = pd.read_csv(main)
df_extra = pd.read_csv(extra)

# --- Binary target: number of images per isCancerous value ---
# After sort_index() the labels are relabelled by position, lowest value first.
# NOTE(review): assumes the lower label means non-cancerous (e.g. 0/1) - confirm.
class_names = ['Non-Cancerous', 'Cancerous']
counts = df_main['isCancerous'].value_counts().sort_index()
counts.index = class_names

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(counts.index, counts.values)
ax.set_title('Distribution of isCancerous Classes')
ax.set_xlabel('Class')
ax.set_ylabel('Number of Images')
fig.tight_layout()
plt.show()

# --- Multiclass target: number of images per cell type, names sorted ---
counts = df_main['cellTypeName'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(counts.index, counts.values)
ax.set_title('Distribution of Cell Types')
ax.set_xlabel('Cell Type')
ax.set_ylabel('Number of Images')
fig.tight_layout()
plt.show()

- The bar charts above show the class distributions for `isCancerous` and `cellTypeName`. The `isCancerous` classes are clearly imbalanced: there are ~50% more non-cancerous images than cancerous ones.
- For the cell types, there is a clear disparity between the **epithelial** cells and the rest: epithelial has roughly twice the count of **fibroblast** and **others**, and is ~60% greater than **inflammatory**.
- This imbalance can make accuracy misleading, because a model can score well simply by favouring the majority classes (non-cancerous for `isCancerous`, epithelial for the cell types). For classes with fewer examples, models tend to have low recall, as there may not be enough data to learn the features that tell them apart, leading to worse generalisation.



#### Exploratory Data Analysis (EDA)

In [None]:
# Preview the first five rows of the main label table
df_main.head(n=5)

In [None]:
# Print a summary of the main label table (columns, dtypes, non-null counts)
df_main.info()

In [None]:
# Count missing values per column. isna() on its own returns an element-wise
# boolean table the same shape as df_main, which does not summarise whether
# anything is actually missing; summing the mask gives one count per column.
df_main.isna().sum()

In [None]:
# --- Patches-per-patient histogram ---
# value_counts() on patientID gives the number of rows (patches) per patient;
# the histogram then shows how those per-patient totals are distributed.
counts_patient = df_main['patientID'].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(counts_patient, bins=20, edgecolor='black')
ax.set_title('Number of Patches per Patient')
ax.set_xlabel('Number of Patches')
ax.set_ylabel('Number of Patients')
fig.tight_layout()
plt.show()

In [None]:
# Contingency table: rows are cell types, columns are isCancerous values
crosstab = pd.crosstab(df_main['cellTypeName'], df_main['isCancerous'])
# Columns are relabelled by position.
# NOTE(review): assumes exactly two isCancerous values, non-cancerous first - confirm.
crosstab.columns = ['Non-Cancerous', 'Cancerous']

# One stacked bar per cell type, split by cancer status
ax = crosstab.plot(kind='bar', stacked=True, figsize=(8, 5))
ax.set(
    xlabel='Cell Type',
    ylabel='Number of Images',
    title='Cell Type vs. Cancer Status',
)
# Tilt the cell-type names so they stay readable
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title='Cancer Status')
plt.tight_layout()
plt.show()

#### Justification of Data Handling Methods

### 1.2 Evaluation Framework 
#### Performance Metrics Selection

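- Macro F1-Score as the primary metric, because it averages the per-class F1 scores with equal weight, so the minority classes identified above count as much as the majority classes.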

In [None]:
# TODO: Visualize class imbalance for 'isCancerous'
# TODO: Visualize multiclass distribution for 'cellTypeName'
# TODO: Display sample images from each class


## 4. Evaluation Framework

In [None]:
# TODO: Split data by patient into train/val/test
# Ensure no leakage across splits
# TODO: Define evaluation metrics (e.g., macro-F1, precision, recall)


## 5. Model Baselines

### 5.1 ANN Baseline

In [None]:
# TODO: Define and compile a fully-connected ANN model


### 5.2 CNN Baseline

In [None]:
# TODO: Define and compile a CNN model


### 5.3 Decision Tree Baseline

In [None]:
# TODO: Define and train a DecisionTreeClassifier on flattened images


## 6. Optimization & Hyperparameter Tuning

In [None]:
# TODO: Plot learning curves to identify overfitting/underfitting
# TODO: Apply regularization techniques (dropout, weight decay)
# TODO: Use validation set for hyperparameter tuning


## 7. Final Performance & Robustness

In [None]:
# TODO: Evaluate chosen model on test set
# TODO: Assess robustness across patient subsets


## 8. Independent Evaluation

In [None]:
# TODO: Compare model performance to literature (e.g., Sirinukunwattana et al. 2016)


## 9. Conclusion & Ultimate Judgment

## Appendix
Detailed hyperparameters, extended results, and additional literature notes.