In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import math
import zipfile
from PIL import Image

from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, save_img

In [2]:
extracted_tma_dir = '../TMA_images'
augmented_tma_dir = './augmented-tma-2'

# Update the original_tma_dir to point to the directory with extracted TMA images
original_tma_dir = os.path.join(extracted_tma_dir, 'train_images')

In [3]:
combined_tma = pd.read_csv("./augmented-tma/combined_tma.csv")

In [4]:
# Convert 'image_id' column to string type
combined_tma['image_id'] = combined_tma['image_id'].astype(str)

# Separate original images from augmented ones
original_images = combined_tma[~combined_tma['image_id'].str.contains('_')]
augmented_images = combined_tma[combined_tma['image_id'].str.contains('_')]

# Ensure balanced selection of original images for the test set
# Here, we ensure we select at least one image from each class
test_orig = original_images.groupby('label').sample(n=1, random_state=42)

# Add augmented versions of these selected originals to the test set
test_aug = augmented_images[augmented_images['image_id'].str.split('_').str[0].isin(test_orig['image_id'])]
test_set = pd.concat([test_orig, test_aug])

# The training set consists of all images not in the test set
train_set = combined_tma[~combined_tma['image_id'].isin(test_set['image_id'])]

In [None]:
# Create a label to index mapping
label_to_idx = {label: idx for idx, label in enumerate(train_set['label'].unique())}

# Apply this mapping to your train and test sets
train_set['label_idx'] = train_set['label'].map(label_to_idx)
test_set['label_idx'] = test_set['label'].map(label_to_idx)

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class TissueDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['path']
        image = Image.open(img_path)
        label = self.dataframe.iloc[idx]['label_idx']

        if self.transform:
            image = self.transform(image)

        return image, label


# Define transformations
transform = transforms.Compose([
    transforms.Resize((512, 512)), 
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [7]:
# Reverse mapping from index to original label
idx_to_label = {idx: label for label, idx in label_to_idx.items()}

In [8]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import classification_report

def evaluate_fold(model, test_loader, label_to_idx):
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    # Convert indices back to original labels
    predicted_labels = [idx_to_label[idx] for idx in predictions]
    actual_labels = [idx_to_label[idx] for idx in actuals]

    # Calculate metrics
    fold_metrics = {}
    fold_metrics['classification_report'] = classification_report(actual_labels, predicted_labels, zero_division=0)
    fold_metrics['confusion_matrix'] = confusion_matrix(actual_labels, predicted_labels)

    return fold_metrics

In [9]:
def aggregate_fold_results(fold_results):
    # Aggregate results from all folds
    # You can customize this function based on how you want to aggregate
    # For simplicity, this example just collects all reports in a list
    aggregated_results = {
        'classification_reports': [fold['classification_report'] for fold in fold_results],
        'confusion_matrices': [fold['confusion_matrix'] for fold in fold_results]
    }
    return aggregated_results

In [10]:
import timm
import torch.nn as nn
import torch.optim as optim

# Assuming combined_tma is your complete dataset
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for fold results
fold_results = []

# Iterate over each fold
for train_index, test_index in kfold.split(combined_tma):
    # Split data into training and testing sets for this fold
    train_fold, test_fold = combined_tma.iloc[train_index], combined_tma.iloc[test_index]

    # Create datasets
    train_dataset = TissueDataset(train_set, transform=transform)
    test_dataset = TissueDataset(test_set, transform=transform)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Load a pre-trained model
    model = timm.create_model('resnet50', pretrained=True)

    # Modify the classifier
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 5)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = 4

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')


    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    # Convert indices back to original labels
    predicted_labels = [idx_to_label[idx] for idx in predictions]
    actual_labels = [idx_to_label[idx] for idx in actuals]

    print(classification_report(actual_labels, predicted_labels, zero_division=0))

    # Evaluate the model for this fold
    fold_metrics = evaluate_fold(model, test_loader, label_to_idx)
    fold_results.append(fold_metrics)

# Aggregate results from all folds
overall_results = aggregate_fold_results(fold_results)

for i, report in enumerate(overall_results['classification_reports']):
    print(f"Classification Report for Fold {i+1}:\n{report}\n")

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 1/4, Loss: 1.4053477048873901
Epoch 2/4, Loss: 0.5859560966491699
Epoch 3/4, Loss: 0.08491836488246918
Epoch 4/4, Loss: 0.005249147769063711
              precision    recall  f1-score   support

          CC       1.00      1.00      1.00        11
          EC       0.92      1.00      0.96        11
        HGSC       0.55      1.00      0.71        11
        LGSC       0.00      0.00      0.00        11
          MC       0.83      0.91      0.87        11

    accuracy                           0.78        55
   macro avg       0.66      0.78      0.71        55
weighted avg       0.66      0.78      0.71        55

Epoch 1/4, Loss: 1.311478853225708
Epoch 2/4, Loss: 0.6833809614181519
Epoch 3/4, Loss: 0.15726317465305328
Epoch 4/4, Loss: 0.010780763812363148
              precision    recall  f1-score   support

          CC       1.00      1.00      1.00        11
          EC       0.31      0.45      0.37        11
        HGSC       0.41      1.00      0.58        11
 

In [11]:
for i, report in enumerate(overall_results['classification_reports']):
    print(f"Classification Report for Fold {i+1}:\n{report}\n")

Classification Report for Fold 1:
              precision    recall  f1-score   support

          CC       1.00      1.00      1.00        11
          EC       0.92      1.00      0.96        11
        HGSC       0.55      1.00      0.71        11
        LGSC       0.00      0.00      0.00        11
          MC       0.83      0.91      0.87        11

    accuracy                           0.78        55
   macro avg       0.66      0.78      0.71        55
weighted avg       0.66      0.78      0.71        55


Classification Report for Fold 2:
              precision    recall  f1-score   support

          CC       1.00      1.00      1.00        11
          EC       0.31      0.45      0.37        11
        HGSC       0.41      1.00      0.58        11
        LGSC       1.00      0.09      0.17        11
          MC       0.00      0.00      0.00        11

    accuracy                           0.51        55
   macro avg       0.54      0.51      0.42        55
weighted

## Average of cross-validations

In [2]:
# Re-import necessary library after reset
import pandas as pd

# Prepare data for DataFrame
data = {
    "Class": ["CC", "EC", "HGSC", "LGSC", "MC", "Macro Avg", "Weighted Avg"],
    "Precision": [0.85, 0.76, 0.70, 0.20, 0.65, 0.63, 0.63],
    "Recall": [0.93, 0.87, 1.00, 0.02, 0.60, 0.68, 0.68],
    "F1-Score": [0.86, 0.81, 0.80, 0.03, 0.56, 0.61, 0.61],
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the DataFrame as a table
df

Unnamed: 0,Class,Precision,Recall,F1-Score
0,CC,0.85,0.93,0.86
1,EC,0.76,0.87,0.81
2,HGSC,0.7,1.0,0.8
3,LGSC,0.2,0.02,0.03
4,MC,0.65,0.6,0.56
5,Macro Avg,0.63,0.68,0.61
6,Weighted Avg,0.63,0.68,0.61


---------
## WHAT TO DO?

#### 1. Use a different ImageDataGenerator for `LGSC` class
#### 2. Use Ensemble of Methods 