# Model Development

In [None]:
import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from PIL import Image
from pathlib import Path
from torchvision import transforms, models
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Pick the compute device once; the rest of the notebook reads `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    # Training still works on CPU, so only inform the user.
    print("no gpu available! Using cpu instead.")

In [None]:
class ImageDataset(torch.utils.data.Dataset):
    """Dataset of (image, label) pairs described by a table.

    Column 0 of ``df`` holds the image file name (relative to ``root_dir``)
    and column 1 holds the label, which is returned as an ``int``.

    Args:
        df: Table supporting ``len()`` and positional ``.iloc[row, col]``
            access (e.g. a pandas DataFrame).
        root_dir: Folder that contains the image files.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, root_dir, transform=None):
        self.df = df
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``."""
        img_path = Path(self.root_dir, self.df.iloc[idx, 0])
        label = int(self.df.iloc[idx, 1])

        # Open inside a context manager so the file handle is released, and
        # force 3-channel RGB: grayscale / palette / RGBA files would
        # otherwise yield a channel count that the 3-channel normalisation
        # and the ResNet input layer used in this notebook reject.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        # Compare against None explicitly: a transform object that defines
        # __len__ (e.g. an empty container) would be falsy and skipped.
        if self.transform is not None:
            image = self.transform(image)

        return image, label

In [None]:
# Preprocessing pipeline applied to every PIL image before it reaches the model.
transform = transforms.Compose(
    [
        # Bring every image to a fixed 256x256 resolution.
        transforms.Resize(size=(256, 256)),
        # Random left/right mirroring (library default probability of 0.5).
        transforms.RandomHorizontalFlip(p=0.5),
        # PIL image -> float tensor.
        transforms.ToTensor(),
        # Per-channel standardisation with the statistics below.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
# Development-set table; columns 0 and 1 are later read as file name and label.
csv_fp = "/data/pathology/users/clement/diag_day/data/dev.csv"
df = pd.read_csv(filepath_or_buffer=csv_fp)
# Preview the first rows in the notebook output.
df.head()

In [None]:
# Hold out 20% of the rows for tuning; stratify on the label so both
# subsets keep the class balance, and fix the seed for reproducibility.
train_df, tune_df = train_test_split(
    df,
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)
# Show the resulting subset sizes.
len(train_df), len(tune_df)

**<span style="color:red;">Specify here the path to the folder that contains the development set images.</span>**

In [None]:
# Folder holding the development-set image files referenced by the CSV.
image_dir = "/data/pathology/users/clement/diag_day/data/dev"

In [None]:
# Training samples go through the full (augmenting) pipeline.
train_dataset = ImageDataset(df=train_df, root_dir=image_dir, transform=transform)

# Tuning must be deterministic: reuse the same preprocessing steps but drop
# the random flip, otherwise the tuning loss / AUC (and therefore the choice
# of the "best" checkpoint) change from one evaluation to the next.
tune_transform = transforms.Compose(
    [
        step
        for step in transform.transforms
        if not isinstance(step, transforms.RandomHorizontalFlip)
    ]
)
tune_dataset = ImageDataset(df=tune_df, root_dir=image_dir, transform=tune_transform)

In [None]:
# Mini-batches of 32 images; only the training stream is reshuffled each epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)
tune_loader = torch.utils.data.DataLoader(
    dataset=tune_dataset,
    batch_size=32,
    shuffle=False,
)

In [None]:
# ResNet-18 backbone initialised with pretrained weights.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` - confirm against the installed version before switching.
model = models.resnet18(pretrained=True)
# Swap the classification head for a 2-way output (binary classification).
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=2)
model = model.to(device)

In [None]:
# Cross-entropy over the two output logits, averaged over the batch
# (the library default reduction, spelled out here).
criterion = nn.CrossEntropyLoss(reduction="mean")

In [None]:
# Adam over every model parameter (the whole network is fine-tuned).
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def train_model(model, train_loader, tune_loader, criterion, optimizer, nepochs=10, tune_every_n_epochs=2):
    """Train ``model`` and return the weights with the lowest tuning loss.

    Each epoch performs one optimisation pass over ``train_loader``. Every
    ``tune_every_n_epochs`` epochs the model is evaluated on ``tune_loader``;
    the mean per-batch loss and the ROC AUC are reported, and the weights are
    snapshotted whenever the tuning loss improves.

    Batches are moved to the module-level ``device`` before use.

    Args:
        model: Network to optimise; it is modified in place.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        tune_loader: Iterable of ``(inputs, labels)`` tuning batches.
        criterion: Loss callable ``criterion(outputs, labels)``.
        optimizer: Optimiser bound to ``model``'s parameters.
        nepochs: Number of training epochs.
        tune_every_n_epochs: Run a tuning pass after every this many epochs.

    Returns:
        An independent copy of the ``state_dict`` taken at the epoch with the
        lowest tuning loss. If no tuning pass ran (for example
        ``nepochs < tune_every_n_epochs``), the final weights are returned
        instead of ``None`` so the caller always has something valid to save.
    """
    import copy

    # `import tqdm` alone does not load the `tqdm.notebook` submodule, so
    # import the notebook progress bar explicitly instead of relying on some
    # other package having imported it already.
    from tqdm.notebook import tqdm as notebook_tqdm

    best_tune_loss = float('inf')
    best_model_wts = None

    with notebook_tqdm(
        range(nepochs),
        desc="Model training",
        unit=" epoch",
        position=0,
        leave=True
    ) as t:

        for epoch in t:

            model.train()
            running_loss = 0.0
            with notebook_tqdm(
                train_loader,
                desc=f"Train - Epoch [{epoch+1}/{nepochs}]",
                unit=" batch",
                leave=False,
            ) as train_t:

                for inputs, labels in train_t:

                    inputs, labels = inputs.to(device), labels.to(device)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()
                    running_loss += loss.item()

                tqdm.tqdm.write(f"Epoch [{epoch+1}/{nepochs}] | Training Loss: {running_loss/len(train_loader)}")

            # tuning step every n epochs
            if (epoch + 1) % tune_every_n_epochs == 0:

                model.eval()
                tune_loss = 0.0
                all_labels = []
                all_scores = []

                with torch.no_grad():

                    with notebook_tqdm(
                        tune_loader,
                        desc="Tune",
                        unit=" batch",
                        leave=False,
                    ) as tune_t:

                        for inputs, labels in tune_t:

                            inputs, labels = inputs.to(device), labels.to(device)
                            outputs = model(inputs)
                            loss = criterion(outputs, labels)
                            tune_loss += loss.item()

                            # ROC AUC is a ranking metric: feed it the
                            # continuous probability of class 1, not the
                            # arg-max label (which collapses the curve to a
                            # single operating point). Assumes the model
                            # emits two logits per sample.
                            positive_probs = torch.softmax(outputs, dim=1)[:, 1]
                            all_labels.extend(labels.cpu().tolist())
                            all_scores.extend(positive_probs.cpu().tolist())

                average_tune_loss = tune_loss / len(tune_loader)
                # AUC is undefined when only one class is present; report NaN
                # rather than aborting the whole training run.
                if len(set(all_labels)) > 1:
                    auc_score = roc_auc_score(all_labels, all_scores)
                else:
                    auc_score = float('nan')
                tqdm.tqdm.write(f"Epoch [{epoch+1}/{nepochs}] | Tuning Loss after: {average_tune_loss} | AUC: {auc_score}")

                # get best model weights
                if average_tune_loss < best_tune_loss:
                    best_tune_loss = average_tune_loss
                    # state_dict().copy() is shallow: its tensors alias the
                    # live parameters and would keep changing with further
                    # training. Deep-copy to freeze the snapshot.
                    best_model_wts = copy.deepcopy(model.state_dict())

    # No tuning pass ran: fall back to the final weights instead of None.
    if best_model_wts is None:
        best_model_wts = copy.deepcopy(model.state_dict())

    print("Finished training")
    return best_model_wts

In [None]:
# Short run: a single epoch, with a tuning pass after it.
best_weights = train_model(
    model=model,
    train_loader=train_loader,
    tune_loader=tune_loader,
    criterion=criterion,
    optimizer=optimizer,
    nepochs=1,
    tune_every_n_epochs=1,
)
# Persist the selected weights next to the notebook.
torch.save(obj=best_weights, f="best.pt")