In [1]:
%pwd

'/content'

In [2]:
%cd drive/MyDrive/Fall2021/M226/Project/

/content/drive/MyDrive/Fall2021/M226/Project


In [3]:
%ls

datapipeline_skeleton.ipynb  models.ipynb  train.csv
filename.pickle              test.csv      [0m[01;34mtrain_images[0m/
model_resnet18.pt            [01;34mtest_images[0m/  [01;34mtrain_images_resized[0m/


In [4]:
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
import numpy as np
import pickle
from torch.utils import data
import torch
import random

In [5]:
import cv2
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow

In [6]:
from tqdm import tqdm
from os import path

# Function to resize images to (320, 320)
# **No need to run this again: the resized images are already saved in `train_images_resized/`**

In [None]:
def resize_and_save_images(loc, original_dir, saving_dir, height = 320, width = 320):
  """Resize every image listed in a CSV file and save it to another directory.

  Args:
    loc: path to a CSV file with an `id_code` column.
    original_dir: directory prefix (with trailing separator) holding `<id_code>.png`.
    saving_dir: directory prefix (with trailing separator) for the resized copies.
    height: output image height in pixels.
    width: output image width in pixels.

  Returns:
    list of str: file names that could not be read, resized or written.
  """
  data = pd.read_csv(loc)
  unsaved = []
  for id_code in tqdm(data['id_code']):
    image_filename = f'{id_code.strip()}.png'
    file_loc = original_dir + image_filename
    saving_loc = saving_dir + image_filename

    # Already resized on a previous run: nothing to do.
    if path.exists(saving_loc):
      continue

    # cv2.imread signals a missing/unreadable file by returning None, not by raising.
    img = cv2.imread(file_loc)
    if img is None:
      unsaved.append(image_filename)
      continue

    try:
      resized = cv2.resize(img, (width, height))
      written = cv2.imwrite(saving_loc, resized)
    except cv2.error:
      written = False
    if not written:
      unsaved.append(image_filename)

  return unsaved

In [None]:
# One-off preprocessing: images that already exist in train_images_resized/ are skipped.
resize_and_save_images('train.csv', 'train_images/', 'train_images_resized/')

100%|██████████| 3662/3662 [00:19<00:00, 188.53it/s]


# Data pipeline creation
### Creating the train, validation and test splits along with their dataloaders

In [7]:
def get_image_label(data, dir, indices, limit = 32):
  """Load the images and diagnosis labels for the selected rows.

  Args:
    data: DataFrame with `id_code` and `diagnosis` columns.
    dir: directory prefix (with trailing separator) holding `<id_code>.png`.
    indices: positional row indices into `data`.
    limit: maximum number of indices to load. The default of 32 keeps the
      small subset used for quick testing; pass None to load every index.

  Returns:
    (X, y): `X` is the array of stacked images, `y` the array of int labels.
  """
  selected = indices if limit is None else indices[:limit]

  images, labels = [], []
  for i in selected:
    row = data.iloc[i]
    # Strip the id so the name matches the files written by resize_and_save_images.
    file_loc = f'{dir}{str(row.id_code).strip()}.png'
    # The context manager closes the file handle once the pixels are copied out.
    with Image.open(file_loc) as img:
      images.append(np.asarray(img))
    labels.append(int(row.diagnosis))

  X = np.array(images)
  y = np.array(labels)

  return X,y

In [8]:
def get_dataset(path, dir, train_size = 0.8, valid_size = 0.1):
  """Shuffle the rows of a CSV and load train/validation/test image splits.

  Args:
    path: path to the CSV file describing the images.
    dir: directory prefix passed on to get_image_label.
    train_size: fraction of rows assigned to the training split.
    valid_size: fraction of rows assigned to the validation split; the
      remaining rows form the test split.

  Returns:
    (train_X, train_y, valid_X, valid_y, test_X, test_y), with every X
    divided by 255.0.
  """
  data = pd.read_csv(path)
  n_rows = len(data)

  # Random permutation of the row positions.
  order = list(range(n_rows))
  random.shuffle(order)

  train_end = int(train_size * n_rows)
  valid_end = int((train_size + valid_size) * n_rows)
  partitions = (order[:train_end], order[train_end:valid_end], order[valid_end:])

  # Load train, then valid, then test; scale each image array by 255.
  loaded = []
  for part in partitions:
    X, y = get_image_label(data, dir, part)
    loaded.append(X / 255.0)
    loaded.append(y)

  return tuple(loaded)

# Set path to train.csv and train_images_resized folder

In [9]:
# Load the splits with the default fractions (0.8 train, 0.1 validation, remainder test).
train_X, train_y, valid_X, valid_y, test_X, test_y = get_dataset("train.csv", 'train_images_resized/')

In [10]:
# Sanity check: display the array shapes of every split.
train_X.shape, train_y.shape, valid_X.shape, valid_y.shape, test_X.shape, test_y.shape

((32, 320, 320, 3), (32,), (32, 320, 320, 3), (32,), (32, 320, 320, 3), (32,))

In [11]:
class DataGenerator(Dataset):
  """Dataset over an image array and its labels.

  The image array is stored with axes reordered from (0, 1, 2, 3) to
  (0, 3, 1, 2), i.e. the last axis is moved directly after the first.
  """

  def __init__(self, X, y):
    super().__init__()
    # Number of samples is taken from the first axis of the input.
    self.length = len(X)
    self.y = y
    self.X = np.transpose(X, axes=(0, 3, 1, 2))

  def __len__(self):
    return self.length

  def __getitem__(self, index):
    sample = self.X[index]
    label = self.y[index]
    return sample, label

In [12]:
# Batch size shared by all three loaders.
bsz = 32

# Training batches are reshuffled on every pass.
train_dataset = DataGenerator(train_X, train_y)
train_loader = data.DataLoader(train_dataset, batch_size = bsz, shuffle = True)

# Evaluation loaders keep a fixed order: the averaged loss does not depend on
# sample order, and a fixed order keeps validation runs reproducible
# (matching the test loader).
valid_dataset = DataGenerator(valid_X, valid_y)
valid_loader = data.DataLoader(valid_dataset, batch_size = bsz, shuffle = False)

test_dataset = DataGenerator(test_X, test_y)
test_loader = data.DataLoader(test_dataset, batch_size = bsz, shuffle = False)

## Model Generation

In [13]:
class generate_model(torch.nn.Module):
  """Base network followed by a two-layer fully connected classification head.

  Args:
    base_model: module whose output has the feature dimension on axis 1.
    hidden: width of the hidden layer in the head.
    num_outs: number of outputs (classes) produced by the head.
    input_shape: (channels, height, width) of one input sample, used only to
      probe the base model's output size.
  """

  def __init__(self, base_model, hidden = 128, num_outs = 5, input_shape = (3, 320, 320)):
    super(generate_model, self).__init__()

    # Probe on the device the base model lives on (CPU if it has no parameters),
    # so construction does not depend on a notebook-level `device` variable.
    first_param = next(base_model.parameters(), None)
    probe_device = first_param.device if first_param is not None else torch.device('cpu')

    # Run the dummy pass in eval mode without autograd: in train mode the
    # random input would update BatchNorm running statistics of the base model.
    was_training = base_model.training
    base_model.eval()
    with torch.no_grad():
      dummy_input = torch.rand(1, *input_shape)
      out = base_model(dummy_input.to(probe_device).float())
    base_model.train(was_training)
    input_size = out.shape[1]

    self.base_model = base_model
    self.fc = torch.nn.Sequential(
                            torch.nn.Linear(input_size, hidden),
                            torch.nn.ReLU(),
                            torch.nn.Linear(hidden, num_outs)
                            )

  def forward(self, x):
    """Return the head's output for the base model's features of `x`."""
    x = self.base_model(x)
    pred = self.fc(x)
    return pred

In [14]:
import torchvision.models as models

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [15]:
# NOTE(review): resnet18() is called without a weights argument, so it is
# presumably randomly initialised rather than pretrained -- confirm this is intended.
basemodel = models.resnet18().to(device)
# Wrap the backbone with the classification head defined in generate_model.
model = generate_model(base_model = basemodel).to(device)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


# To get an idea of the number of trainable parameters

In [None]:
# NOTE(review): torchsummary is imported here rather than with the other imports
# at the top; presumably it has to be installed separately -- confirm.
from torchsummary import summary
# Summarise the model for a single 3 x 320 x 320 input.
summary(model, (3, 320, 320))

# Training Pipeline

In [16]:
# File that train() saves the best model weights to.
model_file = "model_resnet18.pt"

# Learning rate for the Adam optimizer.
lr = 1e-4

# Loss function and optimizer over all model parameters.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [17]:
def evaluate(model, objective, loader):
  """Return the sample-weighted mean loss of `model` over `loader`.

  The model is switched to eval mode and left in that mode; callers that
  continue training must call `model.train()` afterwards.

  Args:
    model: module to evaluate.
    objective: loss callable taking (prediction, target) and returning a scalar tensor.
    loader: iterable of batches where batch[0] is the input tensor and
      batch[1] the target tensor.

  Returns:
    float: sum of per-batch loss times batch size, divided by the sample count.

  Raises:
    ZeroDivisionError: if `loader` yields no samples.
  """
  model.eval()

  # Run on the device the model's parameters live on (CPU if it has none).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  total_loss = 0.0
  size = 0
  # No gradients are needed here; without no_grad every batch's autograd
  # graph would be kept alive by the accumulated loss.
  with torch.no_grad():
    for data_batch in loader:
      X, y = data_batch[0].to(device).float(), data_batch[1].to(device)

      prediction = model(X)
      # .item() turns the scalar loss tensor into a Python float.
      total_loss += objective(prediction, y).item() * X.shape[0]
      size += X.shape[0]

  return total_loss / size

In [18]:
def train(model, objective, optimizer, train_loader, valid_loader, epochs = 1, save_interval = 1, patience = 3):
  """Train `model` with early stopping on the validation loss.

  After every epoch the validation loss is computed with `evaluate`. When it
  improves, the model's state dict is saved to the notebook-level `model_file`
  and the patience counter is reset; otherwise the counter is decremented and
  training stops once it is exhausted.

  Args:
    model: module to train.
    objective: loss callable taking (prediction, target).
    optimizer: optimizer over the model's parameters.
    train_loader: iterable of (input, target) training batches.
    valid_loader: iterable of (input, target) validation batches.
    epochs: maximum number of epochs.
    save_interval: currently unused; kept for interface compatibility.
    patience: number of consecutive non-improving epochs tolerated before stopping.
  """
  model.train()

  # Run on the device the model's parameters live on (CPU if it has none).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  # Best validation loss so far; infinity guarantees the first epoch counts as an improvement.
  val_loss = float('inf')
  pat = patience

  for epoch in range(1, epochs + 1):
    train_loss = 0.0
    size = 0

    for data_batch in train_loader:

      optimizer.zero_grad()

      train_X, train_y = data_batch[0].to(device).float(), data_batch[1].to(device)

      prediction = model(train_X)
      loss = objective(prediction, train_y)
      loss.backward()

      train_loss += loss.item() * train_X.shape[0]
      size += train_X.shape[0]

      optimizer.step()

    avg_loss = train_loss / size

    # float() accepts both a Python number and a 0-dim tensor, and makes sure
    # the stored best loss does not keep an autograd graph alive.
    rt_val_loss = float(evaluate(model, objective, valid_loader))
    # evaluate() leaves the model in eval mode; switch back for the next epoch.
    model.train()

    print(f'Epoch {epoch}: Training Loss : {avg_loss} | Validation loss : {rt_val_loss}')

    if rt_val_loss < val_loss:
      val_loss = rt_val_loss
      torch.save(model.state_dict(), model_file)
      pat = patience
    else:
      pat = pat - 1
      # `<=` so that patience = 0 also terminates instead of counting below zero forever.
      if pat <= 0:
        print('Training Complete --> Exiting')
        break

In [20]:
# Train for at most 10 epochs; train() stops early after `patience` (default 3)
# consecutive epochs without a validation-loss improvement.
train(model, criterion, optimizer, train_loader, valid_loader, epochs = 10)

Epoch 1: Training Loss : 1.6469745635986328 | Validation loss : 1.5910922288894653
Epoch 2: Training Loss : 1.4696815013885498 | Validation loss : 1.574928641319275
Epoch 3: Training Loss : 1.3427343368530273 | Validation loss : 1.5636005401611328
Epoch 4: Training Loss : 1.2360069751739502 | Validation loss : 1.5594807863235474
Epoch 5: Training Loss : 1.1266865730285645 | Validation loss : 1.5683856010437012
Epoch 6: Training Loss : 1.0135369300842285 | Validation loss : 1.5949983596801758
Epoch 7: Training Loss : 0.9001538753509521 | Validation loss : 1.643510341644287
Training Complete --> Exiting
