In [4]:
# Show the notebook's current working directory.
%pwd

'/content'

In [5]:
# Use the absolute Drive path so this cell is idempotent: the relative form
# only works when the working directory is still /content and fails on re-run.
%cd /content/drive/MyDrive/Fall2021/M226/Project/

/content/drive/MyDrive/Fall2021/M226/Project


In [6]:
# List the project folder to confirm the CSV files and image directories exist.
%ls

CS226.ipynb      models.ipynb  [0m[01;34mtest_images[0m/  [01;34mtrain_images[0m/
filename.pickle  test.csv      train.csv     [01;34mtrain_images_resized[0m/


In [7]:
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
import numpy as np
import pickle
from torch.utils import data

In [8]:
import cv2
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow

In [9]:
from tqdm import tqdm
from os import path

# Function to resize images to (320, 320)

In [None]:
def resize_and_save_images(loc, original_dir, saving_dir, height = 320, width = 320):
  """Resize every image listed in a CSV file and save it under ``saving_dir``.

  The CSV is expected to have an ``id_code`` column; each row maps to the file
  ``<id_code>.png`` inside ``original_dir``. Images whose resized copy already
  exists in ``saving_dir`` are skipped, so the function can be re-run safely.

  Args:
    loc: Path to the CSV file listing the images.
    original_dir: Directory containing the source images.
    saving_dir: Directory where resized images are written (must exist).
    height: Output height in pixels.
    width: Output width in pixels.

  Returns:
    List of image filenames that could not be read, resized, or written.
  """
  data = pd.read_csv(loc)
  unsaved = []
  for id_code in tqdm(data['id_code']):
    image_filename = f'{str(id_code).strip()}.png'
    # path.join accepts directories with or without a trailing separator.
    file_loc = path.join(original_dir, image_filename)
    saving_loc = path.join(saving_dir, image_filename)
    if path.exists(saving_loc):
      continue

    # cv2.imread signals a missing/unreadable file by returning None
    # instead of raising, so it has to be checked explicitly.
    img = cv2.imread(file_loc)
    if img is None:
      unsaved.append(image_filename)
      continue

    try:
      # cv2.resize takes the target size as (width, height).
      resized = cv2.resize(img, (width, height))
      written = cv2.imwrite(saving_loc, resized)
    except cv2.error:
      written = False
    if not written:
      unsaved.append(image_filename)
  return unsaved

In [None]:
# Create 320x320 copies of the training images listed in train.csv.
resize_and_save_images(
  loc='train.csv',
  original_dir='train_images/',
  saving_dir='train_images_resized/',
)

100%|██████████| 3662/3662 [00:19<00:00, 188.53it/s]


In [1]:
import random

# Data pipeline creation
### Creating the train/validation/test split and the data loaders

In [2]:
def get_image_label(data, dir, indices):
  """Load the images and labels for the selected rows of ``data``.

  Args:
    data: DataFrame with ``id_code`` and ``diagnosis`` columns.
    dir: Directory prefix of the image files; it is concatenated directly with
      ``<id_code>.png``, so it should end with a path separator.
    indices: Positional row indices (as used by ``iloc``) to load.

  Returns:
    Tuple ``(X, y)`` of NumPy arrays: ``X`` stacks the decoded images in the
    order of ``indices`` and ``y`` holds the matching integer diagnoses.
  """
  def load_image(row):
    # Strip the id the same way resize_and_save_images does when it names the
    # resized files, so both functions agree on the filename.
    filename = f"{dir}{str(data['id_code'].iloc[row]).strip()}.png"
    # The context manager closes the file handle right after decoding; leaving
    # thousands of images open can exhaust the process's file descriptors.
    with Image.open(filename) as img:
      return np.array(img)

  X = np.array([load_image(i) for i in indices])
  y = np.array([int(data['diagnosis'].iloc[i]) for i in indices])

  return X,y

In [3]:
def get_dataset(path, dir, train_size = 0.8, valid_size = 0.1, seed = None):
  """Read the label CSV and return a shuffled train/validation/test split.

  Args:
    path: Path to the CSV file (needs ``id_code`` and ``diagnosis`` columns).
    dir: Directory prefix of the image files, passed on to ``get_image_label``.
    train_size: Fraction of rows assigned to the training split.
    valid_size: Fraction of rows assigned to the validation split; the
      remaining rows form the test split.
    seed: Optional seed for a reproducible shuffle. When ``None`` the global
      ``random`` state is used, as before.

  Returns:
    ``(train_X, train_y, valid_X, valid_y, test_X, test_y)``.

  Raises:
    ValueError: If a fraction is negative or the two fractions exceed 1.
  """
  # Small tolerance so sums such as 0.9 + 0.1 are not rejected by float noise.
  if train_size < 0 or valid_size < 0 or train_size + valid_size > 1 + 1e-9:
    raise ValueError(
        'train_size and valid_size must be non-negative and sum to at most 1')

  data = pd.read_csv(path)
  n_rows = len(data)

  order = list(range(n_rows))
  if seed is None:
    random.shuffle(order)
  else:
    # A private generator keeps the split reproducible without touching the
    # global random state.
    random.Random(seed).shuffle(order)

  # Split boundaries are computed once and reused for all three slices.
  train_end = int(train_size * n_rows)
  valid_end = int((train_size + valid_size) * n_rows)

  train_X, train_y = get_image_label(data, dir, order[:train_end])
  valid_X, valid_y = get_image_label(data, dir, order[train_end:valid_end])
  test_X, test_y = get_image_label(data, dir, order[valid_end:])

  return train_X, train_y, valid_X, valid_y, test_X, test_y

# Set path to train.csv and train_images_resized folder

In [None]:
# Load the resized images and split them into train/validation/test arrays.
(train_X, train_y,
 valid_X, valid_y,
 test_X, test_y) = get_dataset(path="train.csv", dir='train_images_resized/')

In [12]:
# Shapes of every split, X before y, in train/valid/test order.
# NOTE(review): the recorded output shows 10 samples per split, which does not
# match a default 0.8/0.1 split of the full CSV; re-run to refresh it.
tuple(split.shape for split in (train_X, train_y, valid_X, valid_y, test_X, test_y))

((10, 320, 320, 3), (10,), (10, 320, 320, 3), (10,), (10, 320, 320, 3), (10,))

In [14]:
class DataGenerator(Dataset):
  """Map-style dataset that pairs each sample in ``X`` with its label in ``y``.

  Args:
    X: Indexable collection of samples (e.g. a NumPy array of images).
    y: Indexable collection of labels, the same length as ``X``.

  Raises:
    ValueError: If ``X`` and ``y`` have different lengths.
  """
  def __init__(self, X, y):
    super().__init__()
    # Fail early: a length mismatch would otherwise only surface as an
    # IndexError in the middle of an epoch, or silently drop labels.
    if len(X) != len(y):
      raise ValueError(
          f'X and y must have the same length, got {len(X)} and {len(y)}')
    self.X = X
    self.y = y
    self.length = len(X)
  
  def __getitem__(self, index):
    """Return the ``(sample, label)`` pair stored at ``index``."""
    return self.X[index], self.y[index]

  def __len__(self):
    """Return the number of samples."""
    return self.length

In [16]:
# One batch size shared by all loaders, so it is changed in a single place.
BATCH_SIZE = 5

# Only the training loader shuffles. Validation and test data are read in a
# fixed order so evaluation runs are repeatable and per-sample predictions can
# be matched back to their rows.
train_dataset = DataGenerator(train_X, train_y)
train_loader = data.DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)

valid_dataset = DataGenerator(valid_X, valid_y)
valid_loader = data.DataLoader(valid_dataset, batch_size = BATCH_SIZE, shuffle = False)

test_dataset = DataGenerator(test_X, test_y)
test_loader = data.DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = False)

In [20]:
# Sanity check: print the index and the image/label tensor shapes of each batch.
for batch, data_batch in enumerate(train_loader):
  images, labels = data_batch[0], data_batch[1]
  print(batch, images.shape, labels.shape, sep='\n')

0
torch.Size([5, 320, 320, 3])
torch.Size([5])
1
torch.Size([5, 320, 320, 3])
torch.Size([5])


## Training Skeleton

In [None]:
def train(model, optimizer, train_loader, valid_loader, epochs = 1, save_interval = 1):
  """Skeleton training loop: optimise ``model`` and evaluate it periodically.

  ``__LOSS__`` and ``__EVALUATE__`` are placeholders that must be replaced by
  (or bound to) a loss function ``(prediction, target) -> loss`` and an
  evaluation function ``(model, loader) -> validation loss`` before use.

  Args:
    model: Module to train; called as ``model(batch_inputs)``.
    optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
    train_loader: Iterable of ``(inputs, targets)`` batches with a ``dataset``
      attribute.
    valid_loader: Loader passed to the evaluation placeholder.
    epochs: Number of passes over ``train_loader``.
    save_interval: Evaluate every ``save_interval`` epochs.
  """
  for epoch in range(1, epochs + 1):
    # Re-enter training mode every epoch, in case the evaluation step at the
    # end of the previous epoch switched the model to eval mode.
    model.train()
    train_loss = 0
    for data_batch in train_loader:
      optimizer.zero_grad()
      train_X, train_y = data_batch[0], data_batch[1]
      prediction = model(train_X)
      loss = __LOSS__(prediction, train_y)
      loss.backward()
      train_loss += loss.item()
      optimizer.step()
    # NOTE(review): dividing by the number of samples gives a per-sample
    # average only if __LOSS__ returns a summed loss; for a mean-reduced loss
    # divide by len(train_loader) instead. Confirm once the loss is chosen.
    avg_loss = train_loss / len(train_loader.dataset)

    # The interval is counted in epochs; the leftover batch index from the
    # inner loop is the same at the end of every epoch.
    if epoch % save_interval == 0:
      val_loss = __EVALUATE__(model, valid_loader)
      # TODO: report avg_loss / val_loss and save a checkpoint here.
    