In [1]:
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
import numpy as np
import pickle
from torch.utils import data
import torch
import random
import matplotlib.pyplot as plt
from os import path

In [2]:
''' for resizing images
import cv2

def resize_and_save_images(loc, original_dir, saving_dir, height = 160, width = 160):
    data = pd.read_csv(loc)
    for i in range(len(data)):
        image_filename = f'{data.iloc[i].id_code.strip()}.png'
        file_loc = original_dir + image_filename
        saving_loc = saving_dir + image_filename
        if not path.exists(saving_loc):
            try:
                img = cv2.imread(file_loc)
                img = cv2.resize(img, (width, height))
                cv2.imwrite(saving_loc, img)
            except:
                unsaved.append(f'{image_filename}')
                # print(f'{image_filename}')
                
resize_and_save_images('/u/home/r/rosemary/scratch/train.csv', 
                       '/u/home/r/rosemary/scratch/train_images_resized/', 
                       '/u/home/r/rosemary/scratch/train_images_smol/')
'''

" for resizing images\nimport cv2\n\ndef resize_and_save_images(loc, original_dir, saving_dir, height = 160, width = 160):\n    data = pd.read_csv(loc)\n    for i in range(len(data)):\n        image_filename = f'{data.iloc[i].id_code.strip()}.png'\n        file_loc = original_dir + image_filename\n        saving_loc = saving_dir + image_filename\n        if not path.exists(saving_loc):\n            try:\n                img = cv2.imread(file_loc)\n                img = cv2.resize(img, (width, height))\n                cv2.imwrite(saving_loc, img)\n            except:\n                unsaved.append(f'{image_filename}')\n                # print(f'{image_filename}')\n                \nresize_and_save_images('/u/home/r/rosemary/scratch/train.csv', \n                       '/u/home/r/rosemary/scratch/train_images_resized/', \n                       '/u/home/r/rosemary/scratch/train_images_smol/')\n"

In [3]:
def get_image_label(data, dir, indices):

    # just looking at 32 examples for testing purposes
    X = np.array([np.asarray(Image.open(f'{dir}{data.iloc[i].id_code}.png')) for i in indices])
    y = np.array([int(data.iloc[i].diagnosis) for i in indices])

    return X,y

def get_dataset(path, dir, train_size = 0.8, valid_size = 0.1):

    data = pd.read_csv(path)
    arr = [i for i in range(len(data))]
    random.shuffle(arr)
    train_indices = arr[: int(train_size * len(data)) ]
    train_X, train_y = get_image_label(data, dir, train_indices)
    train_X = train_X / 255.0

    valid_indices = arr[int(train_size * len(data)) : int((train_size + valid_size) * len(data))]
    valid_X, valid_y = get_image_label(data, dir, valid_indices)
    valid_X = valid_X / 255.0

    test_indices = arr[int((train_size + valid_size) * len(data)) : ]
    test_X, test_y = get_image_label(data, dir, test_indices)
    test_X = test_X / 255.0

    return train_X, train_y, valid_X, valid_y, test_X, test_y


In [4]:
train_X, train_y, valid_X, valid_y, test_X, test_y = get_dataset("/u/home/r/rosemary/scratch/train.csv", 
                                                                 '/u/home/r/rosemary/scratch/train_images_smol/')
train_X.shape, train_y.shape, valid_X.shape, valid_y.shape, test_X.shape, test_y.shape

((2929, 160, 160, 3),
 (2929,),
 (366, 160, 160, 3),
 (366,),
 (367, 160, 160, 3),
 (367,))

In [5]:
class DataGenerator(Dataset):
    def __init__(self, X, y):
        super(DataGenerator, self).__init__()
        self.X = np.transpose(X, (0, 3, 1, 2))
        self.y = y
        self.length = len(X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return self.length

In [6]:
bsz = 128

train_dataset = DataGenerator(train_X, train_y)
train_loader = data.DataLoader(train_dataset, batch_size= bsz, shuffle = True)

valid_dataset = DataGenerator(valid_X, valid_y)
valid_loader = data.DataLoader(valid_dataset, batch_size = bsz, shuffle = True)

test_dataset = DataGenerator(test_X, test_y)
test_loader = data.DataLoader(test_dataset, batch_size = bsz, shuffle = False)

In [7]:
class generate_model(torch.nn.Module):

    def __init__(self, base_model, hidden = 128, num_outs = 5):
        super(generate_model, self).__init__()

        # create a dummy input
        dummy_input = torch.rand(1, 3, 320, 320)
        out = base_model(dummy_input.to(device).float())
        input_size = out.shape[1]

        self.base_model = base_model
        self.fc = torch.nn.Sequential(
                                torch.nn.Linear(input_size, hidden), 
                                torch.nn.ReLU(),
                                torch.nn.Linear(hidden, num_outs)
                                )

    def forward(self, x):
        x = self.base_model(x)
        pred = self.fc(x)
        return pred

In [8]:
import torchvision.models as models

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [9]:
print(device)

cpu


In [14]:
basemodel = models.alexnet().to(device)
model = generate_model(base_model = basemodel).to(device)

model_file = "alexnet.pt"

lr = 1e-4

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr)

criterion = torch.nn.CrossEntropyLoss()

In [10]:
# from torchsummary import summary
# from pytorch_model_summary import summary

# summary(model, torch.zeros(1,3, 160, 160).to(device).float(), show_input=True)

In [11]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from scipy.special import softmax

def evaluate(model, objective, loader):

    model.eval()
    total_loss = 0
    size = 0
    
    with torch.no_grad():
        for batch_idx, data_batch in enumerate(loader):

            X, y = data_batch[0].to(device).float(), data_batch[1].to(device)
            # X, y = map(lambda t: t.to(device).float(), (X, y))

            prediction = model(X)
            total_loss += objective(prediction, y) * X.shape[0]
            
            size += X.shape[0]

    total_loss = total_loss / size

    return total_loss

def train(model, objective, optimizer, train_loader, valid_loader, epochs = 1, save_interval = 1, patience = 3):
  
    model.train()

    val_loss = 1e7
    pat = patience

    for epoch in range(1, epochs + 1):
        train_loss = 0
        size = 0

        for batch_idx, data_batch in enumerate(train_loader):

            optimizer.zero_grad()

            train_X, train_y = data_batch[0].to(device).float(), data_batch[1].to(device)
            # train_X, train_y = map(lambda t: t.to(device).float(), (train_X, train_y))

            prediction = model(train_X)
            loss = objective(prediction, train_y)
            loss.backward()

            train_loss += loss.item() * train_X.shape[0]
            size += train_X.shape[0]

            optimizer.step()

        avg_loss = train_loss / size
        
        rt_val_loss = evaluate(model, objective, valid_loader)
        model.train()

        print(f'Epoch {epoch}: Training Loss : {avg_loss} | Validation loss : {rt_val_loss}')

        if rt_val_loss < val_loss:
            val_loss = rt_val_loss
            torch.save(model.state_dict(), model_file)
            pat = patience
        else:
            pat = pat - 1
            if pat == 0:
                print('Training Complete --> Exiting')
                break    

In [11]:
train(model, criterion, optimizer, train_loader, valid_loader, epochs = 50)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/u/home/r/rosemary/.conda/envs/myenv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-0c06bbbea35f>", line 1, in <module>
    train(model, criterion, optimizer, train_loader, valid_loader, epochs = 50)
  File "<ipython-input-11-fee72951a315>", line 43, in train
    prediction = model(train_X)
  File "/u/home/r/rosemary/.conda/envs/myenv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<ipython-input-6-dfd9f70b3b0e>", line 19, in forward
    x = self.base_model(x)
  File "/u/home/r/rosemary/.conda/envs/myenv/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/u/home/r/rosemary/.conda/envs/myenv/lib/python3.6/site-packages/torchvision/models/vgg.py", line 49, in forwa

TypeError: object of type 'NoneType' has no len()

In [12]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer

def class_report(y_true, y_pred):
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" % (
              y_true.shape,
              y_pred.shape)
        )
        return

    lb = LabelBinarizer()

    if len(y_true.shape) == 1:
        lb.fit(y_true)

    #Value counts of predictions
    labels, cnt = np.unique(
        y_pred,
        return_counts=True)
    n_classes = len(labels)
    pred_cnt = pd.Series(cnt, index=labels)

    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels)

    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    support = class_report_df.loc['support']
    total = support.sum() 
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred'] = pred_cnt
    class_report_df['pred'].iloc[-1] = total

    return class_report_df

def test_evaluate(model, objective, loader):
    model.eval()
    total_loss = 0
    size = 0
    
    y_prob = []
    y_pred = []
    y_true = []
    
    with torch.no_grad():
        for batch_idx, data_batch in enumerate(loader):

            X, y = data_batch[0].to(device).float(), data_batch[1].to(device)
            # X, y = map(lambda t: t.to(device).float(), (X, y))

            prediction = model(X)
            total_loss += objective(prediction, y) * X.shape[0]
            
            pred = softmax(prediction)            
            pred = np.array([np.argmax(x) for x in pred])
            
            y_pred = np.append(y_pred, pred)
            y_true = np.append(y_true, y.numpy())

            size += X.shape[0]
    
    report = class_report(y_true, y_pred)
    print(report)

    total_loss = total_loss / size
    return total_loss

In [16]:
# model.load_state_dict(torch.load('densenet161.pt',map_location=torch.device('cpu')))
model.load_state_dict(torch.load('alexnet.pt',map_location=torch.device('cpu')))
print('alexnet')
test_evaluate(model, criterion, test_loader)

alexnet
             precision    recall  f1-score  support   pred
0.0           0.919192  0.968085  0.943005    188.0  198.0
1.0           0.408163  0.588235  0.481928     34.0   49.0
2.0           0.588235  0.777778  0.669856     90.0  119.0
3.0           1.000000  0.038462  0.074074     26.0    1.0
avg / total   0.723779  0.743869  0.697230    338.0  338.0


  _warn_prf(average, modifier, msg_start, len(result))


tensor(0.7315)