In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.notebook import tqdm

In [None]:
# Remove any archive left over from an earlier run, then download a fresh copy.
# -f keeps rm quiet (and successful) when there is nothing to delete yet.
!rm -f tredence_chest_dataset.zip
!wget https://he-public-data.s3.ap-southeast-1.amazonaws.com/tredence_chest_dataset.zip

In [None]:
# Start from a clean slate: drop the extracted data and the derived folder
# layout, then unpack the downloaded archive again.
!rm -rf dataset prepped
!unzip tredence_chest_dataset.zip

In [None]:
# Sanity check: list the extracted training images.
!ls dataset/train

In [None]:
import os
import csv

def mk_if_not(path):
    """Create ``path`` as an empty directory, discarding whatever was there before.

    Despite the name, an existing directory (or file) at ``path`` is removed
    first, so the result is always a fresh, empty directory.

    Args:
        path: Directory to (re)create. Missing parent directories are created too.

    Raises:
        OSError: If the old entry cannot be removed or the directory cannot be
            created (previously such failures were silently ignored).
    """
    # Imported locally so the function does not depend on a notebook-level import.
    import shutil

    # Work through the filesystem API rather than a shell string, so paths with
    # spaces or shell metacharacters are handled literally.
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    elif os.path.lexists(path):
        os.remove(path)
    os.makedirs(path, exist_ok=True)

# Lay out prepped/<split>/<class> for every split and class, each one recreated empty.
mk_if_not('prepped')
for split_name in ('train', 'val', 'test'):
    mk_if_not('prepped/' + split_name)
    for label_dir in ('nofinding', 'cardiomegaly'):
        mk_if_not('prepped/' + split_name + '/' + label_dir)

def get_label(path):
    """Read a two-column CSV into a ``{first column: normalised second column}`` dict.

    The first row is treated as a header and skipped. Each label has all
    whitespace removed and is lower-cased (e.g. ``"No Finding"`` -> ``"nofinding"``).

    Args:
        path: Path of the CSV file to read.

    Returns:
        dict mapping the first column of each data row to its normalised label.

    Raises:
        AssertionError: If the number of data rows differs from the number of
            distinct keys (e.g. duplicate keys or an empty file).
    """
    labels = {}
    rows_seen = 0
    with open(path) as handle:
        for index, record in enumerate(csv.reader(handle, delimiter=',')):
            rows_seen = index + 1
            if index == 0:
                # Header row: counted, but not stored.
                continue
            labels[record[0]] = "".join(record[1].split()).lower()
        # Exactly one header row plus one row per distinct key.
        assert rows_seen == len(labels) + 1
    return labels

def split_and_dump(split):
    """Copy the raw images into the ``prepped/<split>/<class>/`` folder layout.

    Args:
        split: ``"train"`` or ``"test"``.

            * ``"train"`` reads ``dataset/train.csv``; the first 6/7 of the rows
              (in file order) go to ``prepped/train`` and the remainder to
              ``prepped/val``, each image into the folder named after its
              normalised label.
            * ``"test"`` reads ``dataset/test.csv`` for the file names only; the
              class folder is picked from the integer that follows the first
              underscore in the file name (``< 2`` -> ``nofinding``, otherwise
              ``cardiomegaly``).
              NOTE(review): this looks like a placeholder assignment rather than
              a real label -- confirm.

    Raises:
        AssertionError: If ``split`` is neither ``"train"`` nor ``"test"``.
    """
    # Imported locally so the function does not depend on a notebook-level import.
    import shutil

    def _copy(src, dst):
        # Copy through the filesystem API (no shell string to mis-parse). As before,
        # one bad file does not abort the run, but the failure is now reported.
        try:
            shutil.copy(src, dst)
        except OSError as err:
            print("Could not copy {} -> {}: {}".format(src, dst, err))

    if split == "train":
        label = list(get_label('dataset/train.csv').items())
        # Separate name for the cut index so the `split` argument is not overwritten.
        cut = (6 * len(label)) // 7
        train, val = label[:cut], label[cut:]
        print(set([x[1] for x in label]), len(train), len(val))
        for sp, f in [('train', train), ('val', val)]:
            for k, v in f:
                _copy("dataset/train/" + k, "prepped/" + sp + "/" + v + "/" + k)
    else:
        assert split == "test"
        test = list(get_label('dataset/test.csv').items())
        print(set([x[1] for x in test]), len(test))
        for k, _ in test:
            if int(k.split('_')[1].split('.')[0]) < 2:
                _copy("dataset/test/" + k, "prepped/test/nofinding/" + k)
            else:
                _copy("dataset/test/" + k, "prepped/test/cardiomegaly/" + k)

split_and_dump("train")
split_and_dump("test")

In [None]:
import os
# Report how many images ended up in each split/class folder.
for split_name in ['test', 'train', 'val']:
    split_dir = 'prepped/' + split_name.strip()
    n_cardiomegaly = len(os.listdir(split_dir + '/cardiomegaly'))
    print(split_name + '/cardiomegaly:', n_cardiomegaly)
    n_nofinding = len(os.listdir(split_dir + '/nofinding'))
    print(split_name + '/nofinding   :', n_nofinding)

In [None]:
import matplotlib.pyplot as plt 
import torch.nn.functional as F 
import torch 
import numpy as np 

def show_image(image,label,get_denormalize = True):
    """Plot one image tensor with ``label`` as the title.

    Args:
        image: Image tensor; it is permuted with ``(1, 2, 0)`` before plotting,
            i.e. the first axis is moved last.
        label: Text used as the plot title.
        get_denormalize: When truthy, undo the per-channel normalisation
            (``image * std + mean`` with the constants below) and clip the
            result to ``[0, 1]`` before plotting.
    """
    image = image.permute(1,2,0)

    if get_denormalize:
        # Same per-channel constants the notebook's T.Normalize calls use.
        mean = torch.FloatTensor([0.485, 0.456, 0.406])
        std = torch.FloatTensor([0.229, 0.224, 0.225])
        image = image*std + mean
        image = np.clip(image,0,1)

    # Plotting is identical for both cases, so it lives outside the branch.
    plt.imshow(image)
    plt.title(label)

def show_grid(image,title = None):
    """Plot an image tensor on a 15x15 figure after undoing the normalisation.

    Args:
        image: Image tensor; it is permuted with ``(1, 2, 0)`` before plotting,
            i.e. the first axis is moved last.
        title: Optional figure title; nothing is set when it is ``None``.
    """
    image = image.permute(1,2,0)
    # Same per-channel constants the notebook's T.Normalize calls use.
    mean = torch.FloatTensor([0.485, 0.456, 0.406])
    std = torch.FloatTensor([0.229, 0.224, 0.225])

    image = image*std + mean
    image = np.clip(image,0,1)

    plt.figure(figsize=[15, 15])
    plt.imshow(image)
    # Identity check: `!= None` is evaluated element-wise for array-like titles
    # and would then fail inside the `if`.
    if title is not None:
        plt.title(title)


def accuracy(y_pred,y_true):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        y_pred: Per-class scores; softmax and top-1 are taken along ``dim=1``.
        y_true: Target class indices, reshaped to match the top-1 predictions.

    Returns:
        0-dim float tensor holding the mean of the per-row hit indicators.
    """
    probabilities = F.softmax(y_pred, dim=1)
    _, predicted = probabilities.topk(1, dim=1)
    targets = y_true.view(*predicted.shape)
    hits = (predicted == targets).type(torch.FloatTensor)
    return torch.mean(hits)


def view_classify(image,ps,label):
    """Show an image next to a horizontal bar chart of its class probabilities.

    Args:
        image: Normalised image tensor; it is permuted with ``(1, 2, 0)`` and
            de-normalised before plotting.
        ps: Tensor of class probabilities; moved to the CPU, converted to NumPy
            and squeezed before plotting.
        label: Index into ``['cardiomegaly', 'nofinding']`` used for the
            ground-truth title.

    Returns:
        None. The figure is drawn as a side effect.
    """
    class_name = ['cardiomegaly', 'nofinding']
    classes = np.array(class_name)

    probabilities = ps.cpu().data.numpy().squeeze()

    # Undo the per-channel normalisation and clip into the displayable range.
    channel_mean = torch.FloatTensor([0.485, 0.456, 0.406])
    channel_std = torch.FloatTensor([0.229, 0.224, 0.225])
    restored = image.permute(1,2,0)*channel_std + channel_mean
    img = np.clip(restored,0,1)

    fig, (image_ax, bar_ax) = plt.subplots(figsize=(8,12), ncols=2)

    # Left panel: the image with its true class.
    image_ax.imshow(img)
    image_ax.set_title('Ground Truth : {}'.format(class_name[label]))
    image_ax.axis('off')

    # Right panel: one bar per class.
    bar_ax.barh(classes, probabilities)
    bar_ax.set_aspect(0.1)
    bar_ax.set_yticks(classes)
    bar_ax.set_yticklabels(classes)
    bar_ax.set_title('Predicted Class')
    bar_ax.set_xlim(0, 1.1)

    plt.tight_layout()

In [None]:
class CFG:
    """Hyper-parameters and dataset locations shared by the notebook cells."""

    epochs = 5                              # number of training epochs
    lr = 0.0005                             # learning rate
    batch_size = 16                         # batch size used by every DataLoader

    model_name = 'tf_efficientnet_b4_ns'    # timm model identifier
    img_size = 224                          # images are resized to img_size x img_size

    # Dataset roots. They are resolved against the current working directory
    # because the preparation cells write to the relative 'prepped/...' folders;
    # the value equals the former hard-coded '/content/prepped/...' when the
    # notebook runs from /content.
    train_path    = os.path.abspath('prepped/train')
    validate_path = os.path.abspath('prepped/val')
    test_path     = os.path.abspath('prepped/test')

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("On which device we are on:{}".format(device))

In [None]:
from torchvision import transforms as T,datasets

In [None]:
# All three pipelines resize to CFG.img_size x CFG.img_size, convert the image
# to a tensor and normalise each of the three channels with the same mean/std.
# Only the training pipeline adds augmentation (a random rotation).
train_transform = T.Compose([
    T.Resize(size=(CFG.img_size, CFG.img_size)),
    T.RandomRotation(degrees=(-20, +20)),   # random rotation within +/- 20 degrees
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Validation: no augmentation.
validate_transform = T.Compose([
    T.Resize(size=(CFG.img_size, CFG.img_size)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Test: same steps as validation.
test_transform = T.Compose([
    T.Resize(size=(CFG.img_size, CFG.img_size)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [None]:
# Training images, read from the class sub-folders with the augmenting pipeline.
trainset = datasets.ImageFolder(root=CFG.train_path, transform=train_transform)
print("Trainset Size:  {}".format(len(trainset)))

In [None]:
# Validation images, read from the class sub-folders without augmentation.
validateset = datasets.ImageFolder(root=CFG.validate_path, transform=validate_transform)
print("validateset Size:  {}".format(len(validateset)))

In [None]:
# Test images, read from the class sub-folders without augmentation.
testset = datasets.ImageFolder(root=CFG.test_path, transform=test_transform)
print("testset Size:  {}".format(len(testset)))

In [None]:
# Show one training sample with its class name as the title.
img,label = trainset[10]

# Take the index -> name list from the dataset itself, so it cannot drift from
# the order ImageFolder assigned to the class folders.
class_name = trainset.classes
show_image(img,class_name[label])

In [None]:
from torch.utils.data import DataLoader
from torchvision.utils import make_grid

In [None]:
# Shuffled mini-batches for training.
trainloader = DataLoader(dataset=trainset, batch_size=CFG.batch_size, shuffle=True)
print("No. of batches in trainloader:{}".format(len(trainloader)))  # dataset size / batch size, rounded up
print("No. of Total examples:{}".format(len(trainloader.dataset)))

In [None]:
# Mini-batches for validation. No shuffling: the trainer averages per-batch
# metrics, so a fixed order keeps the reported validation loss/accuracy
# reproducible from run to run.
validationloader = DataLoader(validateset,batch_size=CFG.batch_size,shuffle=False)
print("No. of batches in validationloader:{}".format(len(validationloader)))  # dataset size / batch size, rounded up
print("No. of Total examples:{}".format(len(validationloader.dataset)))

In [None]:
# Mini-batches for the test set, kept in dataset order.
testloader = DataLoader(dataset=testset, batch_size=CFG.batch_size, shuffle=False)
print("No. of batches in testloader:{}".format(len(testloader)))  # dataset size / batch size, rounded up
print("No. of Total examples:{}".format(len(testloader.dataset)))

In [None]:
# Preview one training batch as an image grid, titled with the batch's class names.
dataiter = iter(trainloader)
# Use the built-in next(): the iterator's own .next() method is no longer
# available on current DataLoader iterators.
images,labels = next(dataiter)

out = make_grid(images,nrow=4)

show_grid(out,title = [class_name[x] for x in labels])

In [None]:
# Install timm (PyTorch Image Models), which the next cell imports.
!pip install timm # install PyTorch Image Models

In [None]:
from torch import nn
import torch.nn.functional as F
import timm # PyTorch Image Models

# Build the architecture named in CFG.model_name, starting from timm's pretrained weights.
model = timm.create_model(CFG.model_name, pretrained=True)

In [None]:
# Remove any checkpoint left over from an earlier run, then download it again.
# -f keeps rm quiet (and successful) when there is nothing to delete yet.
!rm -f ColabPneumoniaModel.pt
!wget https://github.com/Ayushk4/semStance/releases/download/tagg/ColabPneumoniaModel.pt

In [None]:
# Warm-start the backbone from the downloaded checkpoint.
# NOTE(security): torch.load unpickles the file, and this one was fetched from
# the network -- only run this with a checkpoint you trust.
# map_location='cpu' lets a checkpoint that was saved on a GPU load on a
# CPU-only machine as well.
prev_state_dict = torch.load('ColabPneumoniaModel.pt', map_location='cpu')

# The checkpoint's head uses the keys classifier.0/3/5.*, while this model
# expects classifier.weight/bias, so only the non-classifier entries are kept.
backbone_state = {
    name: tensor
    for name, tensor in prev_state_dict.items()
    if not name.startswith('classifier.')
}

# strict=False leaves this model's own classifier untouched, instead of
# filling it with random noise just to satisfy a strict load.
load_result = model.load_state_dict(backbone_state, strict=False)

# Stay strict about everything except the head: nothing unexpected may be in
# the checkpoint, and only classifier entries may be missing.
assert not load_result.unexpected_keys, load_result.unexpected_keys
assert all(key.startswith('classifier.') for key in load_result.missing_keys), load_result.missing_keys
load_result

In [None]:
# Freeze every weight currently in the model; the layers created below are new
# and therefore stay trainable.
for param in model.parameters():
    param.requires_grad = False

# The stock head is Linear(in_features=1792, out_features=1000, bias=True).
# Swap it for a small MLP that ends in two outputs (one per class).
model.classifier = nn.Sequential(
    nn.Linear(1792, 625),    # 1792 = in_features of the original classifier
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(625, 256),
    nn.ReLU(),
    nn.Linear(256, 2),
)

model
# The printed model should now end with:
# (classifier): Sequential(
#    (0): Linear(in_features=1792, out_features=625, bias=True)
#    (1): ReLU()
#    (2): Dropout(p=0.3, inplace=False)
#    (3): Linear(in_features=625, out_features=256, bias=True)
#    (4): ReLU()
#    (5): Linear(in_features=256, out_features=2, bias=True)
#  )

In [None]:
from torchsummary import  summary
# Place the model on the selected device (GPU or CPU), then print a
# layer-by-layer summary for a 3x224x224 input.
model.to(device)
summary(model, (3, 224, 224))

In [None]:
class MyTrainer():
    """Minimal train/validate loop that checkpoints the model with the best validation loss.

    The methods rely on the notebook-level names ``device``, ``accuracy`` and ``tqdm``.
    """

    def __init__(self,criterion = None,optimizer = None,schedular = None):
        """Store the training components.

        Args:
            criterion: Loss callable, invoked as ``criterion(logits, labels)``.
            optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called once per batch.
            schedular: Stored on the instance; no method of this class uses it.
        """
        self.criterion = criterion
        self.optimizer = optimizer
        self.schedular = schedular

    def train_batch_loop(self,model,trainloader):
        """Run one optimisation pass over ``trainloader``.

        Returns:
            ``(loss, accuracy)``: the per-batch values averaged over the number
            of batches (not weighted by batch size).
        """
        train_loss = 0.0
        train_acc = 0.0

        for images,labels in tqdm(trainloader):

            # Move the batch to the selected compute device.
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images)
            loss = self.criterion(logits,labels)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            train_loss += loss.item()
            train_acc += accuracy(logits,labels)

        return train_loss / len(trainloader), train_acc / len(trainloader)

    def valid_batch_loop(self,model,validloader):
        """Evaluate ``model`` on ``validloader`` without updating any weights.

        Returns:
            ``(loss, accuracy)``: the per-batch values averaged over the number
            of batches (not weighted by batch size).
        """
        valid_loss = 0.0
        valid_acc = 0.0

        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for images,labels in tqdm(validloader):

                # Move the batch to the selected compute device.
                images = images.to(device)
                labels = labels.to(device)

                logits = model(images)
                loss = self.criterion(logits,labels)

                valid_loss += loss.item()
                valid_acc += accuracy(logits,labels)

        return valid_loss / len(validloader), valid_acc / len(validloader)

    def fit(self,model,trainloader,validloader,epochs):
        """Train for ``epochs`` epochs, saving the weights to ``saved.pt`` whenever
        the validation loss is at least as good as the best seen so far.

        Args:
            model: Module to train (updated in place).
            trainloader: Iterable of ``(images, labels)`` training batches.
            validloader: Iterable of ``(images, labels)`` validation batches.
            epochs: Number of passes over ``trainloader``.
        """
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        valid_min_loss = np.inf

        for i in range(epochs):

            model.train()  # training mode (e.g. dropout active)
            avg_train_loss, avg_train_acc = self.train_batch_loop(model,trainloader)

            model.eval()   # evaluation mode (e.g. dropout disabled)
            avg_valid_loss, avg_valid_acc = self.valid_batch_loop(model,validloader)

            if avg_valid_loss <= valid_min_loss :
                print("Valid_loss decreased {} --> {}".format(valid_min_loss,avg_valid_loss))
                torch.save(model.state_dict(),'saved.pt')
                valid_min_loss = avg_valid_loss

            print("Epoch : {} Train Loss : {:.6f} Train Acc : {:.6f}".format(i+1, avg_train_loss, avg_train_acc))
            print("Epoch : {} Valid Loss : {:.6f} Valid Acc : {:.6f}".format(i+1, avg_valid_loss, avg_valid_acc))

In [None]:
# Cross-entropy loss on the class logits, optimised with Adam at the configured
# learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG.lr)

trainer = MyTrainer(criterion=criterion, optimizer=optimizer)
trainer.fit(model, trainloader, validationloader, CFG.epochs)

In [None]:
# Restore the best weights written by MyTrainer.fit. They are saved under the
# relative name 'saved.pt', so load them by the same name instead of assuming
# the working directory is /content. map_location places the tensors on the
# device in use.
model.load_state_dict(torch.load('saved.pt', map_location=device))
model.eval()

# avg_test_loss, avg_test_acc = trainer.valid_batch_loop(model,testloader)


# print("Test Loss : {}".format(avg_test_loss))
# print("Test Acc : {}".format(avg_test_acc))

In [None]:
# Label index of the first test sample.
testset[0][1]

In [None]:
import torch.nn.functional as F

# Predicted class index for one test image (index 15).
# The stray `for i, _ in testset:` header was removed: it had no indented body,
# so the cell could not even be parsed.
image,label = testset[15]

ps = model(image.to(device).unsqueeze(0))
ps = F.softmax(ps,dim = 1)
ps.argmax().detach().cpu()
# view_classify(image,ps,label)

In [None]:
import torch.nn.functional as F

# Classify test image 1 and plot its class probabilities next to the ground truth.
image, label = testset[1]

batch = image.unsqueeze(0).to(device)   # add a leading batch axis for the model
ps = F.softmax(model(batch), dim=1)

view_classify(image, ps, label)

In [None]:
import torch.nn.functional as F

# Classify test image 14 and plot its class probabilities next to the ground truth.
image, label = testset[14]

batch = image.unsqueeze(0).to(device)   # add a leading batch axis for the model
ps = F.softmax(model(batch), dim=1)

view_classify(image, ps, label)

In [None]:
import torch.nn.functional as F

# Build the submission rows: a header plus one "<file name>,<predicted disease>"
# line per test image, in dataset order.
mapper = {0:'Cardiomegaly', 1:'No Finding'}
strs = ["imageID,disease"]
# Inference only: no_grad avoids building an autograd graph for every image.
with torch.no_grad():
    for i in range(len(testset)):
        image,_ = testset[i]
        filename = testset.samples[i][0]

        ps = model(image.to(device).unsqueeze(0))
        ps = F.softmax(ps,dim = 1)
        # basename() instead of splitting on '/' by hand; item() yields the plain int key.
        strs.append(os.path.basename(filename) + "," + mapper[ps.argmax().item()])

# view_classify(image,ps,label)

In [None]:
# Write the submission file. The context manager closes the handle, so the
# content is flushed to disk before any later cell reads or uploads it.
with open("sample_submission.csv", 'w') as submission_file:
    submission_file.write("\n".join(strs).strip())

In [None]:
# List the working directory (e.g. to confirm sample_submission.csv was written).
!ls

In [None]:
# Display the configured training-data root.
CFG.train_path

In [None]:
# Display the test set's sample list; element [0] of each entry is the file
# path (this is what the submission cell reads the file name from).
testset.samples