In [170]:
import numpy as np
import pandas as pd 
import cv2
import os
import tqdm
import glob

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms


class OralCancerDataset(Dataset):
    """__init__ and __len__ functions are the same as in TorchvisionDataset"""

    def __init__(self, path_to_images, path_to_csv = None, transform = None):
        
        # Passing the path to the train csv file reads the data from the csv with the labels
        # If None is passes insted only the images in the image folder is loaded (wich is useful for the test set)
        
        self.path_to_images = path_to_images
        self.path_to_csv = path_to_csv
        
        if self.path_to_csv is not None:
            self.df = pd.read_csv(self.path_to_csv)

        self.transform = transform
    
    def __len__(self):
        if self.path_to_csv:
            return len(self.df)
        else:
            return len(glob.glob(self.path_to_images + '/*.jpg'))
    
    def __getitem__(self, idx):
        
        if self.path_to_csv:
            data = self.df.iloc[idx]
            image = cv2.imread(os.path.join(self.path_to_images, data['Name']), -1)
            label = data['Diagnosis']
            
            # You can input torchvision (or other) transforms and directly augment the data
            if self.transform:
                image = self.transform(image)
            # ..
            
            return image, label
            
        else:
            name = 'image_' + str(idx) + '.jpg'
            image = cv2.imread(os.path.join(self.path_to_images, name), -1)
            
            return image, name

In [171]:
path_to_csv = './input/cancer-classification-challenge-2023/train.csv'
df = pd.read_csv(path_to_csv)
df.head()

Unnamed: 0,Name,Diagnosis
0,pat_009_ind_5214.jpg,1
1,pat_009_ind_5365.jpg,1
2,pat_009_ind_84.jpg,1
3,pat_009_ind_1799.jpg,1
4,pat_009_ind_7109.jpg,1


In [172]:
df['PatID'] = df['Name'].str[:7]
df.head()

Unnamed: 0,Name,Diagnosis,PatID
0,pat_009_ind_5214.jpg,1,pat_009
1,pat_009_ind_5365.jpg,1,pat_009
2,pat_009_ind_84.jpg,1,pat_009
3,pat_009_ind_1799.jpg,1,pat_009
4,pat_009_ind_7109.jpg,1,pat_009


In [173]:
df['PatID'].unique()

array(['pat_009', 'pat_025', 'pat_053', 'pat_063', 'pat_067', 'pat_071',
       'pat_077', 'pat_081', 'pat_086', 'pat_096'], dtype=object)

In [174]:
df.groupby(['PatID']).agg({'Name':'count'}).reset_index()

Unnamed: 0,PatID,Name
0,pat_009,8000
1,pat_025,7684
2,pat_053,5282
3,pat_063,8000
4,pat_067,8000
5,pat_071,8000
6,pat_077,8000
7,pat_081,8000
8,pat_086,8000
9,pat_096,4453


In [175]:
df1 = df.groupby(['PatID', 'Diagnosis']).agg({'Name':'count'}).reset_index()
df1

Unnamed: 0,PatID,Diagnosis,Name
0,pat_009,1,8000
1,pat_025,0,7684
2,pat_053,1,5282
3,pat_063,0,8000
4,pat_067,0,8000
5,pat_071,0,8000
6,pat_077,0,8000
7,pat_081,0,8000
8,pat_086,1,8000
9,pat_096,1,4453


In [176]:
df1[~df1['PatID'].str.contains("pat_025|pat_096")]

Unnamed: 0,PatID,Diagnosis,Name
0,pat_009,1,8000
2,pat_053,1,5282
3,pat_063,0,8000
4,pat_067,0,8000
5,pat_071,0,8000
6,pat_077,0,8000
7,pat_081,0,8000
8,pat_086,1,8000


In [177]:
val_df = df[df['Name'].str.contains("pat_025|pat_096")][['Name','Diagnosis']].copy()

path_to_valcsv = './input/cancer-classification-challenge-2023/val_label.csv'
val_df.to_csv(path_to_valcsv,index=False)

val_df.head()

Unnamed: 0,Name,Diagnosis
8000,pat_025_ind_5214.jpg,0
8001,pat_025_ind_5365.jpg,0
8002,pat_025_ind_84.jpg,0
8003,pat_025_ind_1799.jpg,0
8004,pat_025_ind_7109.jpg,0


In [178]:
train_df = df[~df['Name'].str.contains("pat_025|pat_096")][['Name','Diagnosis']].copy()

path_to_traincsv = './input/cancer-classification-challenge-2023/train_label.csv'
train_df.to_csv(path_to_traincsv,index=False)

train_df.head()

Unnamed: 0,Name,Diagnosis
0,pat_009_ind_5214.jpg,1
1,pat_009_ind_5365.jpg,1
2,pat_009_ind_84.jpg,1
3,pat_009_ind_1799.jpg,1
4,pat_009_ind_7109.jpg,1


In [179]:
val_df.groupby(['Diagnosis']).agg({'Name':'count'}).reset_index()

Unnamed: 0,Diagnosis,Name
0,0,7684
1,1,4453


In [180]:
train_df.groupby(['Diagnosis']).agg({'Name':'count'}).reset_index()

Unnamed: 0,Diagnosis,Name
0,0,40000
1,1,21282


In [220]:
transform = transforms.Compose([  
            transforms.ToPILImage(),          
            transforms.Resize(152),
            transforms.RandomCrop(128),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(
            brightness=0.4,
            contrast=0.4,
            saturation=0.4),
            transforms.ToTensor(),                   
            transforms.Normalize(                      
            mean=[0.485, 0.456, 0.406],                
            std=[0.229, 0.224, 0.225]                  
            )])

In [221]:
#path_to_csv = './input/cancer-classification-challenge-2023/train.csv'
path_to_train_images = './input/cancer-classification-challenge-2023/train'
path_to_val_images = './input/cancer-classification-challenge-2023/val'
path_to_test_images = './input/cancer-classification-challenge-2023/test'


#train_dataset = OralCancerDataset(path_to_train_images, path_to_csv)

train_dataset = OralCancerDataset(path_to_train_images, path_to_traincsv, transform = transform)
val_dataset = OralCancerDataset(path_to_val_images, path_to_valcsv, transform = transform)

test_dataset = OralCancerDataset(path_to_test_images, path_to_csv = None, transform = transform)

dataloaders = {
    'train': DataLoader(train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0), #4 * (1+torch.cuda.device_count()))

    'val': DataLoader(val_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0), #4 * (1+torch.cuda.device_count()))

    'test': DataLoader(test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0) #4 * (1+torch.cuda.device_count()))
}

In [222]:
print(len(dataloaders['train']))
print(len(dataloaders['val']))
print(len(dataloaders['test']))

1916
380
2080


In [223]:
for i in tqdm.tqdm(dataloaders['train']):
    print(i)
    break

  0%|          | 0/1916 [00:00<?, ?it/s]

[tensor([[[[ 2.0777,  2.0777,  2.0263,  ...,  1.3242,  1.3927,  1.3927],
          [ 1.9920,  1.9749,  1.8893,  ...,  1.2899,  1.3242,  1.3242],
          [ 1.8893,  1.8550,  1.7865,  ...,  1.2043,  1.2214,  1.2214],
          ...,
          [ 1.7009,  1.7009,  1.6838,  ...,  1.7694,  1.7009,  1.6667],
          [ 1.8037,  1.7865,  1.7865,  ...,  1.7865,  1.7865,  1.7865],
          [ 1.9407,  1.9407,  1.9064,  ...,  1.8893,  1.9064,  1.8893]],

         [[ 2.2010,  2.1660,  2.1134,  ...,  1.3256,  1.3782,  1.3782],
          [ 2.0784,  2.0609,  1.9734,  ...,  1.2731,  1.3256,  1.3256],
          [ 1.9734,  1.9384,  1.8683,  ...,  1.1856,  1.2206,  1.2206],
          ...,
          [ 1.8508,  1.8508,  1.8333,  ...,  1.7633,  1.7108,  1.6583],
          [ 1.9384,  1.9384,  1.9209,  ...,  1.7808,  1.7808,  1.7808],
          [ 2.0784,  2.0784,  2.0609,  ...,  1.8683,  1.9209,  1.8683]],

         [[ 2.2914,  2.2914,  2.2391,  ...,  1.2108,  1.2980,  1.2980],
          [ 2.2217,  2.1868, 




In [224]:
#device = torch.device("cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [233]:
#import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

### Resnet50
from torchvision import models

resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)

# Freeze training for all “features” layers
for name, param in resnet50.named_parameters():
    if("bn" not in name):
        param.requires_grad = False    

for param in resnet50.layer4.parameters():
    param.requires_grad = True

n_inputs = resnet50.fc.in_features
# add last linear layer (n_inputs -> output of resnet layer)
# new layers automatically have requires_grad = True
resnet50.fc = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, 512),
    torch.nn.ReLU(inplace=True),
    #torch.nn.Dropout(inplace=True),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(inplace=True),
    #torch.nn.Dropout(inplace=True),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(inplace=True),
    torch.nn.Linear(128, 2))
# if GPU is available, move the model to GPU
if torch.cuda.is_available:
    #resnet50 = resnet50.to(device)
    resnet50 = resnet50.cuda()
# specify loss function (categorical cross-entropy)
criterion = torch.nn.CrossEntropyLoss()
#criterion = torch.nn.BCELoss()

# specify optimizer (stochastic gradient descent) and learning r
#rateoptimizer = torch.optim.Adam(resnet50.fc.parameters(), lr=1e-3)
rateoptimizer = torch.optim.Adam(resnet50.fc.parameters(), lr=1e-5)

#rateoptimizer = torch.optim.Adam(resnet50.fc.parameters(), lr=1e-7)
#rateoptimizer = torch.optim.SGD(resnet50.fc.parameters(), lr=1e-3)

In [234]:
img_datasize = {'train':61282.0, 'val':12137.0}

In [235]:
def train_model(model, criterion, optimizer, num_epochs=3):
    bestval_acc = 0.0
    
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                #inputs = inputs.to(device)
                #inputs = inputs.permute(0,3,1,2).type(torch.FloatTensor).to(device) 
                inputs = inputs.permute(0,1,2,3).type(torch.cuda.FloatTensor).to(device)
                labels = labels.to(device)
                #print("shape: ",inputs.shape)
                

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / img_datasize[phase]
            epoch_acc = running_corrects.double() / img_datasize[phase]


            print('{} loss: {:.4f}, acc: {:.4f}'.format(phase,
                                                        epoch_loss,
                                                        epoch_acc))
            
            if phase=='val':
                if epoch_acc > bestval_acc:
                    bestval_acc = epoch_acc
                    print("Best model accuracy, saving the best model")
                    torch.save(model.state_dict(), 'models/pytorch/modeltrain_weights_Adam.h5')
    return model

In [236]:
resnet50model_trained = train_model(resnet50, criterion, rateoptimizer, num_epochs=50)

Epoch 1/50
----------
train loss: 0.5964, acc: 0.6837
val loss: 0.7124, acc: 0.5494
Best model accuracy, saving the best model
Epoch 2/50
----------
train loss: 0.5297, acc: 0.7387
val loss: 0.7500, acc: 0.5269
Epoch 3/50
----------
train loss: 0.5147, acc: 0.7493
val loss: 0.7591, acc: 0.5241
Epoch 4/50
----------
train loss: 0.5075, acc: 0.7541
val loss: 0.8088, acc: 0.5000
Epoch 5/50
----------
train loss: 0.5018, acc: 0.7577
val loss: 0.7570, acc: 0.5304
Epoch 6/50
----------
train loss: 0.5000, acc: 0.7609
val loss: 0.8185, acc: 0.5086
Epoch 7/50
----------
train loss: 0.4973, acc: 0.7623
val loss: 0.7997, acc: 0.5151
Epoch 8/50
----------
train loss: 0.4968, acc: 0.7613
val loss: 0.8400, acc: 0.4904
Epoch 9/50
----------
train loss: 0.4971, acc: 0.7607
val loss: 0.8035, acc: 0.5178
Epoch 10/50
----------
train loss: 0.4933, acc: 0.7634
val loss: 0.8265, acc: 0.5109
Epoch 11/50
----------
train loss: 0.4905, acc: 0.7633
val loss: 0.8144, acc: 0.5134
Epoch 12/50
----------
train lo

In [30]:
torch.save(resnet50model_trained.state_dict(), 'models/pytorch/weights.h5')

In [31]:
### Test Prediction
test_df={'Name':[], 'Diagnosis':[]}
i=0
with torch.no_grad():
    correct = 0
    total = 0
    for images, names in dataloaders['test']:
        images = images.to(device)

        images=images.permute(0,3,1,2)
        outputs = resnet50model_trained(images.float()).cpu()
        _, predicted = torch.max(outputs.data, 1)

        test_df['Name'].extend(names)
        test_df['Diagnosis'].extend(predicted.detach().numpy())
        i+=1
        print('Test images {}/{} done.'.format(i*32,len(dataloaders['test'])*32),end='\r') 

test_preds = pd.DataFrame(test_df)
test_filename='test_preds.csv'
#model_path="Model/final_model.pt"
test_preds.to_csv(test_filename, index = False)
print("File {} saved successfully!".format(test_filename))
#torch.save(model,model_path)
#print("Model saved successfully!\nPath: {}".format(model_path))

File test_preds.csv saved successfully!


In [22]:
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2).to(device)
# Freeze training for all “features” layers
for param in model.parameters():
    param.requires_grad = False
n_inputs = model.fc.in_features
# add last linear layer (n_inputs -> 3 cancer classes)
# new layers automatically have requires_grad = True
model.fc = torch.nn.Sequential(
    torch.nn.Linear(n_inputs, 128),
    torch.nn.ReLU(inplace=True),
    torch.nn.Linear(128, 2))
# if GPU is available, move the model to GPU
if torch.cuda.is_available:
    model = model.cuda()
model.load_state_dict(torch.load('models/pytorch/modeltrain_weights_SGD.h5'))

<All keys matched successfully>

In [23]:
### Test Prediction
test_df={'Name':[], 'Diagnosis':[]}
i=0
with torch.no_grad():
    correct = 0
    total = 0
    for images, names in dataloaders['test']:
        images = images.to(device)

        images=images.permute(0,3,1,2)
        outputs = model(images.float()).cpu()
        _, predicted = torch.max(outputs.data, 1)

        test_df['Name'].extend(names)
        test_df['Diagnosis'].extend(predicted.detach().numpy())
        i+=1
        print('Test images {}/{} done.'.format(i*32,len(dataloaders['test'])*32),end='\r') 

test_preds = pd.DataFrame(test_df)
test_filename='test_preds1.csv'
#model_path="Model/final_model.pt"
test_preds.to_csv(test_filename, index = False)
print("File {} saved successfully!".format(test_filename))
#torch.save(model,model_path)
#print("Model saved successfully!\nPath: {}".format(model_path))

File test_preds1.csv saved successfully!
