### Adapted from the PyTorch transfer learning [tutorial](https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms, utils
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import numpy as np
import time
import os
import copy
import torch.nn.functional as F
from PIL import Image, ExifTags
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
class Wheat(Dataset):
    """In-memory image dataset that yields ``(transformed_image, target)`` pairs.

    Attributes:
        imgs: indexable collection of image arrays; each item is passed to
            ``Image.fromarray``.
        gts: indexable collection of targets, read only when ``split_type``
            is not ``'test'``.
        split_type: split name; the value ``'test'`` disables target lookup.
        transform: callable applied to the PIL image before it is returned.
    """

    def __init__(self, imgs, gts, split_type, transform):
        self.imgs, self.gts = imgs, gts
        self.split_type, self.transform = split_type, transform

    def __len__(self):
        """Return the number of stored images."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Return the transformed image at *idx* together with its target."""
        raw = self.imgs[idx]
        # The test split carries no labels, so a constant 0 stands in for the target.
        target = 0 if self.split_type == 'test' else self.gts[idx]
        return self.transform(Image.fromarray(raw)), target

In [3]:
def read_dataset(imgs_path, lbl_path, img_size=224):
    """Load every image in *imgs_path* and split it into labelled and unlabelled sets.

    A file counts as labelled when the part of its name before the first ``.``
    appears in the ``UID`` column of the CSV at *lbl_path*; every other file
    is treated as a test image.

    Labelled images are written to the row of ``imgs_arr`` that matches their
    row in the CSV, so ``imgs_arr[k]``, ``gt[k]`` and ``lbl_quality[k]`` always
    describe the same sample.  ``os.listdir`` returns names in arbitrary
    order, so filling the array in directory order would pair images with the
    wrong labels.

    Args:
        imgs_path: directory containing the image files (and nothing else).
        lbl_path: CSV file with ``UID``, ``growth_stage`` and
            ``label_quality`` columns.
        img_size: side length, in pixels, of the square every image is
            resized to.

    Returns:
        A tuple ``(imgs_arr, test_imgs_arr, gt, lbl_quality, test_id_arr)``:
        ``imgs_arr`` is a uint8 array of shape ``(n_csv_rows, img_size,
        img_size, 3)`` (rows whose UID has no file stay all-zero);
        ``test_imgs_arr`` is the same layout for the unlabelled files, in
        directory order; ``gt`` is ``growth_stage`` as float32;
        ``lbl_quality`` is the ``label_quality`` column; ``test_id_arr``
        lists the IDs of the unlabelled files in the order of
        ``test_imgs_arr``.
    """
    lbl_df = pd.read_csv(lbl_path)
    gt = lbl_df['growth_stage'].to_numpy().astype(np.float32)
    lbl_quality = lbl_df['label_quality'].to_numpy()
    # UID -> CSV row position: O(1) membership test and the target slot in one lookup.
    row_of_uid = {uid: row for row, uid in enumerate(lbl_df['UID'].tolist())}

    imgs_fname_arr = os.listdir(imgs_path)
    img_ids = [fname.split('.')[0] for fname in imgs_fname_arr]
    # Count the unlabelled files directly instead of assuming every UID has a file.
    n_test = sum(img_id not in row_of_uid for img_id in img_ids)

    imgs_arr = np.zeros((lbl_df.shape[0], img_size, img_size, 3), dtype=np.uint8)
    test_imgs_arr = np.zeros((n_test, img_size, img_size, 3), dtype=np.uint8)
    test_id_arr = []

    for fname, img_id in zip(tqdm(imgs_fname_arr), img_ids):
        # The context manager closes the file handle as soon as the pixels are copied.
        with Image.open(os.path.join(imgs_path, fname)) as raw:
            # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS,
            # which was removed in Pillow 10.
            resized = raw.convert('RGB').resize((img_size, img_size), Image.LANCZOS)
            img = np.array(resized).astype(np.uint8)

        row = row_of_uid.get(img_id)
        if row is None:
            test_imgs_arr[len(test_id_arr)] = img
            test_id_arr.append(img_id)
        else:
            imgs_arr[row] = img

    return imgs_arr, test_imgs_arr, gt, lbl_quality, test_id_arr

In [4]:
# Replace both paths with the locations of your image folder and label CSV.
imgs_arr, test_imgs_arr, gt, lbl_quality, test_id_arr = read_dataset(
    imgs_path='Data/Images/',
    lbl_path='Data/Train.csv',
)

100%|███████████████████████████████████████████████████████████████████████████| 14253/14253 [02:16<00:00, 104.10it/s]


In [5]:
# Keep only the high-quality samples (label_quality == 2).  The right-hand
# side is evaluated in full before any name is rebound, so all three arrays
# are filtered with the same, still-unfiltered quality column.
imgs_arr, gt, lbl_quality = (
    imgs_arr[lbl_quality == 2],
    gt[lbl_quality == 2],
    lbl_quality[lbl_quality == 2],
)

In [6]:
def train_model(model, criterion, optimizer, scheduler, device, num_epochs=25,
                loaders=None):
    """Train *model* and return it loaded with its best validation weights.

    Every epoch runs a ``'train'`` pass (gradients enabled, optimizer stepped
    per batch, scheduler stepped once afterwards) followed by a ``'val'``
    pass (gradients disabled).  The weights of the epoch with the lowest
    sample-weighted mean validation loss are restored before returning.

    Args:
        model: network to optimise; called as ``model(inputs)``.
        criterion: loss called as ``criterion(outputs, labels.unsqueeze(1))``.
            The square root of its mean is printed as "RMSE", which is only
            a true RMSE for a mean-reduced squared-error criterion.
        optimizer: optimizer stepped after every training batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        device: device the batches are moved to.
        num_epochs: number of train/val epochs to run.
        loaders: optional mapping with ``'train'`` and ``'val'`` entries, each
            an iterable of ``(inputs, labels)`` batches.  When omitted, the
            module-level ``dataloaders`` mapping is used.

    Returns:
        The same *model* object, with the best validation weights loaded.

    Raises:
        ValueError: if a phase's loader yields no samples.
    """
    if loaders is None:
        # Fall back to the notebook-level mapping for existing callers.
        loaders = dataloaders

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()

            running_loss = 0.0
            seen = 0

            for inputs, labels in loaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Track history only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels.unsqueeze(1))

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Weight each batch's loss by its size so a short final batch
                # does not skew the epoch mean.
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                seen += batch_size

            if is_train:
                scheduler.step()

            if seen == 0:
                raise ValueError(
                    "the '{}' loader produced no samples".format(phase))

            # Divide by the samples actually seen rather than a separately
            # maintained size table.
            epoch_loss = running_loss / seen

            print('{} RMSE Loss: {:.4f}'.format(
                phase, np.sqrt(epoch_loss)))

            # Snapshot the weights whenever validation improves.
            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val RMSE Loss: {:.4f}'.format(np.sqrt(best_loss)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [7]:
# Hold out 20% of the samples for validation (80% train), stratified on the labels.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1234)

# n_splits=1, so the first split produced is the only one.
train_index, val_index = next(sss.split(imgs_arr, gt))

train_imgs_arr, train_gt = imgs_arr[train_index], gt[train_index]
val_imgs_arr, val_gt = imgs_arr[val_index], gt[val_index]

In [8]:
# Per-split preprocessing.  Only the training split is augmented (random
# horizontal flip); both splits are converted to tensors and normalised with
# the same per-channel mean/std (presumably the ImageNet statistics that
# torchvision's pretrained weights expect — confirm against the model docs).
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

image_datasets = {'train': Wheat(train_imgs_arr, train_gt, 'train', data_transforms['train']),
                  'val': Wheat(val_imgs_arr, val_gt, 'val', data_transforms['val'])}

# Shuffle only the training split; the order of validation batches does not
# affect the averaged validation loss.
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                             shuffle=(x == 'train'), num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

# Use the first GPU when one is present and fall back to the CPU otherwise,
# so the later `.to(device)` calls do not fail on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

True


In [None]:
# Start from a pretrained ResNet-18 and replace its final fully connected
# layer with a single-output linear layer, turning it into a scalar regressor.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(in_features=num_ftrs, out_features=1)
model_ft = model_ft.to(device)

# Mean squared error between the predicted and the labelled value.
criterion = nn.MSELoss()

# Every parameter of the network is optimised (no layers are frozen).
optimizer_ft = optim.SGD(params=model_ft.parameters(), lr=0.001, momentum=0.9)

# Multiply the learning rate by 0.1 every 7 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(
    model=model_ft,
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=exp_lr_scheduler,
    device=device,
    num_epochs=25,
)

Epoch 0/24
----------


In [None]:
def test(model, test_loader, device):
    """Run *model* over *test_loader* and return every output in one array.

    Args:
        model: network to evaluate; switched to eval mode and called as
            ``model(inputs)``.
        test_loader: iterable of ``(inputs, _)`` pairs; the second element
            is ignored.
        device: device each input batch is moved to before the forward pass.

    Returns:
        A NumPy array holding the per-batch outputs concatenated along
        axis 0, in loader order.
    """
    model.eval()
    # Inference only: no autograd graph is needed for any batch.
    with torch.no_grad():
        batch_preds = [
            model(batch.to(device)).detach().cpu().numpy()
            for batch, _ in test_loader
        ]
    return np.concatenate(batch_preds, axis=0)

In [None]:
# The unlabelled images go through the same (non-augmenting) transforms as the
# validation split.  shuffle=False keeps the predictions in the order of
# test_imgs_arr, and therefore of test_id_arr.
image_datasets['test'] = Wheat(
    imgs=test_imgs_arr,
    gts=None,
    split_type='test',
    transform=data_transforms['val'],
)
test_loader = torch.utils.data.DataLoader(
    dataset=image_datasets['test'],
    batch_size=4,
    shuffle=False,
    num_workers=16,
)
test_pred = test(model=model_ft, test_loader=test_loader, device=device)

In [None]:
# Fill the sample-submission template with the test image IDs and the
# flattened predictions, then write it out without the index column.
sub = pd.read_csv('SampleSubmission.csv').assign(
    UID=test_id_arr,
    growth_stage=test_pred.ravel().tolist(),
)
sub.to_csv('high_quality_data_resnet18_sub.csv', index=False)