# In [None]:
import random
import os
import pandas as pd
import numpy as np

from PIL import Image
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
import torchvision
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
from tqdm import tqdm_notebook
import torch.optim as optim
from tqdm import tqdm

def imgRead(in_path):
    """Load the four per-channel PNGs of one sample and merge them into one image.

    Args:
        in_path: Path prefix of the sample. The suffixes ``_red.png``,
            ``_green.png``, ``_blue.png`` and ``_yellow.png`` are appended
            to it to locate the individual channel files.

    Returns:
        A ``PIL.Image.Image`` built from the four channels stacked along the
        last axis in the order red, green, blue, yellow.
    """
    channels = []
    for color in ('red', 'green', 'blue', 'yellow'):
        # Open each file in a context manager so its handle is released as
        # soon as the pixel data has been copied into a NumPy array.
        with Image.open(in_path + '_' + color + '.png') as channel_img:
            channels.append(np.array(channel_img))
    # presumably every PNG is single-channel 8-bit, so the stacked array is
    # (H, W, 4) and Pillow treats it as an RGBA image -- TODO confirm
    return Image.fromarray(np.stack(channels, -1))

class ProteinDataset(Dataset):
    """Dataset of four-channel protein images with optional multi-label targets.

    Args:
        df: DataFrame with an ``Id`` column holding image name prefixes. When
            a ``target_vec`` column is present, its per-row lists are stored
            as float32 label vectors.
        transform: Callable applied to the image returned by ``imgRead``.
        img_dir: Directory that contains the channel PNGs. Defaults to the
            training image directory that used to be hard-coded here.
    """

    def __init__(self, df, transform, img_dir='/Users/csh/Desktop/kaggle/input/train/'):
        self.imgnames = df['Id'].tolist()
        if 'target_vec' in df.columns:
            self.labels = np.array(df['target_vec'].values.tolist(), dtype=np.float32)
        else:
            # No targets available: items are returned without a label.
            self.labels = None
        self.transform = transform
        self.img_dir = img_dir

    def __len__(self):
        """Return the number of samples."""
        return len(self.imgnames)

    def __getitem__(self, idx):
        """Return ``(image, label)`` when labels exist, otherwise just ``image``."""
        # os.path.join works whether or not img_dir ends with a separator.
        image = imgRead(os.path.join(self.img_dir, self.imgnames[idx]))
        image = self.transform(image)
        if self.labels is None:
            return image
        return image, torch.from_numpy(self.labels[idx])
class ProteinTestDataset(Dataset):
    """Dataset of four-channel protein images read from the test directory.

    Args:
        df: DataFrame with an ``Id`` column holding image name prefixes. When
            a ``target_vec`` column is present, its per-row lists are stored
            as float32 label vectors.
        transform: Callable applied to the image returned by ``imgRead``.
        img_dir: Directory that contains the channel PNGs. Defaults to the
            test image directory that used to be hard-coded here.
    """

    def __init__(self, df, transform, img_dir='/Users/csh/Desktop/kaggle/input/test/'):
        self.imgnames = df['Id'].tolist()
        if 'target_vec' in df.columns:
            self.labels = np.array(df['target_vec'].values.tolist(), dtype=np.float32)
        else:
            # No targets available: items carry the image id instead.
            self.labels = None
        self.transform = transform
        self.img_dir = img_dir

    def __len__(self):
        """Return the number of samples."""
        return len(self.imgnames)

    def __getitem__(self, idx):
        """Return ``(image, label)`` when labels exist, else ``(image, image_id)``."""
        name = self.imgnames[idx]
        # os.path.join works whether or not img_dir ends with a separator.
        image = self.transform(imgRead(os.path.join(self.img_dir, name)))
        if self.labels is not None:
            return image, torch.from_numpy(self.labels[idx])
        # The id is returned so predictions can be matched back to samples.
        return image, name
    
def fetch_dataloader(types, data_dir):
    """Build the requested DataLoaders for the protein image data.

    Args:
        types: Iterable containing any of ``'train'``, ``'val'``, ``'test'``.
        data_dir: Kept for interface compatibility. It is currently not used
            by this function; the dataset classes determine where images are
            read from.

    Returns:
        dict mapping each requested split name to its ``DataLoader``.

    Raises:
        AssertionError: If ``types`` contains an unknown split name.
    """
    assert set(types) <= set(
        ['train', 'val', 'test']), "data types have to be among {'train', 'val', 'test'}"

    requested = set(types)

    # NOTE(review): ColorJitter's hue/saturation are colour-image operations;
    # confirm they behave as intended on the 4-channel images from imgRead.
    train_transformer = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomRotation(20.0),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.00505, 0.00331, 0.00344, 0.00519],
                             std=[0.10038, 0.08131, 0.08284, 0.10179])
    ])

    # NOTE(review): unlike the training pipeline there is no Resize here, so
    # evaluation runs at the images' native resolution -- confirm intended.
    eval_transformer = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.00505, 0.00331, 0.00344, 0.00519],
                             std=[0.10038, 0.08131, 0.08284, 0.10179])
    ])

    if 'test' in requested:
        test_df = pd.read_csv('/Users/csh/Desktop/kaggle/input/sample_submission.csv')

    # The validation split is carved out of train.csv, so the frame has to be
    # prepared when either 'train' or 'val' is requested.
    if requested & {'train', 'val'}:
        train_df = pd.read_csv('/Users/csh/Desktop/kaggle/input/train.csv')

        train_df['target_list'] = train_df['Target'].map(
            lambda x: [int(a) for a in x.split(' ')])

        # create a categorical (multi-hot) vector over the 28 classes
        train_df['target_vec'] = train_df['target_list'].map(
            lambda ck: [int(i in ck) for i in range(28)])

        raw_train_df, valid_df = train_test_split(
            train_df,
            test_size=0.3,
            # hack to make stratification work: stratify on one randomly
            # chosen label per row (or on 27 whenever the row contains it)
            stratify=train_df['target_list'].map(
                lambda x: np.random.choice(x) if 27 not in x else 27))

        # balance the classes: draw 1000 rows (with replacement) per label
        out_df_list = []
        for k in range(28):
            keep_rows = raw_train_df['target_list'].map(lambda x: k in x)
            out_df_list.append(raw_train_df[keep_rows].sample(1000, replace=True))
        train_df = pd.concat(out_df_list, ignore_index=True)

    dataloaders = {}
    for split in requested:
        # use the train_transformer if training data, else use eval_transformer without random flip
        if split == 'train':
            dl = DataLoader(ProteinDataset(train_df, train_transformer),
                            batch_size=36,
                            shuffle=True,
                            num_workers=4)
        elif split == 'val':
            # Validation rows come from train.csv, so their images live in the
            # training directory: use ProteinDataset, not ProteinTestDataset.
            dl = DataLoader(ProteinDataset(valid_df, eval_transformer),
                            batch_size=4,
                            shuffle=False,
                            num_workers=0)
        else:
            dl = DataLoader(ProteinTestDataset(test_df, eval_transformer),
                            batch_size=4,
                            shuffle=False,
                            num_workers=0)
        dataloaders[split] = dl

    return dataloaders


if __name__ == "__main__":
    # Script entry point: train a 4-channel ResNet-18 for one epoch, report
    # the summed validation loss, then write a submission file with the
    # predicted label indices for every test image.
    dataloaders = fetch_dataloader(['train', 'val'], '/Users/csh/Desktop/kaggle/input/train')
    train_dl = dataloaders['train']
    val_dl = dataloaders['val']
    testloaders = fetch_dataloader(['test'], '/Users/csh/Desktop/kaggle/input/test')
    test_dl = testloaders['test']

    # Fall back to the CPU when no CUDA device is available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net = torchvision.models.resnet18()
    # Replace the stem so the network accepts 4 input channels, and the head
    # so it emits one logit per class (28 classes).
    net.conv1 = nn.Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    net.fc = nn.Linear(512, 28)
    # The model must live on the same device as the batches fed to it.
    net = net.to(device)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), lr=5e-4)

    for epoch in range(1):
        net.train()
        running_loss = 0.0
        with tqdm(total=len(train_dl)) as t:
            for i, (train_batch, labels_batch) in enumerate(train_dl):
                train_batch = train_batch.to(device)
                labels_batch = labels_batch.to(device)
                optimizer.zero_grad()
                output_batch = net(train_batch)
                loss = loss_fn(output_batch, labels_batch)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                if i % 100 == 99:
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 100))
                    running_loss = 0.0
                    # Checkpoint at the logging interval rather than after
                    # every batch, which serialised the whole model each step.
                    torch.save(net, '/Users/csh/Desktop/dna2.pt')

                t.update()

    # Validation: no gradient tracking, layers in inference mode.
    net.eval()
    running_loss = 0.0
    with torch.no_grad(), tqdm(total=len(val_dl)) as t:
        for data_batch, labels_batch in val_dl:
            data_batch = data_batch.to(device)
            labels_batch = labels_batch.to(device)
            output_batch = net(data_batch)
            loss = loss_fn(output_batch, labels_batch)
            running_loss += loss.item()
            t.update()
    # Sum of the per-batch validation losses.
    print(running_loss)

    print('Finished Training')

    torch.save(net, '/Users/csh/Desktop/dna2.pt')

    out = []
    with torch.no_grad():
        # Each test batch is (images, ids); only the images go to the model.
        for data_batch, _ in tqdm(test_dl):
            data_batch = data_batch.to(device)

            output_batch = net(data_batch)

            output_batch = output_batch.detach().cpu().numpy()
            # Threshold is applied to raw logits (not probabilities).
            output_batch = (output_batch > -1.0).astype(np.int32)
            for row in output_batch:
                # Space-separated indices of the predicted classes.
                out.append(' '.join(str(v) for v in np.nonzero(row)[0].tolist()))

    # NOTE(review): this path is relative while fetch_dataloader reads
    # sample_submission.csv from an absolute path -- confirm both refer to
    # the same file so row order matches the (unshuffled) test loader.
    test_df = pd.read_csv('sample_submission.csv')
    test_df['Predicted'] = out
    test_df.to_csv('dna_submission.csv', index=False)
