In [94]:
import requests
from io import BytesIO
from PIL import Image
import os

from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import torch
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import f1_score

from DataHandlers import ImageDataset, InMemDataLoader

In [95]:
class Wrapper():
    """Glue between a pair-comparison model and the train/valid/test/submit data.

    The wrapped ``model`` is expected to expose ``train(loader)`` and
    ``predict(images1, images2)``; ``predict`` must return a tensor with one
    score per pair (scores above 0.5 are treated as "same" in ``fscore``).

    Attributes:
        model: the wrapped model.
        batch_size: batch size used for every ``DataLoader``.
        max_submit_id: number of rows in the submit CSV.
        datasets: ``ImageDataset`` per split ('train', 'valid', 'test', 'submit').
        loaders: ``DataLoader`` per split; only 'train' is shuffled.
    """

    def __init__(
                self, 
                model, 
                paths = None,
                transform = None,
                batch_size = 20,
            ):
        """Read the CSV files and build the datasets and loaders.

        Args:
            model: object with ``train(loader)`` and ``predict(images1, images2)``.
            paths: dict with the keys 'train' and 'submit' pointing at CSV files.
                ``None`` (default) means 'data/train.csv' and 'data/submit.csv'.
            transform: torchvision transform applied by ``ImageDataset``.
                ``None`` (default) resizes / center-crops to 200 px, converts
                to a tensor and normalizes every channel with mean 0.5, std 0.5.
            batch_size: batch size for all loaders.
        """
        self.model = model
        self.batch_size = batch_size

        # A None sentinel avoids sharing one mutable dict between all calls.
        if paths is None:
            paths = {'train': 'data/train.csv', 'submit': 'data/submit.csv'}

        if transform is None:
            transform = T.Compose([
                T.Resize(200),
                T.CenterCrop(200),
                T.ToTensor(),
                T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ])

        train = pd.read_csv(paths['train'])
        n = len(train)
        # 80% / 10% / 10% split taken in file order (rows are not shuffled here).
        train_end = int(n * 0.8)
        valid_end = int(n * 0.9)

        submit = pd.read_csv(paths['submit'])
        self.max_submit_id = len(submit)

        self.datasets = {
            'train'  : ImageDataset(train[:train_end], transform=transform),
            'valid'  : ImageDataset(train[train_end:valid_end], transform=transform),
            'test'   : ImageDataset(train[valid_end:], transform=transform),
            'submit' : ImageDataset(submit, transform=transform)
        }

        self.loaders = {
            dataset_type: DataLoader(dataset, self.batch_size, shuffle=(dataset_type=='train'))
            for dataset_type, dataset in self.datasets.items()
        }

    def train(self,):
        """Train the wrapped model on the 'train' loader, logging start and end."""
        print('Started training model')
        self.model.train(self.loaders['train'])
        print('Finished training model\n')

    def _f1(self, loader, sample_size):
        """Return the F1 score of the model on roughly ``sample_size`` items.

        Whole batches are consumed until at least ``sample_size`` items have
        been seen, so the evaluated count is rounded up to a full batch.
        Returns ``nan`` when the loader yields nothing.
        """
        preds, truth = [], []
        items_count = 0
        # Evaluation only: no autograd graph is needed.
        with torch.no_grad():
            for images1, images2, equal in loader:
                preds.append(self.model.predict(images1, images2))
                truth.append(equal)
                # Count the real batch length; the last batch may be short.
                items_count += len(equal)
                if items_count >= sample_size:
                    break

        if not preds:
            # torch.cat would raise on an empty list; there is nothing to score.
            return float('nan')

        preds = torch.cat(preds)
        truth = torch.cat(truth)

        # convert the predictions to binary labels
        preds_bin = (preds > 0.5).int()

        # compute F1 score
        return f1_score(truth.numpy(), preds_bin.numpy())

    def fscore(self, sample_size=100):
        """Print the F1 score on a sample of the train, valid and test loaders.

        Args:
            sample_size: minimum number of pairs evaluated per split (the
                actual count is rounded up to a whole batch, or is the whole
                split when it is smaller).

        The scores are ratios in [0, 1] and are printed as such (no '%').
        """
        print('Started calculating f-scores')
        print(f'Train      : {self._f1(self.loaders["train"], sample_size): .3f}')
        print(f'Validation : {self._f1(self.loaders["valid"], sample_size): .3f}')
        print(f'Test       : {self._f1(self.loaders["test"], sample_size): .3f}\n')

    def _test_loaders(self):
        """Show the first image pair of one batch from every loader.

        Visual sanity check: one row of images titled '<split>-1' / '<split>-2'.
        """
        to_show = {}
        for loader_type, loader in self.loaders.items():
            image1, image2, _ = next(iter(loader))
            to_show[loader_type + '-1'] = image1[0]
            to_show[loader_type + '-2'] = image2[0]

        # squeeze=False keeps a 2-D axes array regardless of the image count.
        fig, axs = plt.subplots(1, len(to_show), figsize=(15, 15), squeeze=False)

        for ax, (name, img) in zip(axs[0], to_show.items()):
            # channels-first -> channels-last, as imshow expects
            pixels = img.numpy().transpose(1, 2, 0)
            # imshow wants float data in [0, 1]. Normalized tensors contain
            # negative values, so rescale each image to its own min/max
            # instead of clipping (which would black out the darker pixels).
            lo, hi = pixels.min(), pixels.max()
            pixels = pixels - lo
            if hi > lo:
                pixels = pixels / (hi - lo)
            ax.imshow(pixels, cmap='gray')
            ax.set_title(name)
            ax.axis('off')

        plt.show()

    def save_test_preds(self, path='res.csv'):
        """Predict every pair of the 'submit' loader and write a CSV.

        The output has the columns 'ID' and 'is_same'. IDs without a
        prediction are filled with 0.

        Args:
            path: destination CSV file.
        """
        print(f'Started saving test predictions to {path}')
        ids = []
        preds = []

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for images1, images2, id_ in tqdm(self.loaders['submit']):
                preds.extend(self.model.predict(images1, images2))
                ids.extend(id_)

        # NOTE(review): the expected IDs run from 2 to len(submit) - 1; this
        # presumably mirrors the numbering used in the submit CSV - confirm.
        all_ids = pd.DataFrame({
            'ID': range(2, self.max_submit_id),
        })
        res = pd.DataFrame({
            'ID': [obj.item() for obj in ids],
            'is_same': [obj.item() for obj in preds]
        }).drop_duplicates()

        # Left join keeps every expected ID; missing predictions become 0.
        res = all_ids.merge(res, on='ID', how='left').fillna(0)
        res.to_csv(path, index=False)
        print(f'Saved test predictions to {path}\n')

In [98]:
class ThresholdClassifier():
    """Training-free baseline that compares two image batches pixel by pixel.

    A pair is labelled 1 ("same") when the mean squared difference between
    the two images is below ``threshold``, otherwise 0.
    """

    def __init__(self, threshold=0.015):
        """
        Args:
            threshold: upper bound (exclusive) on the mean squared pixel
                difference for a pair to be labelled 1. Defaults to 0.015.
        """
        self.threshold = threshold

    def train(self, loader):
        """No-op: the classifier has nothing to fit. ``loader`` is ignored."""
        pass

    def predict(self, images1, images2):
        """Label each pair of images as same (1) or different (0).

        Args:
            images1, images2: 4-D tensors of equal (or broadcastable) shape;
                the mean is taken over dimensions 1, 2 and 3, leaving one
                value per item along dimension 0.

        Returns:
            1-D integer (int64) tensor of 0/1 labels, one per pair.
        """
        delta = ((images1 - images2) ** 2).mean(dim=(1, 2, 3))
        return (delta < self.threshold).long()

In [100]:
# Baseline experiment: wrap the threshold classifier, run the (empty) training
# step and print F1 scores for samples of the train / validation / test loaders.
model = ThresholdClassifier()
wrapper = Wrapper(model)  # default CSV paths, transform and batch size
wrapper.train()  # ThresholdClassifier.train is a no-op; only the log lines are printed
wrapper.fscore(sample_size=100)
# Submission export is commented out; uncomment to write predictions
# (default path 'res.csv').
#wrapper.save_test_preds()

Started training model
Finished training model

Started calculating f-scores
Train      :  0.968%
Validation :  0.960%
Test       :  0.970%

