## Imports + device + control randomness

In [1]:
import numpy as np
import random

import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

import matplotlib.pyplot as plt

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [3]:
# Floating-point dtype for tensors in this notebook.
# This is always a `torch.dtype`: the legacy `torch.cuda.FloatTensor` is a tensor
# *type*, not a dtype, so choosing between the two made `dtype` a different kind
# of object on GPU and CPU machines. Device placement is expressed separately
# through `device`.
dtype = torch.float
dtype

torch.float32

In [4]:
# Seed every random number generator the notebook relies on
# (Python, NumPy, PyTorch CPU and CUDA) so that reruns produce the same numbers.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

# Ask cuDNN for deterministic algorithms and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Load data + "Polish data" - my heuristic

In [5]:
def get_loader(train, batch_size=None):
    '''
    Build a shuffled DataLoader over MNIST.

    train      -- forwarded to datasets.MNIST as its `train` flag
    batch_size -- number of samples per batch; None means 1

    The data lives in the local 'mnist' directory (download=True is passed to
    torchvision) and every image goes through transforms.ToTensor().
    '''

    # A Dataset in PyTorch is an object that wraps the raw data and applies some preprocessing to it
    mnist = datasets.MNIST(
        'mnist',
        train=train,
        download=True,
        transform=transforms.ToTensor(),
    )

    effective_batch_size = 1 if batch_size is None else batch_size

    return torch.utils.data.DataLoader(
        mnist,
        batch_size=effective_batch_size,
        shuffle=True,
    )

# MNIST loaders with 64 samples per batch: `train` is built with train=True, `val` with train=False.
# NOTE(review): both names are reassigned further down (from polish_data) before
# any visible use -- confirm these two loaders are still needed.
train = get_loader(True, 64)
val = get_loader(False, 64)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [6]:
def polish_data(loader, batch_size=None, p=None):
    '''
    Resample single-sample batches to a requested class balance and re-batch them.

    Parameters
    ----------
    loader : iterable of (X, y)
        Every X must hold exactly one image that reshapes to (1, 28, 28) and
        every y exactly one label (int(y) is called on it). The iterable is
        traversed twice.
    batch_size : int or None
        Size of the produced batches. None -> a single batch that holds every
        kept sample. Ignored when p is None.
    p : sequence of float or None
        p[k] is the desired share of label k; the shares must be non-negative
        and sum to 1. Samples whose label is outside range(len(p)) are dropped.
        None -> `loader` is returned untouched (batch_size is NOT applied).

    Returns
    -------
    `loader` itself when p is None; otherwise a shuffled list of [X, y] batches
    with X of shape (batch_size, 1, 28, 28) and y a LongTensor of shape
    (batch_size,). Samples left over after the last full batch are dropped.

    Side effects: prints the kept sample count per label and consumes the
    global NumPy random state for shuffling.
    '''

    if p is None:
        # nothing to rebalance -> hand the data back unchanged
        return loader

    # Coerce to a float array so that lists/tuples work too (arithmetic on a
    # plain list would repeat it instead of scaling it).
    p = np.asarray(p, dtype=float)
    assert (
        p.ndim == 1
        and np.all(p >= 0)
        and np.abs(np.sum(p, axis=None) - 1) < 0.0001
    ), 'wrong p'
    n_classes = len(p)

    # count of each target
    cnt = [0] * n_classes
    for _, y in loader:
        y_ = int(y)
        if 0 <= y_ < n_classes:
            cnt[y_] += 1

    # Largest total size for which no class runs out of samples. Classes with
    # p == 0 need no samples, so they put no limit on it (and are skipped to
    # avoid dividing by zero).
    wanted = p > 0
    new_len = np.min(np.asarray(cnt, dtype=float)[wanted] / p[wanted], axis=None)

    # max_target_size for each target
    max_target_size = [int(i) for i in p * new_len]

    # maybe some heuristics to avoid ~zero max_targets_size

    # keep the first max_target_size[k] samples of every label k
    lists = [[] for _ in range(n_classes)]
    for X, y in loader:
        y_ = int(y)
        if 0 <= y_ < n_classes and len(lists[y_]) < max_target_size[y_]:
            lists[y_].append(torch.reshape(X, (1, 28, 28)))

    for label, samples in enumerate(lists):
        print(f'cnt_{label} =', len(samples))

    # unite and shuffle
    united = [[X, label] for label, samples in enumerate(lists) for X in samples]
    np.random.shuffle(united)

    if batch_size is None:
        batch_size = int(np.sum(max_target_size, axis=None))

    # group to batches
    all_batches = []
    batch_X, batch_y = [], []
    for X, y in united:
        batch_X.append(X)
        batch_y.append(y)
        if len(batch_X) == batch_size:
            all_batches.append([
                torch.stack(batch_X),
                torch.tensor(batch_y, dtype=torch.long),
            ])
            batch_X, batch_y = [], []
    # no 'cutted-batches': an incomplete trailing batch is discarded

    return all_batches

In [7]:
'''
MNIST has (by default) approximately equal amount of samples for each target ->
I need to create disbalanced dataset so t_ and v_ are disbalanced data from MNIST
'''
# Build two-class subsets (polish_data keeps only labels 0 and 1):
#   t_ -- from the train=True loader, resampled to 20% label 0 / 80% label 1
#   v_ -- from the train=False loader, resampled to 50% / 50%
# Both results are lists of batches that hold one sample each.
one = 1 # always 1: polish_data calls int(y) on every label, so each incoming batch must hold a single sample
t_ = polish_data(get_loader(True, batch_size=one), batch_size=one, p=np.array([0.2, 0.8]))
v_ = polish_data(get_loader(False, batch_size=one), batch_size=one, p=np.array([0.5, 0.5]))

cnt_0 = 1685
cnt_1 = 6742
cnt_0 = 980
cnt_1 = 980


In [8]:
'''
Split disbalanced to samples for training (p is None by default) so no transformations there
'''
# NOTE(review): with p=None polish_data returns its input unchanged, so
# batch_size=10 has no effect here -- `train` is t_ and `val` is v_, both still
# lists of one-sample batches. Confirm that batch size 1 is intended.
train = polish_data(t_, batch_size=10)
val = polish_data(v_, batch_size=10)

In [9]:
def accuracy(model, val):
    '''
    Fraction of samples whose arg-max prediction equals the label.

    model -- callable mapping a batch X to per-class scores of shape (batch, classes)
    val   -- iterable of (X, y) batches

    Inputs and labels are moved to the notebook-level `device` before use.
    Raises ZeroDivisionError when `val` yields no samples.
    '''
    total = 0
    correct = 0
    # Evaluation only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for X, y in val:
            preds = model(X.to(device)).argmax(dim=1)
            total += preds.shape[0]
            correct += (preds == y.to(device)).sum().item()
    return correct / total

## Basic fit

In [10]:
# Re-seed all generators right before the baseline run so it starts from a
# known random state.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

# Deterministic cuDNN algorithms, auto-tuner off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [11]:
# Baseline model: a single linear layer over the flattened 28x28 image (a linear
# classifier with two outputs). It is kept this simple on purpose: more complex
# models (especially with conv layers) converge too fast and wouldn't show the
# advantage of the heuristic.
model_base = nn.Sequential(
    nn.Flatten(),
    nn.Linear(28*28, 2)
).to(device)  # parameters must live on the same device the batches are moved to

optimizer = torch.optim.Adam(model_base.parameters(), lr=0.001)
# NOTE(review): nn.NLLLoss expects log-probabilities, but the model outputs raw
# linear scores (there is no LogSoftmax layer). Left unchanged so the recorded
# results stay comparable -- confirm whether nn.CrossEntropyLoss was intended.
criterion = nn.NLLLoss()

In [12]:
# Baseline training: 10 passes over `train`, one optimizer step per batch.
for epoch in range(10):
    for X, y in train:
        
        # clear gradients left over from the previous step
        optimizer.zero_grad()
        
        output = model_base(X.to(device))
        loss = criterion(output, y.to(device))
        loss.backward()
        
        optimizer.step()
    
    # after each pass: accuracy on the training batches, then on the validation batches
    print(accuracy(model_base, train), accuracy(model_base, val))

0.9621454847513943 0.9280612244897959
0.9635694790554171 0.9306122448979591
0.9635694790554171 0.9311224489795918
0.9641628100154266 0.9311224489795918
0.9642814762074285 0.9311224489795918
0.9642814762074285 0.9311224489795918
0.9642814762074285 0.9311224489795918
0.9642814762074285 0.9311224489795918
0.9642814762074285 0.9311224489795918
0.9642814762074285 0.9311224489795918


In [13]:
# Final validation accuracy of the baseline as a percentage, rounded to two decimals (rated "poor" by the author).
print(f"{round(accuracy(model_base, val) * 100, 2)}%")

93.11%


## My special heuristic

See the README file for a description of the heuristic.

In [14]:
# Re-seed all generators so the heuristic run starts from the same random
# state as the baseline run.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

# Deterministic cuDNN algorithms, auto-tuner off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [15]:
def target_preds_density(model, val, num_classes=2):
    '''
    Count how often each class is predicted on `val`.

    model       -- callable mapping a batch X to per-class scores of shape (batch, classes)
    val         -- iterable of (X, y) batches; the labels y are not used
    num_classes -- number of classes to count (defaults to the two used in this notebook)

    Returns a list of `num_classes` ints; entry k is the number of samples whose
    arg-max prediction is k. Inputs are moved to the notebook-level `device`.
    '''
    counts = [0] * num_classes
    # Prediction only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for X, _ in val:
            preds = model(X.to(device)).argmax(dim=1)
            # one transfer to plain Python ints per batch instead of indexing
            # the list with a tensor element by element
            for label in preds.tolist():
                counts[label] += 1

    return counts

In [16]:
# Model for the heuristic run: same architecture as the baseline (one linear
# layer over the flattened 28x28 image, two outputs), created on `device`.
model_heuristic = nn.Sequential(
    nn.Flatten(),
    nn.Linear(in_features=28 * 28, out_features=2),
).to(device)

# same optimizer and criterion settings, now bound to the new model's parameters
optimizer = torch.optim.Adam(params=model_heuristic.parameters(), lr=1e-3)
criterion = nn.NLLLoss()

In [17]:
# Heuristic training: before every epoch the training data is re-drawn from t_
# so that its class balance follows the share of each class among the model's
# predictions on the validation data after the previous epoch.
p = None  # first epoch: polish_data returns t_ unchanged (batch size 10 is not applied)
for epoch in range(10):
    train = polish_data(t_, 10, p)

    for X, y in train:
        # clear gradients left over from the previous step
        optimizer.zero_grad()

        output = model_heuristic(X.to(device))
        loss = criterion(output, y.to(device))
        loss.backward()

        optimizer.step()

    # accuracy on this epoch's training batches, then on the validation batches
    print(accuracy(model_heuristic, train), accuracy(model_heuristic, val))

    # share of each class (0 and 1) among the validation predictions;
    # it becomes the class balance of the next epoch's training data
    val_preds = target_preds_density(model_heuristic, val)
    p = np.asarray(val_preds) / np.sum(val_preds, axis=None)

0.9621454847513943 0.9280612244897959
cnt_0 = 1685
cnt_1 = 2251
0.9498727735368957 0.9566326530612245
cnt_0 = 1685
cnt_1 = 2005
0.9672086720867209 0.9739795918367347
cnt_0 = 1685
cnt_1 = 1870
0.9783098591549296 0.9826530612244898
cnt_0 = 1685
cnt_1 = 1806
0.9828080229226361 0.9887755102040816
cnt_0 = 1685
cnt_1 = 1762
0.9863372093023256 0.9923469387755102
cnt_0 = 1685
cnt_1 = 1737
0.9897660818713451 0.9933673469387755
cnt_0 = 1685
cnt_1 = 1723
0.9905882352941177 0.9948979591836735
cnt_0 = 1685
cnt_1 = 1712
0.9914454277286135 0.9954081632653061
cnt_0 = 1685
cnt_1 = 1709
0.9920353982300885 0.9959183673469387


In [18]:
# Final validation accuracy of the heuristic model as a percentage, rounded to two decimals (rated "strong" by the author).
print(f"{round(accuracy(model_heuristic, val) * 100, 2)}%")

99.59%
