In [3]:
import sys

In [4]:
# !kaggle competitions download -c shopee-product-matching
# !unzip shopee-product-matching.zip

In [5]:
#!{sys.executable} -m pip uninstall timm -y

In [6]:
# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Visuals and CV2
import cv2

# albumentations for augs
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

from sklearn.model_selection import train_test_split

#torch
import torch
import timm
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

In [18]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

DIM = (224, 224)

NUM_WORKERS = 16
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 100
SEED = 2020
LR = 1e-3

device = torch.device('cuda')


################################################# MODEL ####################################################################

model_name = 'swin_small_patch4_window7_224' #efficientnet_b0-b7

################################################ Metric Loss and its params #######################################################
loss_module = 'arcface' #'cosface' #'adacos'
s = 30.0
m = 0.5 
ls_eps = 0.0
easy_margin = False


####################################### Scheduler and its params ############################################################
SCHEDULER = 'CosineAnnealingWarmRestarts' #'CosineAnnealingLR'
#SCHEDULER = 'ReduceLROnPlateau'
factor=0.2 # ReduceLROnPlateau
patience=3 # ReduceLROnPlateau
eps=1e-6 # ReduceLROnPlateau
T_max=10 # CosineAnnealingLR
T_0=7 # CosineAnnealingWarmRestarts
min_lr=1e-6

############################################## Model Params ###############################################################


In [8]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_torch(SEED)

In [9]:
class AverageMeter(object):
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
    
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [10]:
def fetch_scheduler(optimizer):
    if SCHEDULER =='ReduceLROnPlateau':
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=factor, patience=patience, verbose=True, eps=eps)
    elif SCHEDULER =='CosineAnnealingLR':
        scheduler = CosineAnnealingLR(optimizer, T_max=T_max, eta_min=min_lr, last_epoch=-1)
    elif SCHEDULER =='CosineAnnealingWarmRestarts':
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=1, eta_min=min_lr, last_epoch=-1)
    return scheduler

In [11]:
class FocalLoss(nn.modules.loss._WeightedLoss):
    def __init__(self, weight=None, gamma=2,reduction='mean'):
        super(FocalLoss, self).__init__(weight,reduction=reduction)
        self.gamma = gamma
        self.weight = weight #weight parameter will act as the alpha parameter to balance class weights

    def forward(self, input, target):
        ce_loss = F.cross_entropy(input, target,reduction=self.reduction,weight=self.weight)
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma * ce_loss).mean()
        return focal_loss

In [12]:
def fetch_loss():
    loss = nn.CrossEntropyLoss()
#     loss = FocalLoss()
    return loss

In [13]:
# def get_train_transforms():
#     return albumentations.Compose(
#         [   
#             albumentations.Resize(DIM[0],DIM[1],always_apply=True),
#             albumentations.HorizontalFlip(p=0.5),
#             albumentations.VerticalFlip(p=0.5),
#             albumentations.Rotate(limit=120, p=0.8),
#             albumentations.RandomBrightness(limit=(0.09, 0.6), p=0.5),
#             albumentations.Cutout(num_holes=8, max_h_size=4, max_w_size=4, fill_value=0, always_apply=False, p=0.5),
# #             albumentations.ShiftScaleRotate(
# #                shift_limit=0.25, scale_limit=0.1, rotate_limit=0
# #             ),
#             albumentations.Normalize(),
#             ToTensorV2(p=1.0),
#         ]
#     )


from albumentations import (
    HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine,
    IAASharpen, IAAEmboss, RandomContrast, RandomBrightness, Flip, OneOf, Compose, RandomGamma, ElasticTransform, ChannelShuffle,RGBShift, Rotate
)

def get_train_transforms():
    return Compose([
        albumentations.Resize(DIM[0],DIM[1],always_apply=True),
        RandomRotate90(),
        Flip(),
        Transpose(),
        OneOf([
            IAAAdditiveGaussianNoise(),
            GaussNoise(),
        ], p=0.2),
        OneOf([
            MotionBlur(p=.2),
            MedianBlur(blur_limit=3, p=.1),
            Blur(blur_limit=3, p=.1),
        ], p=0.2),
        ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=.2),
        OneOf([
            OpticalDistortion(p=0.3),
            GridDistortion(p=.1),
            IAAPiecewiseAffine(p=0.3),
        ], p=0.2),
        OneOf([
            CLAHE(clip_limit=2),
            IAASharpen(),
            IAAEmboss(),
            RandomContrast(),
            RandomBrightness(),
        ], p=0.3),
        albumentations.Normalize(),
        ToTensorV2(p=1.0),
        #HueSaturationValue(p=0.3),
    ]
    )

def get_valid_transforms():

    return albumentations.Compose(
        [
            albumentations.Resize(DIM[0],DIM[1],always_apply=True),
            albumentations.Normalize(),
        ToTensorV2(p=1.0)
        ]
    )

In [14]:
def rmac(x, L=3, eps=1e-6):
    ovr = 0.4 # desired overlap of neighboring regions
    steps = torch.Tensor([2, 3, 4, 5, 6, 7]) # possible regions for the long dimension

    W = x.size(3)
    H = x.size(2)

    w = min(W, H)
    w2 = math.floor(w/2.0 - 1)

    b = (max(H, W)-w)/(steps-1)
    (tmp, idx) = torch.min(torch.abs(((w**2 - w*b)/w**2)-ovr), 0) # steps(idx) regions for long dimension

    # region overplus per dimension
    Wd = 0;
    Hd = 0;
    if H < W:  
        Wd = idx.item() + 1
    elif H > W:
        Hd = idx.item() + 1

    v = F.max_pool2d(x, (x.size(-2), x.size(-1)))
    v = v / (torch.norm(v, p=2, dim=1, keepdim=True) + eps).expand_as(v)

    for l in range(1, L+1):
        wl = math.floor(2*w/(l+1))
        wl2 = math.floor(wl/2 - 1)

        if l+Wd == 1:
            b = 0
        else:
            b = (W-wl)/(l+Wd-1)
        cenW = torch.floor(wl2 + torch.Tensor(range(l-1+Wd+1))*b) - wl2 # center coordinates
        if l+Hd == 1:
            b = 0
        else:
            b = (H-wl)/(l+Hd-1)
        cenH = torch.floor(wl2 + torch.Tensor(range(l-1+Hd+1))*b) - wl2 # center coordinates
            
        for i_ in cenH.tolist():
            for j_ in cenW.tolist():
                if wl == 0:
                    continue
                R = x[:,:,(int(i_)+torch.Tensor(range(wl)).long()).tolist(),:]
                R = R[:,:,:,(int(j_)+torch.Tensor(range(wl)).long()).tolist()]
                vt = F.max_pool2d(R, (R.size(-2), R.size(-1)))
                vt = vt / (torch.norm(vt, p=2, dim=1, keepdim=True) + eps).expand_as(vt)
                v += vt

    return v


def gem(x, p=3, eps=1e-6):
    return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
    # return F.lp_pool2d(F.threshold(x, eps, eps), p, (x.size(-2), x.size(-1))) # alternative


    
class RMAC(nn.Module):

    def __init__(self, L=3, eps=1e-6):
        super(RMAC,self).__init__()
        self.L = L
        self.eps = eps

    def forward(self, x):
        return rmac(x, L=self.L, eps=self.eps)
        
    def __repr__(self):
        return self.__class__.__name__ + '(' + 'L=' + '{}'.format(self.L) + ')'
    
    
class GeM(nn.Module):

    def __init__(self, p=4, eps=1e-6):
        super(GeM,self).__init__()
        self.p = Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return gem(x, p=self.p, eps=self.eps)
        
    def __repr__(self):
        return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', ' + 'eps=' + str(self.eps) + ')'

In [15]:
class ShopeeDataset(Dataset):
    def __init__(self, csv, transforms=None):

        self.csv = csv.reset_index()
        self.augmentations = transforms

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        row = self.csv.iloc[index]
        
        #text = row.title
        
        image = cv2.imread(row.filepath)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if self.augmentations:
            augmented = self.augmentations(image=image)
            image = augmented['image']       
        
        
        return image,torch.tensor(row.label_group)

In [69]:
class ShopeeNet(nn.Module):

    def __init__(self,
                 n_classes,
                 model_name='efficientnet_b0',
#                  temperature=1,
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0,
                 loss_module='softmax',
                 s=30.0,
                 margin=0.50,
                 ls_eps=0.0,
                 theta_zero=0.785,
                 pretrained=True):
        
        super(ShopeeNet, self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))
        
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        if model_name.startswith('xception'):
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        if model_name.startswith('efficientnet'):
            print("EFFNET")
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif model_name.startswith('inception_resnet'):
            print("INCEPTIONRESNET")
            final_in_features = self.backbone.classif.in_features
            self.backbone.classif = nn.Identity()
        elif 'nfnet' in model_name:
            print("NFNET")
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head = nn.Identity()
        elif 'swin' in model_name:
            print("SWIN")
            final_in_features = fc_dim
            self.backbone.head = nn.Linear(self.backbone.head.in_features, fc_dim)
            nn.init.xavier_normal_(self.backbone.head.weight)
            nn.init.constant_(self.backbone.head.bias, 0)
            use_fc = False
        elif model_name.startswith('densenet'):
            print("DENSENET")
            final_in_features = self.backbone.head.in_features
            self.backbone.head = nn.Identity()
        
        self.backbone.global_pool = nn.Identity()
#         self.temperature = nn.Parameter(torch.ones(1) * temperature)
        
#         self.rmac_pooling = RMAC()
#         self.gem_pooling = GeM()
        self.pooling =  nn.AdaptiveAvgPool2d(1)
            
        self.use_fc = use_fc
        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(2*final_in_features, fc_dim)
            self.bn1 = nn.BatchNorm2d(final_in_features)
            self.bn2 = nn.BatchNorm1d(fc_dim)
            self._init_params()
            final_in_features = fc_dim

        self.loss_module = loss_module
        if loss_module == 'arcface':
            self.final = ArcMarginProduct(final_in_features, n_classes,
                                          s=s, m=margin, easy_margin=False, ls_eps=ls_eps)
        elif loss_module == 'softmax':
            self.final = nn.Linear(final_in_features, n_classes)

    def _init_params(self):
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn1.weight, 1)
        nn.init.constant_(self.bn1.bias, 0)
        nn.init.constant_(self.bn2.weight, 1)
        nn.init.constant_(self.bn2.bias, 0)

    def forward(self, x, label):
        feature = self.extract_feat(x)
        if self.loss_module == 'arcface':
            logits = self.final(feature, label)
        else:
            logits = self.final(feature)
        return logits

    def extract_feat(self, x):
        batch_size = x.shape[0]
        x = self.backbone(x)
        print(x.size())
#         x = self.bn1(x)
        
#         gem_x = self.gem_pooling(x).view(batch_size, -1)
#         rmac_x = self.rmac_pooling(x).view(batch_size, -1)
#         x = torch.cat([rmac_x, gem_x], axis=1)
#         x = self.pooling(x).view(batch_size, -1)
        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn2(x)
            
        x = F.normalize(x)

        return x
    
    def temperature_scale(self, logits):
        """
        Perform temperature scaling on logits
        """
        # Expand temperature to match the size of logits
        temperature = self.temperature.unsqueeze(1).expand(logits.size(0), logits.size(1))
        return logits / temperature

    # This function probably should live outside of this class, but whatever
    def set_temperature(self, valid_loader):
        """
        Tune the tempearature of the model (using the validation set).
        We're going to set it to optimize NLL.
        valid_loader (DataLoader): validation set loader
        """
        self.cuda()
        nll_criterion = nn.CrossEntropyLoss().cuda()
        ece_criterion = _ECELoss().cuda()

        # First: collect all the logits and labels for the validation set
        logits_list = []
        labels_list = []
        with torch.no_grad():
            for input, label in valid_loader:
                input = input.cuda()
                logits = self.model(input)
                logits_list.append(logits)
                labels_list.append(label)
            logits = torch.cat(logits_list).cuda()
            labels = torch.cat(labels_list).cuda()

        # Calculate NLL and ECE before temperature scaling
        before_temperature_nll = nll_criterion(logits, labels).item()
        before_temperature_ece = ece_criterion(logits, labels).item()
        print('Before temperature - NLL: %.3f, ECE: %.3f' % (before_temperature_nll, before_temperature_ece))

        # Next: optimize the temperature w.r.t. NLL
        optimizer = optim.LBFGS([self.temperature], lr=0.01, max_iter=50)


In [70]:
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            cos(theta + m)
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output
    
class _ECELoss(nn.Module):
    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super(_ECELoss, self).__init__()
        bin_boundaries = torch.linspace(0, 1, n_bins + 1)
        self.bin_lowers = bin_boundaries[:-1]
        self.bin_uppers = bin_boundaries[1:]

    def forward(self, logits, labels):
        softmaxes = F.softmax(logits, dim=1)
        confidences, predictions = torch.max(softmaxes, 1)
        accuracies = predictions.eq(labels)

        ece = torch.zeros(1, device=logits.device)
        for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
            # Calculated |confidence - accuracy| in each bin
            in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
            prop_in_bin = in_bin.float().mean()
            if prop_in_bin.item() > 0:
                accuracy_in_bin = accuracies[in_bin].float().mean()
                avg_confidence_in_bin = confidences[in_bin].mean()
                ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece

In [71]:
def train_fn(dataloader,model,criterion,optimizer,device,scheduler,epoch):
    model.train()
    loss_score = AverageMeter()
    
    tk0 = tqdm(enumerate(dataloader), total=len(dataloader))
    for bi,d in tk0:
        
        batch_size = d[0].shape[0]


        images = d[0]
        targets = d[1]

        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        output = model(images,targets)
        
        loss = criterion(output,targets)
        
        loss.mean().backward()
        optimizer.step()
        
        loss_score.update(loss.detach().item(), batch_size)
        tk0.set_postfix(Train_Loss=loss_score.avg,Epoch=epoch,LR=optimizer.param_groups[0]['lr'])
        
    if scheduler is not None:
            scheduler.step()
        
    return loss_score


def eval_fn(data_loader,model,criterion,device,scheduler):
    
    loss_score = AverageMeter()
    
    model.eval()
    tk0 = tqdm(enumerate(data_loader), total=len(data_loader))
    
    with torch.no_grad():
        
        for bi,d in tk0:
            batch_size = d[0].size()[0]

            image = d[0]
            targets = d[1]

            image = image.to(device)
            targets = targets.to(device)

            output = model(image,targets)

            loss = criterion(output,targets)
            
            loss_score.update(loss.mean().detach().item(), batch_size)
            tk0.set_postfix(Eval_Loss=loss_score.avg)
            
#         if scheduler is not None:
#             scheduler.step(loss.detach().item())
            
    return loss_score

In [72]:
data = pd.read_csv('folds.csv')
data['filepath'] = data['image'].apply(lambda x: os.path.join('train_images', x))
# data = pd.read_csv('data_plus_ozon_folds.csv')

In [73]:
encoder = LabelEncoder()
data['label_group'] = encoder.fit_transform(data['label_group'])

# labels = data.label_group.unique()
# train_labels = np.random.choice(labels, int(0.8*len(labels)))
# train, valid = train_test_split(data, shuffle = True, random_state = 2020)
# train = train.reset_index(drop=True)
# valid = valid.reset_index(drop=True)

In [74]:
model_params = {
    'n_classes':11014,
    'model_name':model_name,
#     'temperature': 2,
    'use_fc':True,
    'fc_dim':512,
    'dropout':0.0,
    'loss_module':'arcface',
    's':30.0,
    'margin':0.50,
    'ls_eps':0.0,
    'theta_zero':0.785,
    'pretrained':True
}

In [75]:
def run(use_checkpoint=False):
        
#     train = data[data['fold']!=0].reset_index(drop=True)
#     train = data.reset_index(drop=True)
#     valid = data[data['fold']==0].reset_index(drop=True)
    train, valid = train_test_split(data, test_size=0.05, random_state=42)
    train = train.reset_index(drop=True)
    valid = valid.reset_index(drop=True)
    

#     train = data[data.label_group.isin(train_labels)].reset_index(drop=True)
#     valid = data[~data.label_group.isin(train_labels)].reset_index(drop=True)
    # Defining DataSet
    train_dataset = ShopeeDataset(
        csv=train,
        transforms=get_train_transforms(),
    )
        
    valid_dataset = ShopeeDataset(
        csv=valid,
        transforms=get_valid_transforms(),
    )
        
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
        num_workers=NUM_WORKERS
    )
    
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=NUM_WORKERS,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    
    # Defining Device
    device = torch.device("cuda")
    
    # Defining Model for specific fold
    model = ShopeeNet(**model_params)
    if use_checkpoint:
        print("LOADING MODEL...")
        model.load_state_dict(torch.load('model_eca_nfnet_l0_IMG_SIZE_512_arcface_6.268_FullData_8GPUS.bin'))
    
    model.to(device)
    
    parallel_model = torch.nn.DataParallel(model)
    
#     for name, param in model.named_parameters():
#         if name.startswith('backbone'):
#             param.requires_grad = False 
    
    
    #DEfining criterion
    criterion = fetch_loss()
    criterion.to(device)
        
    # Defining Optimizer with weight decay to params other than bias and layer norms
    param_optimizer = list(parallel_model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
            ]  
    
    optimizer = torch.optim.Adam(optimizer_parameters, lr=LR)
    
    #Defining LR SCheduler
    scheduler = fetch_scheduler(optimizer)
        
    # THE ENGINE LOOP
    best_loss = 10000
    for epoch in range(EPOCHS):
        train_loss = train_fn(train_loader, parallel_model, criterion, optimizer, device,scheduler=scheduler,epoch=epoch)
        
        valid_loss = eval_fn(valid_loader, parallel_model, criterion,device,scheduler=scheduler)
        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            torch.save(parallel_model.module.state_dict(),f'SWINTRANS_{DIM[0]}_{loss_module}_{best_loss:.3f}_RMAC_GEM.bin')
            print('best model found for epoch {}'.format(epoch))

In [76]:
use_checkpoint=False
run(use_checkpoint)

Building Model Backbone for swin_small_patch4_window7_224 model
SWIN






  0%|          | 0/8134 [00:00<?, ?it/s][A[A[A[A



  0%|          | 0/8134 [00:00<?, ?it/s, Epoch=0, LR=0.001, Train_Loss=23.6][A[A[A[A



  0%|          | 1/8134 [00:00<54:07,  2.50it/s, Epoch=0, LR=0.001, Train_Loss=23.6][A[A[A[A

torch.Size([4, 512])






  0%|          | 1/8134 [00:00<54:07,  2.50it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 2/8134 [00:00<45:09,  3.00it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 2/8134 [00:00<45:09,  3.00it/s, Epoch=0, LR=0.001, Train_Loss=24.9][A[A[A[A



  0%|          | 3/8134 [00:00<36:57,  3.67it/s, Epoch=0, LR=0.001, Train_Loss=24.9][A[A[A[A



  0%|          | 3/8134 [00:00<36:57,  3.67it/s, Epoch=0, LR=0.001, Train_Loss=24.7][A[A[A[A



  0%|          | 4/8134 [00:00<30:47,  4.40it/s, Epoch=0, LR=0.001, Train_Loss=24.7][A[A[A[A



  0%|          | 4/8134 [00:00<30:47,  4.40it/s, Epoch=0, LR=0.001, Train_Loss=24.8][A[A[A[A



  0%|          | 5/8134 [00:00<26:23,  5.13it/s, Epoch=0, LR=0.001, Train_Loss=24.8][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 5/8134 [00:01<26:23,  5.13it/s, Epoch=0, LR=0.001, Train_Loss=24.7][A[A[A[A



  0%|          | 6/8134 [00:01<23:30,  5.76it/s, Epoch=0, LR=0.001, Train_Loss=24.7][A[A[A[A



  0%|          | 6/8134 [00:01<23:30,  5.76it/s, Epoch=0, LR=0.001, Train_Loss=24.6]

torch.Size([4, 512])
torch.Size([4, 512])


[A[A[A[A



  0%|          | 7/8134 [00:01<21:17,  6.36it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 7/8134 [00:01<21:17,  6.36it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 8/8134 [00:01<19:42,  6.87it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 8/8134 [00:01<19:42,  6.87it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 9/8134 [00:01<18:48,  7.20it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 9/8134 [00:01<18:48,  7.20it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 10/8134 [00:01<17:57,  7.54it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 10/8134 [00:01<17:57,  7.54it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 11/8134 [00:01<17:22,  7.79it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 11/8134 [00:01<17:22,  7.79it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 12/8134 [00:01<17:09,  7.89it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A





torch.Size([4, 512])
torch.Size([4, 512])


  0%|          | 12/8134 [00:01<17:09,  7.89it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 13/8134 [00:01<16:52,  8.02it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 13/8134 [00:02<16:52,  8.02it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 14/8134 [00:02<16:38,  8.13it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 14/8134 [00:02<16:38,  8.13it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 15/8134 [00:02<16:43,  8.09it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 15/8134 [00:02<16:43,  8.09it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 16/8134 [00:02<16:31,  8.19it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 16/8134 [00:02<16:31,  8.19it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 17/8134 [00:02<16:22,  8.26it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 17/8134 [00:02<16:22,  8.26it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 18/8134 [00:02<16:26,  8.22it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 18/8134 [00:02<16:26,  8.22it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 19/8134 [00:02<16:16,  8.31it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 19/8134 [00:02<16:16,  8.31it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 20/8134 [00:02<16:12,  8.34it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 20/8134 [00:02<16:12,  8.34it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 21/8134 [00:02<16:42,  8.09it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 21/8134 [00:03<16:42,  8.09it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 22/8134 [00:03<16:59,  7.96it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 22/8134 [00:03<16:59,  7.96it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 23/8134 [00:03<17:06,  7.90it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 23/8134 [00:03<17:06,  7.90it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 24/8134 [00:03<17:36,  7.67it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 24/8134 [00:03<17:36,  7.67it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 25/8134 [00:03<17:31,  7.71it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 25/8134 [00:03<17:31,  7.71it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 26/8134 [00:03<17:14,  7.84it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 26/8134 [00:03<17:14,  7.84it/s, Epoch=0, LR=0.001, Train_Loss=24.5]

torch.Size([4, 512])
torch.Size([4, 512])


[A[A[A[A



  0%|          | 27/8134 [00:03<16:57,  7.97it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 27/8134 [00:03<16:57,  7.97it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 28/8134 [00:03<16:46,  8.06it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 28/8134 [00:03<16:46,  8.06it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 29/8134 [00:03<16:34,  8.15it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 29/8134 [00:04<16:34,  8.15it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A



  0%|          | 30/8134 [00:04<16:30,  8.18it/s, Epoch=0, LR=0.001, Train_Loss=24.6][A[A[A[A





torch.Size([4, 512])
torch.Size([4, 512])


  0%|          | 30/8134 [00:04<16:30,  8.18it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 31/8134 [00:04<16:27,  8.20it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 31/8134 [00:04<16:27,  8.20it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 32/8134 [00:04<16:27,  8.20it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 32/8134 [00:04<16:27,  8.20it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 33/8134 [00:04<16:35,  8.14it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 33/8134 [00:04<16:35,  8.14it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 34/8134 [00:04<16:32,  8.16it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 34/8134 [00:04<16:32,  8.16it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 35/8134 [00:04<16:24,  8.23it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 35/8134 [00:04<16:24,  8.23it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 36/8134 [00:04<16:28,  8.19it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 36/8134 [00:04<16:28,  8.19it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 37/8134 [00:04<16:30,  8.17it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 37/8134 [00:05<16:30,  8.17it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 38/8134 [00:05<17:13,  7.83it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 38/8134 [00:05<17:13,  7.83it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 39/8134 [00:05<17:38,  7.65it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  0%|          | 39/8134 [00:05<17:38,  7.65it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A



  0%|          | 40/8134 [00:05<17:21,  7.77it/s, Epoch=0, LR=0.001, Train_Loss=24.4][A[A[A[A

torch.Size([4, 512])
torch.Size([4, 512])






  0%|          | 40/8134 [00:05<17:21,  7.77it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  1%|          | 41/8134 [00:05<17:04,  7.90it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  1%|          | 41/8134 [00:05<17:04,  7.90it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A



  1%|          | 42/8134 [00:05<17:48,  7.58it/s, Epoch=0, LR=0.001, Train_Loss=24.5][A[A[A[A

torch.Size([4, 512])


KeyboardInterrupt: 

In [36]:
# backbone = timm.create_model('swin_small_patch4_window7_224', pretrained=True)
# for name, _ in backbone.named_parameters():
#     print(name)

patch_embed.proj.weight
patch_embed.proj.bias
patch_embed.norm.weight
patch_embed.norm.bias
layers.0.blocks.0.norm1.weight
layers.0.blocks.0.norm1.bias
layers.0.blocks.0.attn.relative_position_bias_table
layers.0.blocks.0.attn.qkv.weight
layers.0.blocks.0.attn.qkv.bias
layers.0.blocks.0.attn.proj.weight
layers.0.blocks.0.attn.proj.bias
layers.0.blocks.0.norm2.weight
layers.0.blocks.0.norm2.bias
layers.0.blocks.0.mlp.fc1.weight
layers.0.blocks.0.mlp.fc1.bias
layers.0.blocks.0.mlp.fc2.weight
layers.0.blocks.0.mlp.fc2.bias
layers.0.blocks.1.norm1.weight
layers.0.blocks.1.norm1.bias
layers.0.blocks.1.attn.relative_position_bias_table
layers.0.blocks.1.attn.qkv.weight
layers.0.blocks.1.attn.qkv.bias
layers.0.blocks.1.attn.proj.weight
layers.0.blocks.1.attn.proj.bias
layers.0.blocks.1.norm2.weight
layers.0.blocks.1.norm2.bias
layers.0.blocks.1.mlp.fc1.weight
layers.0.blocks.1.mlp.fc1.bias
layers.0.blocks.1.mlp.fc2.weight
layers.0.blocks.1.mlp.fc2.bias
layers.0.downsample.reduction.weight
lay

In [None]:
# torch.save(backbone.state_dict(),'SWIN.bin')

In [None]:
# model = torch.nn.DataParallel(ShopeeNet(**model_params))
# model.load_state_dict(torch.load('model_efficientnet_b4_IMG_SIZE_512_FullData.bin'))
# model = model.to(device)
# model.eval()

In [None]:
# torch.save(model.module.state_dict(),'model_efficientnet_b3_IMG_SIZE_512_FullData.bin') #!!!! IF use DataParallel