In [None]:
# Reload edited modules automatically before each cell runs (autoreload mode 2),
# so changes to imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Register the %tensorboard magic that a later cell uses.
%load_ext tensorboard

In [None]:
import os

import pandas as pd
import torch
from accelerate import Accelerator
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image

# The device is no longer picked by hand
# (torch.device("cuda" if torch.cuda.is_available() else "cpu"));
# it is taken from the Accelerator created below.

# Initialise Accelerate with fp16 mixed precision.
# Only `mixed_precision` is passed: the boolean `fp16` keyword is its deprecated
# predecessor, redundant next to mixed_precision='fp16' and rejected by newer
# Accelerate releases.
accelerator = Accelerator(device_placement=True, mixed_precision='fp16')
device = accelerator.device

# Run-wide hyperparameters.
EPOCHS = 2       # used for the scheduler length and as the last epoch of the run
BATCH_SIZE = 4   # batch size of both the train and the test DataLoader

In [None]:
# example dataset
class CustomImageDataset(Dataset):
    """Map-style dataset exposing the ``sentence`` column of a delimited text file.

    Despite the legacy class name no images are involved: every item is a dict
    with a single ``'sentence'`` key.

    Args:
        sentences_path: Path of the delimited file; it must contain a
            ``sentence`` column (otherwise ``KeyError`` is raised on construction).
        sep: Field separator forwarded to ``pandas.read_csv``.
    """

    def __init__(self, sentences_path, sep):
        # Only the text column is kept; the remaining columns are discarded.
        self.sentences = pd.read_csv(sentences_path, sep=sep)['sentence']

    def __len__(self):
        """Return the number of rows read from the file."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``{'sentence': value}`` for the row at position ``idx``.

        The lookup is positional (``.iloc``) rather than label-based, so it does
        not depend on the Series carrying a default ``RangeIndex`` and negative
        positions behave like ordinary Python sequence indexing.
        """
        return {'sentence': self.sentences.iloc[idx]}

# Datasets for the two splits; both files are tab-separated.
# The "test" split is read from the dev file.
train_dataset = CustomImageDataset('klej_polemo2.0-in/train.tsv', sep='\t')
test_dataset = CustomImageDataset('klej_polemo2.0-in/dev.tsv', sep='\t')

# Only the training loader shuffles; evaluation keeps the file order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
)

# Quick sanity check (uncomment to inspect a single collated batch):
# batch = next(iter(test_loader))
# batch

In [None]:
from utils import get_teacher_student_tokenizer
# Unpack the three objects returned by the project helper.
# Presumably the teacher model, the student model and their shared tokenizer —
# the helper lives in utils.py and is not shown here; confirm there.
teacher, student, tokenizer = get_teacher_student_tokenizer()

In [None]:
# Optimiser, Accelerate wrapping and learning-rate schedule for the student.
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# TODO: exclude embeddings and batch-normalisation parameters from weight decay.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps=1e-6 keeps the
# epsilon that the transformers implementation used by default.
# Only parameters that require gradients are optimised.
optim = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, student.parameters()),
    lr=1e-4,
    weight_decay=1e-3,
    eps=1e-6,
)

# Let Accelerate wrap the loaders, both models and the optimiser.
train_loader, test_loader, teacher, student, optim = accelerator.prepare(
    train_loader, test_loader, teacher, student, optim)

loaders = {'train': train_loader, 'test': test_loader}

# The schedule is sized from the *prepared* train loader; the first 20% of all
# steps are warm-up.
# NOTE(review): with an integer num_cycles the cosine multiplier is back at 1.0 on
# the final step, i.e. training ends at peak learning rate. Confirm whether the
# default num_cycles=0.5 or get_cosine_with_hard_restarts_schedule_with_warmup
# was intended.
steps_per_epoch = len(train_loader)
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optim,
    num_cycles=EPOCHS,
    num_warmup_steps=int(0.2 * steps_per_epoch * EPOCHS),
    num_training_steps=steps_per_epoch * EPOCHS,
)

In [None]:
from distilTrainer import DistilTrainer

# Keyword arguments for DistilTrainer, kept in a named dict so they can be
# inspected before the trainer is built.
# The models are not moved with .to(device) here: they were already passed
# through accelerator.prepare.
# NOTE(review): the roles of criterion1/2/3 are defined inside DistilTrainer,
# which is not shown here.
params_trainer = dict(
    teacher=teacher,
    student=student,
    tokenizer=tokenizer,
    loaders=loaders,
    criterion1=nn.CrossEntropyLoss().to(device),
    criterion2=nn.CrossEntropyLoss().to(device),
    # Alternative for criterion2 (open question: should log_target be used?):
    # criterion2=nn.KLDivLoss(reduction='batchmean').to(device),
    criterion3=nn.CosineEmbeddingLoss().to(device),
    optim=optim,
    scheduler=scheduler,
    accelerator=accelerator,
    device=device,
)
trainer = DistilTrainer(**params_trainer)

In [None]:
# Open the TensorBoard UI inline, reading event files from the `exps` directory
# (presumably where DistilTrainer writes its logs — confirm in distilTrainer.py).
%tensorboard --logdir=exps

In [None]:
# Arguments of the training run.
# The precision tag in the experiment name is read from the accelerator instead of
# being hard-coded: the previous literal said "bf16" while the Accelerator is
# configured for fp16, which mislabels the run in the experiment logs.
params_run = {
    'epoch_start': 0,
    'epoch_end': EPOCHS,
    'exp_name': f'plain_distil_scheduler:cosine_accelerate:{accelerator.mixed_precision}',
    'save_interval': 100,
    'temp': 0.5,  # presumably the distillation temperature — confirm in DistilTrainer
    'random_seed': 42
}

trainer.run_exp(**params_run)

In [None]:
# Show the teacher's configuration as the cell output (for comparison with the student).
teacher.config

In [None]:
# Show the student's configuration as the cell output (for comparison with the teacher).
student.config