Colab related:

In [127]:
#!g1.1
# from google.colab import drive
# drive.mount('/content/drive')

# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_en" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_fr" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_de" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_es" .

# !pip install transformers datasets


DS related:

In [128]:
#!g1.1
# %pip install seaborn
# %pip install transformers datasets



## Imports

In [129]:
#!g1.1
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


from torch.utils.data import DataLoader

def nice_df(df, axis=None, reverse=False, **kwargs):
    cm = sns.light_palette("green", as_cmap=True, reverse=reverse)
    return df.style.background_gradient(cmap=cm, axis=axis, **kwargs)

device = torch.device("cuda")



## Loading Data

In [130]:
#!g1.1

# !unzip handle_amazon


In [131]:
#!g1.1
from datasets import concatenate_datasets, load_from_disk

BS = 32
lang_list = ['en', 'fr', 'de', 'es']
split_list = ['train', 'validation', 'test']


# data = {
#     lang: load_from_disk(f'handle_amazon/amazon_{lang}')
#     for lang in lang_list
# }

tr_data = {
    lang: load_from_disk(f'handle_amazon/amazon_ok_tr_{lang}')
    for lang in lang_list
}

dataloader = {
    lang: {
        split: DataLoader(tr_data[lang][split], batch_size=BS, shuffle=(split == 'train'))
        for split in split_list
    }
    for lang in lang_list
}



## Model

In [133]:
#!g1.1
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

models = dict()
for lang in lang_list:
    models[lang] = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id, output_hidden_states=True)
    models[lang].to(device)
#     for param in models[lang].base_model.parameters():
#         param.requires_grad = False


with torch.no_grad():
    for batch in dataloader['en']['test']:
        i_d = batch["input_ids"].to(device)
        a_m = batch["attention_mask"].to(device)
        batch_hs = models['en'](
                input_ids=i_d,
                attention_mask=a_m,
            ).hidden_states[-1].mean(dim=1)    
        print(batch_hs)

        logits = models['en'](
                input_ids=i_d,
                attention_mask=a_m,
            ).logits    
        print(logits)

        print(torch.argmax(
            models['en'](
                input_ids=i_d,
                attention_mask=a_m,
            ).logits,
            axis=-1
        ))
        break


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

tensor([[ 0.3267,  0.0176,  0.1214,  ...,  0.0836, -0.0200, -0.1360],
        [ 0.1976,  0.1497,  0.1829,  ...,  0.0148,  0.0260, -0.0101],
        [ 0.1492,  0.0464,  0.1645,  ..., -0.0861, -0.0884,  0.0062],
        ...,
        [ 0.1911,  0.1054, -0.0288,  ..., -0.0056,  0.0567, -0.1306],
        [ 0.1369,  0.0744,  0.1030,  ...,  0.0234, -0.1261, -0.0686],
        [ 0.1460,  0.2451,  0.0183,  ...,  0.1160, -0.0157, -0.1809]],
       device='cuda:0')
tensor([[ 0.0882, -0.0922],
        [ 0.0982, -0.0514],
        [ 0.0579, -0.0992],
        [ 0.0792, -0.0381],
        [ 0.0556, -0.1030],
        [ 0.0936, -0.0238],
        [ 0.0797, -0.0974],
        [ 0.1063, -0.0777],
        [ 0.0753, -0.0461],
        [ 0.0592, -0.0841],
        [ 0.1030, -0.0897],
        [ 0.1133, -0.0407],
        [ 0.1020, -0.0723],
        [ 0.1054, -0.0458],
        [ 0.0810, -0.0876],
        [ 0.0785, -0.0557],
        [ 0.0407, -0.0787],
        [ 0.1149, -0.0726],
        [ 0.0706, -0.0392],
        [ 

In [134]:
#!g1.1
for name, param in models['en'].named_parameters():
    if 'clas' in name:
        print(name, param.shape, param.requires_grad)
        assert param.requires_grad
    else:
        assert param.requires_grad


pre_classifier.weight torch.Size([768, 768]) True
pre_classifier.bias torch.Size([768]) True
classifier.weight torch.Size([2, 768]) True
classifier.bias torch.Size([2]) True


## Eval and Training

In [135]:
#!g1.1
def eval(model, dls, lang, test_split):
    # put model in eval mode
    model.eval()

    # get needful data slice
    dl_to_test = dls[lang][test_split]
    
    test_loss = 0
    test_acc = 0
    
    with torch.no_grad():
        for batch in tqdm(dl_to_test):
            # move batch to device
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['bin_label'].to(model.device)

            # forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # calculate loss and accuracy
            preds = logits.argmax(dim=1)
            test_acc += (preds == labels).sum().item()

    test_acc /= BS * len(dl_to_test)
    print(f'\teval {lang}: {test_acc}')
    return test_acc


In [136]:
#!g1.1
def train(model, dls, lang, train_split, validation_split, num_epochs=2, device='mps'):
    # put model on mps device
    model.to(device)
    
    # get needful data slice
    dl_to_train = dls[lang][train_split]
    dl_to_valid = dls[lang][validation_split]

    # define our optimizer and loss function
    learning_rate_bert = 1e-7
    learning_rate_classifier = 2e-5
    optimizer_grouped_parameters = [
        {"params": model.distilbert.parameters(), "lr": learning_rate_bert},
        {"params": model.classifier.parameters(), "lr": learning_rate_classifier},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        # train loop
        model.train()
        train_loss = 0
        train_acc = 0
        for batch in tqdm(dl_to_train):
            # move batch to device
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['bin_label'].to(model.device)

            # zero out gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # calculate loss and accuracy
            loss = loss_fn(logits, labels)
            train_loss += loss.item()
            preds = logits.argmax(dim=1)
            train_acc += (preds == labels).sum().item()

            # backward pass
            loss.backward()

            # update weights
            optimizer.step()

        train_acc /= BS * len(dl_to_train)
        valid_acc = eval(model, dls, lang, validation_split)
        print(f'train {lang}: {train_acc} (val {valid_acc})')
    return model



## All Lang Models

In [137]:
#!g1.1
for lang in lang_list:
    train(models[lang], dataloader, lang, 'train', 'validation', num_epochs=2, device=device)


	eval en: 0.8765
train en: 0.829375 (val 0.8765)
	eval en: 0.88625
train en: 0.88894375 (val 0.88625)
	eval fr: 0.7655
train fr: 0.67853125 (val 0.7655)
	eval fr: 0.81225
train fr: 0.782325 (val 0.81225)
	eval de: 0.73175
train de: 0.6233375 (val 0.73175)
	eval de: 0.776
train de: 0.7256875 (val 0.776)
	eval es: 0.75325
train es: 0.6439375 (val 0.75325)
	eval es: 0.78875
train es: 0.74749375 (val 0.78875)


100%|██████████| 5000/5000 [42:33<00:00,  1.96it/s]
100%|██████████| 125/125 [00:22<00:00,  5.52it/s]
100%|██████████| 5000/5000 [42:33<00:00,  1.96it/s]
100%|██████████| 125/125 [00:22<00:00,  5.50it/s]


In [138]:
#!g1.1

# for lang in lang_list:
#     if lang != 'en':
#         train(models[lang], dataloader, lang, 'train', 'validation', lang=lang, num_epochs=3, device=device)


In [139]:
#!g1.1
eval_res = pd.DataFrame(data = np.zeros((4, 1)), columns = ['no_freeze'], index=lang_list)

for lang in lang_list:
    test_res = eval(models[lang], dataloader, lang, 'test')
    eval_res.at[lang, 'no_freeze'] = test_res

nice_df(eval_res)


100%|██████████| 125/125 [00:22<00:00,  5.50it/s]
100%|██████████| 125/125 [00:22<00:00,  5.51it/s]
100%|██████████| 125/125 [00:22<00:00,  5.50it/s]
100%|██████████| 125/125 [00:22<00:00,  5.51it/s]


	eval en: 0.901
	eval fr: 0.813
	eval de: 0.775
	eval es: 0.799


Unnamed: 0,no_freeze
en,0.901
fr,0.813
de,0.775
es,0.799


In [141]:
del_datasphere_variables('id2label', 'label2id', 'models', 'param')


In [142]:
models['en'].save_pretrained(f'models/ft_full_en')


NameError: name 'models' is not defined

In [110]:
# #!g1.1
# for lang in lang_list:
#     models[lang].save_pretrained(f'models/ft_no_tr_{lang}')



In [110]:
#!g1.1
for lang in lang_list:
    print(len(dataloader[lang]['train']))


5000
5000
5000
5000


In [None]:
#!g1.1
