Colab related:

In [85]:
#!g1.1
# from google.colab import drive
# drive.mount('/content/drive')

# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_en" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_fr" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_de" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_es" .

# !pip install transformers datasets


DS related:

In [86]:
#!g1.1
# %pip install seaborn
# %pip install transformers datasets



## Imports

In [94]:
#!g1.1
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


from torch.utils.data import DataLoader

def nice_df(df, axis=None, reverse=False, **kwargs):
    """Return a Styler for ``df`` shaded with a light-green gradient.

    Args:
        df: DataFrame to style.
        axis: Forwarded to ``Styler.background_gradient``.
        reverse: Forwarded to ``sns.light_palette`` to flip the palette.
        **kwargs: Extra keyword arguments for ``background_gradient``.

    Returns:
        The Styler produced by ``df.style.background_gradient``.
    """
    green_cmap = sns.light_palette("green", reverse=reverse, as_cmap=True)
    styler = df.style
    return styler.background_gradient(axis=axis, cmap=green_cmap, **kwargs)

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines where CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



## Loading Data

In [88]:
#!g1.1

# !unzip handle_amazon


In [91]:
#!g1.1
from datasets import concatenate_datasets, load_from_disk

# Batch size shared by every DataLoader built in this cell.
BS = 32
# Language codes and dataset splits used throughout the notebook.
lang_list = ['en', 'fr', 'de', 'es']
split_list = ['train', 'validation', 'test']


# data = {
#     lang: load_from_disk(f'handle_amazon/amazon_{lang}')
#     for lang in lang_list
# }

# One dataset per language, loaded from disk.
tr_data = dict(
    (code, load_from_disk(f'handle_amazon/amazon_ok_tr_{code}'))
    for code in lang_list
)

# dataloader[lang][split] -> DataLoader; only the 'train' split is shuffled.
dataloader = {
    code: {
        part: DataLoader(
            tr_data[code][part],
            batch_size=BS,
            shuffle=(part == 'train'),
        )
        for part in split_list
    }
    for code in lang_list
}



## Model

In [92]:
#!g1.1
# Label maps handed to the classification head configuration.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# One classifier per language, all starting from the same checkpoint.
models = {}
for lang in lang_list:
    models[lang] = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
        output_hidden_states=True,
    ).to(device)
    # Freeze the base model so only parameters outside it stay trainable.
    models[lang].base_model.requires_grad_(False)


# Smoke test on the first English test batch: print the mean of the last
# hidden state, the raw logits and the predicted class ids.
# A single forward pass feeds all three prints instead of re-running the
# model once per print.
with torch.no_grad():
    for batch in dataloader['en']['test']:
        i_d = batch["input_ids"].to(device)
        a_m = batch["attention_mask"].to(device)
        outputs = models['en'](
                input_ids=i_d,
                attention_mask=a_m,
            )

        # NOTE(review): the mean runs over every position along dim 1,
        # without using the attention mask -- confirm that is intended.
        batch_hs = outputs.hidden_states[-1].mean(dim=1)
        print(batch_hs)

        logits = outputs.logits
        print(logits)

        print(torch.argmax(logits, dim=-1))
        # Only the first batch is inspected.
        break


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

tensor([[ 0.3267,  0.0176,  0.1214,  ...,  0.0836, -0.0200, -0.1360],
        [ 0.1976,  0.1497,  0.1829,  ...,  0.0148,  0.0260, -0.0101],
        [ 0.1492,  0.0464,  0.1645,  ..., -0.0861, -0.0884,  0.0062],
        ...,
        [ 0.1911,  0.1054, -0.0288,  ..., -0.0056,  0.0567, -0.1306],
        [ 0.1369,  0.0744,  0.1030,  ...,  0.0234, -0.1261, -0.0686],
        [ 0.1460,  0.2451,  0.0183,  ...,  0.1160, -0.0157, -0.1809]],
       device='cuda:0')
tensor([[-0.0947, -0.0214],
        [-0.0924, -0.0269],
        [-0.0674,  0.0395],
        [-0.0349, -0.0645],
        [-0.0661, -0.0269],
        [-0.0534, -0.0090],
        [-0.0786, -0.0016],
        [-0.0903, -0.0668],
        [-0.1296, -0.0019],
        [-0.0911, -0.0055],
        [-0.1448, -0.0142],
        [-0.0348, -0.0198],
        [-0.0839, -0.0274],
        [-0.0551, -0.0068],
        [-0.0893,  0.0167],
        [-0.1206, -0.0016],
        [-0.1080,  0.0143],
        [-0.1051, -0.0923],
        [-0.1024, -0.0189],
        [-

In [93]:
#!g1.1
# Sanity check: every parameter whose name does not contain 'clas' must be
# frozen; the remaining ones are listed with their shape and trainable flag.
for name, param in models['en'].named_parameters():
    if 'clas' not in name:
        assert not param.requires_grad
        continue
    print(name, param.shape, param.requires_grad)


pre_classifier.weight torch.Size([768, 768]) True
pre_classifier.bias torch.Size([768]) True
classifier.weight torch.Size([2, 768]) True
classifier.bias torch.Size([2]) True


## Eval and Training

In [102]:
#!g1.1
def eval(model, dls, lang, test_split):
    """Compute the accuracy of ``model`` on ``dls[lang][test_split]``.

    Note: this function shadows Python's builtin ``eval``; the name is kept
    because other cells call it.

    Args:
        model: Callable module exposing ``.eval()`` and ``.device`` whose
            output has a ``.logits`` attribute.
        dls: Nested mapping ``dls[lang][split]`` of iterables yielding batches
            with ``'input_ids'``, ``'attention_mask'`` and ``'bin_label'``.
        lang: Language key selecting the set of loaders.
        test_split: Split key selecting the loader to evaluate on.

    Returns:
        Fraction of correctly classified samples (``0.0`` if the loader
        yields no samples).
    """
    # put model in eval mode
    model.eval()

    # get needful data slice
    dl_to_test = dls[lang][test_split]

    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in tqdm(dl_to_test):
            # move batch to device
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['bin_label'].to(model.device)

            # forward pass
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # count hits and the number of samples actually evaluated
            preds = logits.argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

    # Divide by the real sample count rather than batch_size * n_batches,
    # which over-counts whenever the last batch is smaller.
    test_acc = n_correct / n_seen if n_seen else 0.0
    print(f'\teval {lang}: {test_acc}')
    return test_acc


In [103]:
#!g1.1
def train(model, dls, lang, train_split, validation_split, num_epochs=2, device='mps'):
    """Train ``model`` on ``dls[lang][train_split]`` and validate every epoch.

    Each epoch runs one pass over the training loader with AdamW (lr=2e-5)
    and cross-entropy loss, then calls ``eval`` on the validation split and
    prints both accuracies.

    Args:
        model: Module to train; its output must expose ``.logits`` and it
            must expose ``.device``.
        dls: Nested mapping ``dls[lang][split]`` of iterables yielding batches
            with ``'input_ids'``, ``'attention_mask'`` and ``'bin_label'``.
        lang: Language key selecting the set of loaders.
        train_split: Split key of the training loader.
        validation_split: Split key of the validation loader.
        num_epochs: Number of passes over the training loader.
        device: Device the model is moved to before training.

    Returns:
        The same ``model`` object, trained in place.
    """
    # put model on the requested device
    model.to(device)

    # get needful data slice
    dl_to_train = dls[lang][train_split]
    # Resolved up front so a wrong split name fails before any training
    # happens; eval() looks the loader up again by key.
    dl_to_valid = dls[lang][validation_split]

    # define our optimizer and loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    for _ in range(num_epochs):
        # train loop
        model.train()
        n_correct = 0
        n_seen = 0
        for batch in tqdm(dl_to_train):
            # move batch to device
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['bin_label'].to(model.device)

            # zero out gradients
            optimizer.zero_grad()

            # forward pass
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # calculate loss and running accuracy counters
            loss = loss_fn(logits, labels)
            preds = logits.argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            # backward pass
            loss.backward()

            # update weights
            optimizer.step()

        # Divide by the real sample count rather than batch_size * n_batches,
        # which over-counts whenever the last batch is smaller.
        train_acc = n_correct / n_seen if n_seen else 0.0
        valid_acc = eval(model, dls, lang, validation_split)
        print(f'train {lang}: {train_acc} (val {valid_acc})')
    return model



## All Lang Models

In [107]:
#!g1.1
# Train each language's model on its own data for three epochs.
for lang in lang_list:
    train(
        models[lang],
        dataloader,
        lang,
        train_split='train',
        validation_split='validation',
        num_epochs=3,
        device=device,
    )


	eval en: 0.861
train en: 0.8488625 (val 0.861)
	eval en: 0.86925
train en: 0.8619125 (val 0.86925)
	eval en: 0.87025
train en: 0.8650625 (val 0.87025)
	eval fr: 0.7455
train fr: 0.6971875 (val 0.7455)
	eval fr: 0.7595
train fr: 0.73629375 (val 0.7595)
	eval fr: 0.76825
train fr: 0.74451875 (val 0.76825)
	eval de: 0.69875
train de: 0.638375 (val 0.69875)
	eval de: 0.713
train de: 0.68404375 (val 0.713)
	eval de: 0.7295
train de: 0.69721875 (val 0.7295)


100%|██████████| 5000/5000 [16:09<00:00,  5.16it/s]
100%|██████████| 125/125 [00:23<00:00,  5.43it/s]
100%|██████████| 5000/5000 [16:09<00:00,  5.16it/s]
100%|██████████| 125/125 [00:22<00:00,  5.44it/s]
100%|██████████| 5000/5000 [16:12<00:00,  5.14it/s]
100%|██████████| 125/125 [00:23<00:00,  5.42it/s]


	eval es: 0.76325
train es: 0.69039375 (val 0.76325)


100%|██████████| 5000/5000 [16:13<00:00,  5.14it/s]
100%|██████████| 125/125 [00:23<00:00,  5.42it/s]


	eval es: 0.77925
train es: 0.74000625 (val 0.77925)


100%|██████████| 5000/5000 [16:04<00:00,  5.18it/s]
100%|██████████| 125/125 [00:22<00:00,  5.48it/s]

	eval es: 0.79075
train es: 0.74936875 (val 0.79075)





In [108]:
#!g1.1

# for lang in lang_list:
#     if lang != 'en':
#         train(models[lang], dataloader, lang, 'train', 'validation', lang=lang, num_epochs=3, device=device)


In [109]:
#!g1.1
# Test accuracy per language, collected into a one-column table.
# The table is sized from lang_list rather than a hard-coded row count, so
# adding or removing a language does not break the DataFrame constructor.
eval_res = pd.DataFrame(
    data=np.zeros((len(lang_list), 1)),
    columns=['finetune'],
    index=lang_list,
)

for lang in lang_list:
    test_res = eval(models[lang], dataloader, lang, 'test')
    eval_res.at[lang, 'finetune'] = test_res

nice_df(eval_res)


100%|██████████| 125/125 [00:22<00:00,  5.47it/s]


	eval en: 0.874


100%|██████████| 125/125 [00:22<00:00,  5.47it/s]


	eval fr: 0.76225


100%|██████████| 125/125 [00:22<00:00,  5.46it/s]


	eval de: 0.73575


100%|██████████| 125/125 [00:22<00:00,  5.47it/s]

	eval es: 0.793





Unnamed: 0,finetune
en,0.874
fr,0.76225
de,0.73575
es,0.793


In [78]:
#!g1.1
# Write each language's model to its own checkpoint directory.
for lang in lang_list:
    models[lang].save_pretrained(save_directory=f'models/cp1_{lang}')



In [79]:
#!g1.1
# Number of batches in each language's training loader.
for lang in lang_list:
    n_train_batches = len(dataloader[lang]['train'])
    print(n_train_batches)


5000
125
125
125


In [None]:
#!g1.1
