Colab related:

In [111]:
#!g1.1
# from google.colab import drive
# drive.mount('/content/drive')

# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_en" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_fr" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_de" .
# !cp -r "/content/drive/MyDrive/Colab Notebooks/Diploma/handle_amazon/amazon_es" .

# !pip install transformers datasets


DS related:

In [112]:
#!g1.1
# %pip install seaborn
# %pip install transformers datasets



## Imports

In [113]:
#!g1.1
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


from torch.utils.data import DataLoader

def nice_df(df, axis=None, reverse=False, **kwargs):
    """Return ``df`` styled with a light-green background gradient.

    Args:
        df: object exposing a pandas-style ``.style`` accessor.
        axis: forwarded to ``background_gradient`` (``None`` by default).
        reverse: forwarded to ``sns.light_palette`` to flip the palette.
        **kwargs: extra keyword arguments forwarded to ``background_gradient``.

    Returns:
        Whatever ``df.style.background_gradient(...)`` returns.
    """
    green_cmap = sns.light_palette("green", reverse=reverse, as_cmap=True)
    styler = df.style
    return styler.background_gradient(cmap=green_cmap, axis=axis, **kwargs)

# Prefer the GPU, but fall back to CPU so the notebook still runs on machines
# without CUDA (every later `.to(device)` call would fail otherwise).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



## Loading Data

In [114]:
#!g1.1

# !unzip handle_amazon


In [115]:
#!g1.1
from datasets import concatenate_datasets, load_from_disk

BS = 32  # batch size used for every DataLoader built below
lang_list = ["en", "fr", "de", "es"]  # language codes; each has its own dataset and model
split_list = ["train", "validation", "test"]  # split names expected in each dataset


# data = {
#     lang: load_from_disk(f'handle_amazon/amazon_{lang}')
#     for lang in lang_list
# }

# One on-disk dataset per language, keyed by language code.
tr_data = dict(
    (lang, load_from_disk(f'handle_amazon/amazon_ok_tr_{lang}'))
    for lang in lang_list
)

# dataloader[lang][split] -> DataLoader over that language's split.
# Only the training split is shuffled.
dataloader = {}
for lang in lang_list:
    loaders_for_lang = {}
    for split in split_list:
        is_train = split == 'train'
        loaders_for_lang[split] = DataLoader(
            tr_data[lang][split],
            batch_size=BS,
            shuffle=is_train,
        )
    dataloader[lang] = loaders_for_lang



## Model

In [141]:
#!g1.1
# Label maps handed to the classifier config.
id2label = dict(enumerate(["NEGATIVE", "POSITIVE"]))
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# One classifier per language, each loaded from the same checkpoint.
# The base model is frozen right after loading.
models = {}
for lang in lang_list:
    classifier = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
        output_hidden_states=True,
    )
    classifier.to(device)
    for param in classifier.base_model.parameters():
        param.requires_grad = False
    models[lang] = classifier


# Sanity check on the first English test batch: print the mean of the last
# hidden state over the sequence axis, the logits, and the arg-max predictions.
# A single forward pass is reused for all three printouts instead of pushing
# the same batch through the model three times.
with torch.no_grad():
    for batch in dataloader['en']['test']:
        i_d = batch["input_ids"].to(device)
        a_m = batch["attention_mask"].to(device)
        outputs = models['en'](
            input_ids=i_d,
            attention_mask=a_m,
        )

        # NOTE(review): this mean runs over every position, padding included;
        # weight by the attention mask if a padding-free average is wanted.
        batch_hs = outputs.hidden_states[-1].mean(dim=1)
        print(batch_hs)

        logits = outputs.logits
        print(logits)

        print(torch.argmax(logits, dim=-1))
        break


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

tensor([[ 0.3267,  0.0176,  0.1214,  ...,  0.0836, -0.0200, -0.1360],
        [ 0.1976,  0.1497,  0.1829,  ...,  0.0148,  0.0260, -0.0101],
        [ 0.1492,  0.0464,  0.1645,  ..., -0.0861, -0.0884,  0.0062],
        ...,
        [ 0.1911,  0.1054, -0.0288,  ..., -0.0056,  0.0567, -0.1306],
        [ 0.1369,  0.0744,  0.1030,  ...,  0.0234, -0.1261, -0.0686],
        [ 0.1460,  0.2451,  0.0183,  ...,  0.1160, -0.0157, -0.1809]],
       device='cuda:0')
tensor([[-0.0517,  0.1344],
        [-0.0739,  0.1222],
        [-0.1158,  0.1851],
        [-0.0793,  0.0786],
        [-0.0583,  0.1682],
        [-0.0760,  0.1323],
        [-0.1093,  0.1419],
        [-0.0584,  0.1381],
        [-0.0957,  0.1648],
        [-0.1241,  0.1488],
        [-0.0860,  0.1974],
        [-0.0562,  0.1290],
        [-0.0872,  0.1571],
        [-0.1066,  0.1114],
        [-0.1126,  0.1718],
        [-0.1448,  0.1664],
        [-0.1365,  0.1943],
        [ 0.0068,  0.1577],
        [-0.0718,  0.1091],
        [-

In [146]:
#!g1.1

from collections import Counter
import string
from transformers import pipeline

# One <lang> -> English translation pipeline per non-English language.
# max_length=6 is passed to the pipeline; the input is single words (see get_top_words).
translators = dict(
    (
        lang,
        pipeline(
            "translation",
            model=f"Helsinki-NLP/opus-mt-{lang}-en",
            max_length=6,
        ),
    )
    for lang in ['fr', 'de', 'es']
)


In [147]:
#!g1.1

def get_top_words(lang_data, K=1000):
    """Return the ``K`` most frequent lowercase words of the training reviews.

    All entries of ``lang_data['train']['review_body']`` are joined with
    spaces, every ASCII punctuation character is replaced by a space, and the
    text is split on whitespace. Words are ranked by descending count; ties
    keep the order in which the words were first seen.

    Args:
        lang_data: mapping where ``lang_data['train']['review_body']`` is an
            iterable of review strings.
        K: number of words to return.

    Returns:
        list[str]: the ranked words (fewer than ``K`` if the vocabulary is smaller).
    """
    punct_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    corpus = ' '.join(lang_data['train']['review_body']).translate(punct_to_space)

    counts = Counter(piece.lower() for piece in corpus.split())

    # most_common() with no argument returns every (word, count) pair, sorted.
    ranked = counts.most_common()
    return [word for word, _ in ranked[:K]]

def get_translation_pairs(lang, K=1000):
    """Pair the ``K`` most frequent words of ``lang`` with their English translation.

    The words come from ``get_top_words(tr_data[lang], K=K)`` and are translated
    in one call to ``translators[lang]``. Each translation has its ASCII
    punctuation replaced by spaces, is lowercased and stripped. A translation
    that ends up empty is replaced by the placeholder ``'XXX'``.

    Args:
        lang: language code, used as a key into ``tr_data`` and ``translators``.
        K: number of frequent words to translate.

    Returns:
        list[tuple[str, str]]: ``(source_word, cleaned_translation)`` pairs.
    """
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def _clean(text):
        stripped = text.translate(punct_to_space).lower().strip()
        return stripped or 'XXX'

    source_words = get_top_words(tr_data[lang], K=K)
    raw_outputs = translators[lang](source_words)
    cleaned = [_clean(item['translation_text']) for item in raw_outputs]

    return list(zip(source_words, cleaned))


def warm_up_model(model, tokenizer, lang, K):
    """Seed embeddings of frequent ``lang`` words from their English translations.

    For each ``(word, translation)`` pair from ``get_translation_pairs(lang, K)``,
    the embedding row of ``word`` is replaced by the mean of the embedding rows
    of the whitespace-separated, lowercased pieces of ``translation``.

    Out-of-vocabulary handling:
      * A source word that the tokenizer maps to ``tokenizer.unk_token_id`` is
        skipped. Writing it would overwrite the single shared unknown-token row.
      * Translation pieces that map to the unknown-token id are left out of the
        mean. If no piece remains, the word keeps its current embedding.

    All new vectors are computed before any row is written, so one word's
    update never feeds into another word's mean.

    Args:
        model: model exposing ``base_model.embeddings.word_embeddings.weight``.
        tokenizer: tokenizer providing ``convert_tokens_to_ids`` and
            ``unk_token_id``.
        lang: language code forwarded to ``get_translation_pairs``.
        K: number of frequent words to process.

    Returns:
        None. The embedding matrix is modified in place.

    Side effects:
        On return every parameter of ``model.base_model`` has
        ``requires_grad = True``; callers that want a frozen encoder must
        freeze it again afterwards.
    """
    pairs = get_translation_pairs(lang, K)
    unk_id = tokenizer.unk_token_id

    word_embeddings = model.base_model.embeddings.word_embeddings.weight

    # no_grad lets us edit the parameter in place without autograd tracking it.
    with torch.no_grad():
        pending_updates = []
        # Reverse order, as before: rows are written in this order, so on a
        # token-id collision the earlier pair wins (presumably the more
        # frequent word; confirm against get_translation_pairs).
        for word, translation in reversed(pairs):
            token_id = tokenizer.convert_tokens_to_ids(word)
            if token_id == unk_id:
                continue

            piece_ids = [
                tokenizer.convert_tokens_to_ids(piece.lower())
                for piece in translation.split()
            ]
            piece_ids = [piece_id for piece_id in piece_ids if piece_id != unk_id]
            if not piece_ids:
                continue

            # List indexing copies the rows, so the mean is a fresh tensor.
            new_embed = word_embeddings[piece_ids].mean(dim=0)
            pending_updates.append((token_id, new_embed))

        for token_id, new_embed in pending_updates:
            word_embeddings[token_id] = new_embed

    # Kept for backward compatibility: the encoder is trainable on return.
    for param in model.base_model.parameters():
        param.requires_grad = True



In [155]:
#!g1.1

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Warm up the non-English models using their 5000 most frequent words.
for lang in ('fr', 'es', 'de'):
    warm_up_model(model=models[lang], tokenizer=tokenizer, lang=lang, K=5000)



Your input_length: 6 is bigger than 0.9 * max_length: 6. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 6 is bigger than 0.9 * max_length: 6. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 6 is bigger than 0.9 * max_length: 6. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 6 is bigger than 0.9 * max_length: 6. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 6 is bigger than 0.9 * max_length: 6. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 6 is bigger than 0.9 * max_length: 6. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 6 is bigger than 0.9 * max_length: 6. You might consider increasing y

In [156]:
#!g1.1

# Freeze every base model again (warm_up_model leaves them trainable).
for lang in lang_list:
    models[lang].base_model.requires_grad_(False)
    

In [157]:
#!g1.1

# Check the freeze: parameters whose name contains 'clas' are listed,
# every other parameter must have requires_grad == False.
for lang in lang_list:
    for name, param in models[lang].named_parameters():
        if 'clas' not in name:
            assert not param.requires_grad
            continue
        print(name, param.shape, param.requires_grad)
    print(f'{lang} is ok')


pre_classifier.weight torch.Size([768, 768]) True
pre_classifier.bias torch.Size([768]) True
classifier.weight torch.Size([2, 768]) True
classifier.bias torch.Size([2]) True
en is ok
pre_classifier.weight torch.Size([768, 768]) True
pre_classifier.bias torch.Size([768]) True
classifier.weight torch.Size([2, 768]) True
classifier.bias torch.Size([2]) True
fr is ok
pre_classifier.weight torch.Size([768, 768]) True
pre_classifier.bias torch.Size([768]) True
classifier.weight torch.Size([2, 768]) True
classifier.bias torch.Size([2]) True
de is ok
pre_classifier.weight torch.Size([768, 768]) True
pre_classifier.bias torch.Size([768]) True
classifier.weight torch.Size([2, 768]) True
classifier.bias torch.Size([2]) True
es is ok


## Eval and Training

In [158]:
#!g1.1
def eval(model, dls, lang, test_split):
    """Compute the classification accuracy of ``model`` on ``dls[lang][test_split]``.

    Accuracy is the number of correct predictions divided by the number of
    examples actually seen, so a short final batch does not skew the result.

    Note: this function shadows the built-in ``eval``. The name is kept
    because other cells call it.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``; must provide ``.eval()`` and
            ``.device``.
        dls: nested mapping ``dls[lang][split]`` of iterables yielding batches
            with ``'input_ids'``, ``'attention_mask'`` and ``'bin_label'`` tensors.
        lang: language key into ``dls``; also used in the printed message.
        test_split: split key into ``dls[lang]``.

    Returns:
        float: fraction of correct predictions (``0.0`` if the split yields no
        examples).
    """
    # Put the model in eval mode.
    model.eval()

    dl_to_test = dls[lang][test_split]

    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in tqdm(dl_to_test):
            # Move the batch to the model's device.
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['bin_label'].to(model.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

            n_correct += (preds == labels).sum().item()
            # Count real examples; the last batch may be smaller than the rest.
            n_seen += labels.size(0)

    test_acc = n_correct / n_seen if n_seen else 0.0
    print(f'\teval {lang}: {test_acc}')
    return test_acc


In [159]:
#!g1.1
def train(model, dls, lang, train_split, validation_split, num_epochs=2, device='mps'):
    """Train ``model`` on ``dls[lang][train_split]`` and validate after each epoch.

    Uses AdamW (``lr=2e-5``) with cross-entropy loss. After every epoch the
    train accuracy and the validation accuracy (from ``eval``) are printed.
    Train accuracy is correct predictions divided by the number of examples
    actually seen, so a short final batch does not skew it.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        dls: nested mapping ``dls[lang][split]`` of iterables yielding batches
            with ``'input_ids'``, ``'attention_mask'`` and ``'bin_label'`` tensors.
        lang: language key into ``dls``; also used in the printed messages.
        train_split: split key used for optimisation.
        validation_split: split key passed to ``eval`` after each epoch.
        num_epochs: number of passes over the training split.
        device: device the model is moved to before training.

    Returns:
        The same ``model`` object, trained in place.
    """
    # Move the model to the requested device.
    model.to(device)

    dl_to_train = dls[lang][train_split]

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    for _ in range(num_epochs):
        # eval() at the end of the previous epoch left the model in eval mode.
        model.train()
        n_correct = 0
        n_seen = 0
        for batch in tqdm(dl_to_train):
            # Move the batch to the model's device.
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['bin_label'].to(model.device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

            preds = logits.argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            # Count real examples; the last batch may be smaller than the rest.
            n_seen += labels.size(0)

        train_acc = n_correct / n_seen if n_seen else 0.0
        valid_acc = eval(model, dls, lang, validation_split)
        print(f'train {lang}: {train_acc} (val {valid_acc})')
    return model



## All Lang Models

In [163]:
#!g1.1
# Train each language's model on its own data for two epochs.
for lang in lang_list:
    train(
        model=models[lang],
        dls=dataloader,
        lang=lang,
        train_split='train',
        validation_split='validation',
        num_epochs=2,
        device=device,
    )


	eval en: 0.8735
train en: 0.867075 (val 0.8735)
	eval en: 0.87425
train en: 0.86969375 (val 0.87425)
	eval fr: 0.795
train fr: 0.78015 (val 0.795)
	eval fr: 0.796
train fr: 0.78205 (val 0.796)
	eval de: 0.74725
train de: 0.72305 (val 0.74725)
	eval de: 0.751
train de: 0.72688125 (val 0.751)
	eval es: 0.7905
train es: 0.77488125 (val 0.7905)
	eval es: 0.7915
train es: 0.77685625 (val 0.7915)


100%|██████████| 5000/5000 [16:18<00:00,  5.11it/s]
100%|██████████| 125/125 [00:23<00:00,  5.40it/s]
100%|██████████| 5000/5000 [16:19<00:00,  5.11it/s]
100%|██████████| 125/125 [00:23<00:00,  5.39it/s]


In [164]:
#!g1.1

# for lang in lang_list:
#     if lang != 'en':
#         train(models[lang], dataloader, lang, 'train', 'validation', lang=lang, num_epochs=3, device=device)


In [165]:
#!g1.1
# Score every model on its own language's test split. The table is built from
# the scores themselves, so it always has one row per entry of lang_list
# instead of a hard-coded four rows.
test_scores = [eval(models[lang], dataloader, lang, 'test') for lang in lang_list]
eval_res = pd.DataFrame({'finetune': test_scores}, index=lang_list)

nice_df(eval_res)


100%|██████████| 125/125 [00:23<00:00,  5.39it/s]
100%|██████████| 125/125 [00:23<00:00,  5.39it/s]
100%|██████████| 125/125 [00:23<00:00,  5.39it/s]
100%|██████████| 125/125 [00:23<00:00,  5.39it/s]


	eval en: 0.87675
	eval fr: 0.79475
	eval de: 0.7565
	eval es: 0.79475


Unnamed: 0,finetune
en,0.87675
fr,0.79475
de,0.7565
es,0.79475


In [166]:
#!g1.1
# Save each model to its own directory under models/.
for lang in lang_list:
    output_dir = f'models/ft_ht_{lang}'
    models[lang].save_pretrained(output_dir)



In [79]:
#!g1.1
# Number of training batches per language.
for lang in lang_list:
    train_loader = dataloader[lang]['train']
    print(len(train_loader))


5000
125
125
125


In [None]:
#!g1.1
