## Imports

In [1]:
from transformers import pipeline
import torch
import numpy as np
import pandas as pd
import seaborn as sns

def nice_df(df, axis=None, reverse=False, **kwargs):
    cm = sns.light_palette("green", as_cmap=True, reverse=reverse)
    return df.style.background_gradient(cmap=cm, axis=axis, **kwargs)

device = torch.device("mps")



## Loading Data

In [4]:
from datasets import concatenate_datasets, load_from_disk
lang_list = ['en', 'fr', 'de', 'es']
data = {
    lang: load_from_disk(f'handle_amazon/amazon_ok_tr_{lang}')
    for lang in lang_list
}

## Defining Models

### Sentiment Pipeline (small, monolingual)

In [5]:
pipeline_classifier = pipeline("sentiment-analysis", truncation='only_first', device=device)
print(pipeline_classifier.model.num_parameters() * 4 / 2**30)


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


0.24942684918642044


### Zeroshot Pipeline (big, multilingual)

In [6]:
zeroshot_classifier = pipeline("zero-shot-classification", truncation='only_first', device=device)
print(zeroshot_classifier.model.num_parameters() * 4 / 2**30)


No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


1.517475139349699


## Running

In [7]:
def run(ds, model, need_labels = True):
    if need_labels:
        res = model(
            ds['review_body'],
            candidate_labels=["NEGATIVE", "POSITIVE"])
    else:
        res = model(ds['review_body'])

    if need_labels:
        pred = [x['labels'][0] == "POSITIVE" for x in res]
    else:
        pred = [x['label'] == "POSITIVE" for x in res]

    pred = torch.tensor(pred).int()
    labels = torch.tensor(ds['bin_label'])
    return torch.mean((pred == labels).float())


In [149]:
zeroshot_res = pd.DataFrame(data = np.zeros((4, 2)), columns = ['sent_test', 'sent_val'], index=['en', 'fr', 'de', 'es'])
for lang in lang_list:
    ds_test = data[lang]['test']
    ds_val = data[lang]['validation']
    test_lres = run(ds_test, zeroshot_classifier, need_labels=True).item()
    val_lres = run(ds_val, zeroshot_classifier, need_labels=True).item()

    zeroshot_res.at[lang, 'sent_test'] = test_lres
    zeroshot_res.at[lang, 'sent_val'] = val_lres
    print(f"{lang} ", test_lres, val_lres)


nice_df(zeroshot_res)



en  0.8962500095367432 0.8972499966621399
fr  0.8090000152587891 0.8212500214576721
de  0.8054999709129333 0.7994999885559082
es  0.8690000176429749 0.8582500219345093


Unnamed: 0,sent_test,sent_val
en,0.89625,0.89725
fr,0.809,0.82125
de,0.8055,0.7995
es,0.869,0.85825


In [133]:
sentiment_res = pd.DataFrame(data = np.zeros((4, 2)), columns = ['sent_test', 'sent_val'], index=['en', 'fr', 'de', 'es'])
for lang in lang_list:
    ds_test = data[lang]['test']
    ds_val = data[lang]['validation']
    test_lres = run(ds_test, pipeline_classifier, need_labels=False).item()
    val_lres = run(ds_val, pipeline_classifier, need_labels=False).item()

    sentiment_res.at[lang, 'sent_test'] = test_lres
    sentiment_res.at[lang, 'sent_val'] = val_lres
    print(f"{lang} ", test_lres, val_lres)

nice_df(sentiment_res)



en  0.8510000109672546 0.846750020980835
fr  0.6292499899864197 0.6187499761581421
de  0.5120000243186951 0.5117499828338623
es  0.6157500147819519 0.6127499938011169


Unnamed: 0,sent_test,sent_val
en,0.851,0.84675
fr,0.62925,0.61875
de,0.512,0.51175
es,0.61575,0.61275


## Rubbish

In [6]:
for lang in lang_list:
    print(data[lang]['train'][0]['attention_mask'].sum(), (data[lang]['train'][0]['attention_mask'] >= 0).sum())
    # len(data['en']['train']['review_body'][0].split()), data['en']['train']['review_body'][0]
    input_ids = data[lang]['train'][0]['input_ids']
    print(len(input_ids))

tensor(137) tensor(512)
512
tensor(46) tensor(400)
400
tensor(18) tensor(448)
448
tensor(34) tensor(368)
368


In [160]:
for lang in lang_list:
    train_dataloader = torch.utils.data.DataLoader(data[lang]['train'], batch_size=8, shuffle=True)

    for batch in train_dataloader:
        if batch['input_ids'].shape[-1] != 512:
            print(lang, batch['input_ids'].shape[-1])
            break
        pass



RuntimeError: stack expects each tensor to be equal size, but got [304] at entry 0 and [384] at entry 1

In [156]:
star_dist = pd.DataFrame(data = np.zeros((4, 5)), columns = np.arange(1, 6), index=['en', 'fr', 'de', 'es'])

for lang in lang_list:
    ds_test = load_from_disk(f'handle_amazon/amazon_{lang}')['test']
    for i in range(1, 6):
        lres = (ds_test['stars'] == i).int().sum().item()
        star_dist.at[lang, i] = lres
nice_df(star_dist) 


Unnamed: 0,1,2,3,4,5
en,1000.0,1000.0,0.0,1000.0,1000.0
fr,1000.0,1000.0,0.0,1000.0,1000.0
de,1000.0,1000.0,0.0,1000.0,1000.0
es,1000.0,1000.0,0.0,1000.0,1000.0


In [3]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id, output_hidden_states=True)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [10]:
X_train_german = [
    "Mit keinem guten Ergebnis",
    "Das war unfair",
    "Das ist gar nicht mal so gut",
    "nicht so schlecht wie erwartet",
    "Das war gut!",
    "Sie fahrt ein grunes Auto",
]
X_train_spain = [
    "Sin buen resultado.",
    "Eso fue injusto.",
    "Eso ni siquiera es tan bueno.",
    "no tan malo como se esperaba.",
    "¡Eso estuvo bueno!",
    "Conduce un coche verde.",
]

with torch.no_grad():
    data = torch.tensor(tokenizer(X_train_german, padding=True, pad_to_multiple_of=32).input_ids)
    print(data.shape)
    print(model(data).hidden_states[-1].mean(dim=1).shape)

with torch.no_grad():
    data = tokenizer(X_train_german[5], padding=True, return_tensors='pt').input_ids
    print(torch.argmax(model(data).logits, axis=1))
    print([x['label'] for x in pipeline_classifier(X_train_german)])

with torch.no_grad():
    print([x['labels'][0] for x in zeroshot_classifier(X_train_german, candidate_labels=["NEGATIVE", "POSITIVE", "NEUTRAL"])])

with torch.no_grad():
    data = tokenizer(X_train_german[5], padding=True, return_tensors='pt').input_ids
    print(torch.argmax(model(data).logits, axis=1))
    print([x['label'] for x in pipeline_classifier(X_train_spain)])

with torch.no_grad():
    print([x['labels'][0] for x in zeroshot_classifier(X_train_spain, candidate_labels=["NEGATIVE", "POSITIVE", "NEUTRAL"])])



torch.Size([6, 32])
torch.Size([6, 768])
tensor([0])


NameError: name 'pipeline_classifier' is not defined