In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from peft import AutoPeftModelForTokenClassification
from datasets import load_dataset
from glob import glob

In [2]:
dataset = load_dataset("clarin-knext/wsd_polish_datasets", trust_remote_code=True)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'phrases', 'wsd'],
        num_rows: 7848
    })
})

In [4]:
dataset['train'].features['tokens'].feature['pos']

Value(dtype='string', id=None)

In [5]:
unique_pos_tags = set()

# Loop through each example in the train dataset
for example in dataset['train']:
    print(example)
    # Each example['tokens'] should be a list of token dictionaries
    for token in example['tokens']:
        # Append the 'pos' field of each token to the set
        # unique_pos_tags.add(token['pos'])
        # print(token['pos'])
        break
    break
# Print the unique POS tags
print(unique_pos_tags)

{'text': 'Zatem to pani siostra o niego prosiła? Skądże, nigdy nie słyszałam, żeby go używała. Zwykłyśmy obywać się bez służących. W istocie, wydaje się niepotrzebnym umieszczanie tak porządnego sznura w tym miejscu. Bądźcie teraz tak uprzejmi i dajcie mi kilka minut na bliższe zapoznanie się z podłogą. Holmes przypadł do podłogi, pełzał w te i wewte ze szkłem powiększającym w dłoni, ze skupieniem badając przerwy między deskami. Następnie w podobny sposób przyjrzał się deskom na ścianach komnaty. Potem podszedł do łóżka i przez jakiś czas przyglądał się uważnie zarówno jemu, jak i przylegającej doń ścianie. Wreszcie chwycił sznur i energicznie go pociągnął. Ależ to przecież atrapa - powiedział. Nie dzwoni? Bynajmniej, nie jest nawet połączony z drucikiem. Niezwykle interesujące. Zwróćcie uwagę, proszę, że jest podpięty do haka tuż nad niewielkim otworem pełniącym funkcję wywietrznika. Toż to niedorzeczne! Nie zauważyłam tego wcześniej. Bardzo dziwne - wymamrotał Holmes, pociągając za s

In [6]:
tokenizer_bert = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model_bert = AutoModelForTokenClassification.from_pretrained("allegro/herbert-base-cased")

tokenizer_gpt = AutoTokenizer.from_pretrained("sdadas/polish-gpt2-medium", add_prefix_space=True)
tokenizer_gpt.pad_token = tokenizer_gpt.eos_token
model_gpt = AutoModelForTokenClassification.from_pretrained("sdadas/polish-gpt2-medium", pad_token_id=tokenizer_gpt.pad_token_id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at sdadas/polish-gpt2-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def preprocess_token_classification(examples):
    # print(examples['tokens'][1]['orth'])
    inputs = []
    for example in examples['tokens']:
        inputs.append(example['orth'])
    tokenized_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=128, is_split_into_words=True)
    labels = []
    pos = []
    for example in examples['tokens']:
        pos.append(example['pos'])
    # for i, label in enumerate(pos):
    #     word_ids = tokenized_inputs.word_ids(batch_index=i)
    #     label_ids = [label[word_id] for word_id in word_ids]
    #     labels.append(label_ids)
    tokenized_inputs["labels"] = pos
    return tokenized_inputs

In [8]:
tokenizer=tokenizer_bert

In [9]:
dataset_bert = dataset.map(preprocess_token_classification, batched=True)

In [15]:
tokenizer=tokenizer_gpt

In [19]:
dataset_gpt = dataset.map(preprocess_token_classification, batched=True)

Map:   0%|          | 0/7848 [00:00<?, ? examples/s]

In [21]:
sentence = dataset_bert['train'][0]['text']

In [63]:
def get_embeddings(text, tokenizer, model, layer=-1):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs,output_hidden_states=True)
    # print(outputs)
    # x = outputs.hidden_states[-1][:, 0, :].cpu().detach().numpy().squeeze()
    x = outputs.hidden_states[layer].cpu().detach().numpy().squeeze()  

    return x
    # return outputs.last_hidden_state.squeeze(0).detach()  # Embeddings for each token

# Example for a sentence
sentence = dataset['train'][0]['text']  # Replace with the correct column
bert_embeddings = get_embeddings(sentence, tokenizer_bert, model_bert, layer=-1)
gpt2_embeddings = get_embeddings(sentence, tokenizer_gpt, model_gpt, layer=-1)

In [62]:
from scipy.spatial.distance import cosine
import torch

def measure_anisotropy(embeddings):
    # Compute cosine similarities for pairs of embeddings
    cos_similarities = []
    num_samples = 1000  # Adjust for sampling efficiency

    for _ in range(num_samples):
        # print(embeddings.size)
        idx1, idx2 = torch.randint(0, embeddings.shape[0], (2,))
        emb1 = embeddings[idx1]
        # print(emb1)
        sim = 1 - cosine(embeddings[idx1], embeddings[idx2])
        cos_similarities.append(sim)
    
    return sum(cos_similarities) / len(cos_similarities)

bert_anisotropy = measure_anisotropy(bert_embeddings)
gpt2_anisotropy = measure_anisotropy(gpt2_embeddings)
print("BERT Anisotropy:", bert_anisotropy)
print("GPT-2 Anisotropy:", gpt2_anisotropy)

BERT Anisotropy: 0.7437189999118894
GPT-2 Anisotropy: 0.6160065718214576


In [72]:
num_layers_bert = len(model_bert.bert.encoder.layer)
num_layers_bert

12

In [73]:
num_layers_gpt = len(model_gpt.transformer.h)
num_layers_gpt

24

In [74]:
bert_anisotropies = []
for i in range(num_layers_bert):
    bert_embeddings = get_embeddings(sentence, tokenizer_bert, model_bert, layer=i)
    anisotropy = measure_anisotropy(bert_embeddings)
    bert_anisotropies.append(anisotropy)

gpt_anisotropies = []
for i in range(num_layers_gpt):
    gpt_embeddings = get_embeddings(sentence, tokenizer_gpt, model_gpt, layer=i)
    anisotropy = measure_anisotropy(gpt_embeddings)
    gpt_anisotropies.append(anisotropy)

In [85]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

# Assuming bert_anisotropies and gpt_anisotropies are already populated
# and num_layers_bert and num_layers_gpt are defined

# Create a DataFrame for the plot
data = {
    'Layer': list(range(num_layers_bert)) + list(range(num_layers_gpt)),
    'Anisotropy': bert_anisotropies + gpt_anisotropies,
    'Model': ['BERT'] * num_layers_bert + ['GPT-2'] * num_layers_gpt
}

df = pd.DataFrame(data)

# Create the scatter plot with connected lines using plotly.graph_objects
fig = go.Figure()

# Add BERT trace
fig.add_trace(go.Scatter(x=df[df['Model'] == 'BERT']['Layer'], 
                         y=df[df['Model'] == 'BERT']['Anisotropy'],
                         mode='markers+lines',  # 'markers+lines' to plot both dots and lines
                         name='BERT',
                         line=dict(shape='linear', dash='dot')))  # Connect the dots with a linear line

# Add GPT-2 trace
fig.add_trace(go.Scatter(x=df[df['Model'] == 'GPT-2']['Layer'], 
                         y=df[df['Model'] == 'GPT-2']['Anisotropy'],
                         mode='markers+lines',  # 'markers+lines' to plot both dots and lines
                         name='GPT-2',
                         line=dict(shape='linear', dash='dot')))  # Connect the dots with a linear line

# Add labels and title
fig.update_layout(
    title="Anisotropy Comparison: BERT vs GPT-2",
    xaxis_title="Layer Number",
    yaxis_title="Anisotropy Value",
    legend_title="Model",
)

# Show the plot
fig.show()
