In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from peft import AutoPeftModelForTokenClassification
from datasets import load_dataset
from glob import glob

In [2]:
dataset = load_dataset("clarin-knext/wsd_polish_datasets", trust_remote_code=True)

README.md:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

wsd_polish_datasets.py:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

sherlock_text.jsonl:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

skladnica_text.jsonl:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

wikiglex_text.jsonl:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

emoglex_text.jsonl:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

walenty_text.jsonl:   0%|          | 0.00/50.6M [00:00<?, ?B/s]

kpwr_text.jsonl:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

kpwr-100_text.jsonl:   0%|          | 0.00/8.02M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'phrases', 'wsd'],
        num_rows: 7848
    })
})

In [4]:
dataset['train'].features['tokens'].feature['pos']

Value(dtype='string', id=None)

In [7]:
tokenizer_bert = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model_bert = AutoModelForTokenClassification.from_pretrained("allegro/herbert-base-cased")

tokenizer_gpt = AutoTokenizer.from_pretrained("sdadas/polish-gpt2-medium", add_prefix_space=True)
tokenizer_gpt.pad_token = tokenizer_gpt.eos_token
model_gpt = AutoModelForTokenClassification.from_pretrained("sdadas/polish-gpt2-medium", pad_token_id=tokenizer_gpt.pad_token_id)

pytorch_model.bin:  64%|######4   | 419M/654M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at sdadas/polish-gpt2-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
sentence = dataset['train'][0]['text']

In [9]:
def get_embeddings(text, tokenizer, model, layer=-1):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs,output_hidden_states=True)
    # print(outputs)
    # x = outputs.hidden_states[-1][:, 0, :].cpu().detach().numpy().squeeze()
    x = outputs.hidden_states[layer].cpu().detach().numpy().squeeze()  

    return x
    # return outputs.last_hidden_state.squeeze(0).detach()  # Embeddings for each token

# Example for a sentence
sentence = dataset['train'][0]['text']  # Replace with the correct column
bert_embeddings = get_embeddings(sentence, tokenizer_bert, model_bert, layer=-1)
gpt2_embeddings = get_embeddings(sentence, tokenizer_gpt, model_gpt, layer=-1)

In [10]:
from scipy.spatial.distance import cosine
import torch

def measure_anisotropy(embeddings):
    # Compute cosine similarities for pairs of embeddings
    cos_similarities = []
    num_samples = 1000  # Adjust for sampling efficiency

    for _ in range(num_samples):
        # print(embeddings.size)
        idx1, idx2 = torch.randint(0, embeddings.shape[0], (2,))
        emb1 = embeddings[idx1]
        # print(emb1)
        sim = 1 - cosine(embeddings[idx1], embeddings[idx2])
        cos_similarities.append(sim)
    
    return sum(cos_similarities) / len(cos_similarities)

bert_anisotropy = measure_anisotropy(bert_embeddings)
gpt2_anisotropy = measure_anisotropy(gpt2_embeddings)
print("BERT Anisotropy:", bert_anisotropy)
print("GPT-2 Anisotropy:", gpt2_anisotropy)

BERT Anisotropy: 0.7217383639671769
GPT-2 Anisotropy: 0.24481897037399522


In [11]:
num_layers_bert = len(model_bert.bert.encoder.layer)
num_layers_bert

12

In [12]:
num_layers_gpt = len(model_gpt.transformer.h)
num_layers_gpt

24

In [13]:
bert_anisotropies = []
for i in range(num_layers_bert):
    bert_embeddings = get_embeddings(sentence, tokenizer_bert, model_bert, layer=i)
    anisotropy = measure_anisotropy(bert_embeddings)
    bert_anisotropies.append(anisotropy)

gpt_anisotropies = []
for i in range(num_layers_gpt):
    gpt_embeddings = get_embeddings(sentence, tokenizer_gpt, model_gpt, layer=i)
    anisotropy = measure_anisotropy(gpt_embeddings)
    gpt_anisotropies.append(anisotropy)

In [14]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

data = {
    'Layer': list(range(num_layers_bert)) + list(range(num_layers_gpt)),
    'Anisotropy': bert_anisotropies + gpt_anisotropies,
    'Model': ['BERT'] * num_layers_bert + ['GPT-2'] * num_layers_gpt
}

df = pd.DataFrame(data)

fig = go.Figure()

fig.add_trace(go.Scatter(x=df[df['Model'] == 'BERT']['Layer'], 
                         y=df[df['Model'] == 'BERT']['Anisotropy'],
                         mode='markers+lines',
                         name='BERT',
                         line=dict(shape='linear', dash='dot')))

fig.add_trace(go.Scatter(x=df[df['Model'] == 'GPT-2']['Layer'], 
                         y=df[df['Model'] == 'GPT-2']['Anisotropy'],
                         mode='markers+lines',
                         name='GPT-2',
                         line=dict(shape='linear', dash='dot')))


fig.update_layout(
    title="Anisotropy Comparison: BERT vs GPT-2",
    xaxis_title="Layer Number",
    yaxis_title="Anisotropy Value",
    legend_title="Model",
)

fig.show()


In [15]:
sentences = dataset['train'][:10]['text']  # Replace with the correct column

In [16]:
joined = " ".join(sentences)

In [17]:
def context_specificity(token, dataset, tokenizer, model, layer=-1):
    embeddings = []
    texts = dataset['text']
    for example in texts:
        try:
            inputs = tokenizer(example, return_tensors="pt")
            outputs = model(**inputs, output_hidden_states=True).hidden_states[layer].squeeze(0).detach()
        except:
            continue

        # Get index of token in the text
        token_id = tokenizer.convert_tokens_to_ids(token)
        if token_id in inputs['input_ids']:
            token_index = (inputs['input_ids'] == token_id).nonzero(as_tuple=True)[1]
            embeddings.append(outputs[token_index].mean(0))  # Averaging over token occurrences

    # Compute average cosine similarity between each pair of embeddings
    cos_similarities = []
    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            sim = 1 - cosine(embeddings[i], embeddings[j])
            cos_similarities.append(sim)

    return sum(cos_similarities) / len(cos_similarities) if cos_similarities else None

# Example usage
bert_context_specificity = context_specificity("nie", dataset['train'][:100], tokenizer_bert, model_bert)
gpt2_context_specificity = context_specificity("nie", dataset['train'][:100], tokenizer_gpt, model_gpt)

print("BERT Context-Specificity:", bert_context_specificity)
print("GPT-2 Context-Specificity:", gpt2_context_specificity)


Token indices sequence length is longer than the specified maximum sequence length for this model (1781 > 512). Running this sequence through the model will result in indexing errors


BERT Context-Specificity: 0.8382445823264426
GPT-2 Context-Specificity: 0.24329707611341647


In [18]:
bert_context = []
for i in range(num_layers_bert):
    context = context_specificity("nie", dataset['train'][:100], tokenizer_bert, model_bert, layer=i)
    bert_context.append(context)

gpt_context = []
for i in range(num_layers_gpt):
    context = context_specificity("nie", dataset['train'][:100], tokenizer_gpt, model_gpt, layer=i)
    gpt_context.append(context)

In [None]:
data = {
    'Layer': list(range(num_layers_bert)) + list(range(num_layers_gpt)),
    'Context': bert_context + gpt_context,
    'Model': ['BERT'] * num_layers_bert + ['GPT-2'] * num_layers_gpt
}

df = pd.DataFrame(data)

fig = go.Figure()

fig.add_trace(go.Scatter(x=df[df['Model'] == 'BERT']['Layer'], 
                         y=df[df['Model'] == 'BERT']['Context'],
                         mode='markers+lines',
                         name='BERT',
                         line=dict(shape='linear', dash='dot')))

fig.add_trace(go.Scatter(x=df[df['Model'] == 'GPT-2']['Layer'], 
                         y=df[df['Model'] == 'GPT-2']['Context'],
                         mode='markers+lines',
                         name='GPT-2',
                         line=dict(shape='linear', dash='dot')))


fig.update_layout(
    title="Context-Specificity Comparison: BERT vs GPT-2",
    xaxis_title="Layer Index",
    yaxis_title="Context-Specificity Value",
    legend_title="Model",
)

fig.show()

# Parameter Projection
