In [1]:
import pandas as pd

### WiC dataset

https://pilehvar.github.io/wic/

In [2]:
traindf = pd.read_csv("WiC_dataset/train/train.data.txt",sep="\t",header=None,names=["word","pos","index","sent1","sent2"])

In [3]:
labels = open("WiC_dataset/train/train.gold.txt","r").read().split()

In [4]:
traindf["labels"] = labels

- Label F: the word has different senses in sent1 and sent2
- Label T: the word has the same sense

In [5]:
traindf.head()

Unnamed: 0,word,pos,index,sent1,sent2,labels
0,carry,V,2-1,You must carry your camping gear .,Sound carries well over water .,F
1,go,V,2-6,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,F
2,break,V,0-2,Break an alibi .,The wholesaler broke the container loads into ...,F
3,cup,N,8-4,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,T
4,academy,N,1-2,The Academy of Music .,The French Academy .,F


### Get Contextualised Embeddings with BERT

Extrahiert mit dem BERT-Modell die Embeddings für die Zielwörter in den Sätzen und berechnet die cosine similarity für die Embeddings der Zielwörter in einem Satzpaar. Tragt die similarity als zusätzliche Spalte in den Pandas-Dataframe ein. Berechnet die durchschnittliche Ähnlichkeit für Satzpaare mit dem Label F/T.

In [6]:
from transformers import BertTokenizer, BertModel

# Load the pre-trained BERT model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# The target words are in the 'word' column
# The sentences are in the 'sent1' and 'sent2' columns
# the index is encoded in the 'index' column as {index in sent1}-{index in sent2}

import torch
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(sent1, sent2, index, target_word): 
    # Tokenize the sentences
    sent1_tokens = tokenizer(sent1, truncation=True, padding=True, return_tensors='pt')
    sent2_tokens = tokenizer(sent2, truncation=True, padding=True, return_tensors='pt')

    # Encode the sentences using the BERT model
    with torch.no_grad():
        sent1_outputs = model(**sent1_tokens)
        sent2_outputs = model(**sent2_tokens)
    
    #convert index to a list of two integers
    index = index.split("-")
        
    # get the index of the target word in the tokenized sentence
    target_word_index_sent1 = int(index[0])
    target_word_index_sent2 = int(index[1])

    # Get the embeddings for the target words
    target_word_embeddings_sent1 = sent1_outputs.last_hidden_state[:, target_word_index_sent1, :]
    target_word_embeddings_sent2 = sent2_outputs.last_hidden_state[:, target_word_index_sent2, :]

    # Calculate the cosine similarity
    similarity = cosine_similarity(target_word_embeddings_sent1, target_word_embeddings_sent2)
    
    return similarity

In [20]:
# Add a column to the dataframe with the similarity scores
traindf['similarity'] = traindf.apply(lambda x: compute_similarity(x['sent1'], x['sent2'], x['index'], x['word']), axis=1)

In [21]:
traindf

Unnamed: 0,word,pos,index,sent1,sent2,labels,similarity
0,carry,V,2-1,You must carry your camping gear .,Sound carries well over water .,F,[[0.34953433]]
1,go,V,2-6,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?,F,[[0.2927307]]
2,break,V,0-2,Break an alibi .,The wholesaler broke the container loads into ...,F,[[0.12359998]]
3,cup,N,8-4,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .,T,[[0.41982773]]
4,academy,N,1-2,The Academy of Music .,The French Academy .,F,[[0.32885548]]
...,...,...,...,...,...,...,...
5423,krona,N,4-8,Piecas kronas — five krona .,Kronas kurss — the exchange rate of the krona .,T,[[0.45259044]]
5424,conflict,N,3-1,The harder the conflict the more glorious the ...,The conflict between the government and the re...,T,[[0.36872223]]
5425,answer,V,0-0,Answer the riddle .,Answer a question .,T,[[0.9479373]]
5426,play,V,0-0,Play the casinos in Trouville .,Play the races .,T,[[0.9039677]]


In [23]:
filtered_df_T = traindf[traindf['labels'] == 'T']
average_similarity_T = filtered_df_T['similarity'].mean()

filtered_df_F = traindf[traindf['labels'] == 'F']
average_similarity_F = filtered_df_F['similarity'].mean()

print(f"Average similarity for true labels: {average_similarity_T}")
print(f"Average similarity for false labels: {average_similarity_F}")


Average similarity for true labels: [[0.42886347]]
Average similarity for false labels: [[0.3418881]]
