In [1]:
#tools

import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

In [2]:
from google.colab import files
uploaded = files.upload()

Saving pronouns_full_set.csv to pronouns_full_set.csv


In [3]:
df = pd.read_csv('pronouns_full_set.csv', sep='\t', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,22533,14,SP,disagreement,,"like i say , the cargo deck 's the best place ...",body,أحدكم,някой,neko,...,ktoś,alguém,cineva,кто-то,nekdo,neko,nån,biri,có ai,
1,780,9,DN,autoDN,,on the ground ! stay on the ground ! nobody ma...,body,يتحركن,никой,ni,...,,ninguém,nimeni,никому,nihče,нико,ingen,kimse kımıldamasın,quậy,谁也
2,4850,0,QU,agreement,,anybody else want to negotiate ? you ?,body,أيريد,някой,neko,...,któryś,alguém,cineva,кто-нибудь,kdo,neko,någon,isteyen,ai,有人
3,994,4,CD,agreement,,"you know , if anybody else had said that to me...",body,أحداً,,tko,...,kto,alguém,mi-ar,,omenil kdo,ko,någon,başkası,người,他人
4,1287,0,FC,disagreement,,anybody here ?,body,أحد,някой,koga,...,kto,alguém,cineva,живые,kdo,кога,någon,kimse,ai,有人


In [4]:
# sentences = df[5]
# english_pronoun = df[14]

In [5]:
#creating dict separated by pronoun

pronoun_dict = {}

for i,row in df.iterrows():
  sentence = row[5]
  pronoun = row[14]

  if pd.isna(pronoun) == True:
    continue

  if pronoun in pronoun_dict:
    pronoun_dict[pronoun].append(sentence)
  else:
    pronoun_dict[pronoun] = [sentence]

In [6]:
#check it was added to dictionary correctly, filter out nothin' and somethin' (3 total sentences)
#also filtering so that there is only one of each some-, any-, and none- pronoun,
#seeing as there is no distinction presented by Haspelmath between someone and something, for example.

pronoun_dict = {key: value for key, value in pronoun_dict.items() if key not in ["nothin", "somethin"]}
pronoun_dict = {key: value for key, value in pronoun_dict.items() if key not in ["anyone", "someone", "something","nothing", "anything"]}

print(pronoun_dict.keys())

dict_keys(['somebody', 'nobody', 'anybody'])


In [7]:
#set up model and pretrained tokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [8]:
#get embeddings per pronoun

def pronoun_embedding(sentence,pronoun,tokenizer,model):
  tokenized_sentence = tokenizer(sentence, return_tensors='pt')

  with torch.no_grad():
    outputs = model(**tokenized_sentence)

  embeddings = outputs.last_hidden_state.squeeze(0)

  pronoun_index = None
  tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence['input_ids'].squeeze(0))

  for index, token in enumerate(tokens):
        if pronoun.lower() == token.lower():
            pronoun_index = index
            break

  if pronoun_index is not None:
        pronoun_embedding = embeddings[pronoun_index]
        return pronoun_embedding
  else:
      return None

In [9]:
#average all embeddings per pronoun

def average_embeddings(pronoun_dict, tokenizer, model):
    pronoun_embeddings = {}

    for pronoun, sentences in pronoun_dict.items():
        embeddings = []

        for sentence in sentences:
            embedding = pronoun_embedding(sentence, pronoun, tokenizer, model)
            if embedding is not None:
                embeddings.append(embedding)

        if len(embeddings) != 0:
            avg_embedding = torch.mean(torch.stack(embeddings), dim=0)
            pronoun_embeddings[pronoun] = avg_embedding

    return pronoun_embeddings

average_embeddings_dict = average_embeddings(pronoun_dict, tokenizer, model)

In [10]:
print(average_embeddings_dict)

{'somebody': tensor([ 3.4068e-01,  2.0021e-01,  3.6116e-01, -8.2985e-02,  1.7348e-01,
         5.1130e-01,  2.3841e-01,  9.3146e-01, -4.3938e-01, -2.6642e-01,
         1.5537e-01, -5.1557e-01, -3.9677e-02,  4.4010e-01, -6.5021e-01,
         1.5900e-01, -7.8113e-02, -2.9340e-01, -1.5872e-01,  2.1088e-01,
         1.4265e-01,  1.5540e-01, -5.8461e-01,  3.2609e-01,  8.5488e-02,
         4.8938e-01,  3.2595e-01, -1.7764e-01, -9.1639e-02,  1.9073e-01,
         6.1263e-01,  1.1125e-01,  5.3384e-02, -4.1988e-01, -3.5548e-01,
        -1.7004e-02,  3.4375e-01,  2.5634e-01, -2.2395e-01, -4.1946e-02,
        -4.9737e-01, -7.3993e-01, -3.8157e-01,  3.5038e-02, -6.4744e-02,
        -1.3244e-01,  3.0229e-01,  4.0455e-01,  3.0134e-01, -8.6208e-01,
        -5.0137e-01,  2.5509e-01, -9.2326e-01, -3.5359e-01,  4.4742e-01,
         3.3704e-01,  4.5537e-01, -6.4989e-01, -1.7045e-01, -1.2984e-01,
        -9.4850e-01, -3.3925e-01,  3.4771e-01, -7.3043e-01,  7.7977e-02,
         7.4731e-01,  2.3844e-01,  2.1

In [11]:
# implication map, according to the map presented by Haspelmath (1997, p. 65)

      # specific known, specific unknown, irrealis non-specific, question, conditional, indirect negation, comparative, direct negation, free-choice
implication_map = {'somebody': np.array([1,1,1,1,1,0,0,0,0]),
                'nobody': np.array([0,0,0,0,0,0,0,1,0]),
                'anybody': np.array([0,0,0,1,1,1,1,1,1])}


                # 'anyone': np.array([0,0,0,1,1,1,1,1,1]),
                # 'someone': np.array([1,1,1,1,1,0,0,0,0]),
                # 'something': np.array([1,1,1,1,1,0,0,0,0]),
                # 'nothing': np.array([0,0,0,0,0,0,0,1,0]),
                # 'anything': np.array([0,0,0,1,1,1,1,1,1])}

In [15]:
#cosine distances for all pronoun combos in embeddings

# pronouns = ['somebody', 'nobody', 'anybody']

embeddings_cosine_distances = []

for pronoun_1, embedding_1 in average_embeddings_dict.items():
    for pronoun_2, embedding_2 in average_embeddings_dict.items():
        if pronoun_1 != pronoun_2:
            cosine_distance = cosine(embedding_1.numpy(), embedding_2.numpy())
            embeddings_cosine_distances.append((pronoun_1, pronoun_2, cosine_distance))

print(embeddings_cosine_distances)

#only the distances
embeddings_distances = [distance for pro1, pro2, distance in embeddings_cosine_distances]
embeddings_distances

[('somebody', 'nobody', np.float32(0.27427882)), ('somebody', 'anybody', np.float32(0.21470237)), ('nobody', 'somebody', np.float32(0.27427882)), ('nobody', 'anybody', np.float32(0.22016323)), ('anybody', 'somebody', np.float32(0.21470237)), ('anybody', 'nobody', np.float32(0.22016323))]


[np.float32(0.27427882),
 np.float32(0.21470237),
 np.float32(0.27427882),
 np.float32(0.22016323),
 np.float32(0.21470237),
 np.float32(0.22016323)]

In [16]:
#cosine distance for all pronoun combos in implicational map

implication_map_cosine_distances = []

for pronoun_1, embedding_1 in implication_map.items():
    for pronoun_2, embedding_2 in implication_map.items():
        if pronoun_1 != pronoun_2:
            cosine_distance = cosine(embedding_1, embedding_2)
            implication_map_cosine_distances.append((pronoun_1, pronoun_2, cosine_distance))

print(implication_map_cosine_distances)

#only the distances
implication_distances = [distance for pro1, pro2, distance in implication_map_cosine_distances]
implication_distances

[('somebody', 'nobody', np.float64(1.0)), ('somebody', 'anybody', np.float64(0.6348516283298893)), ('nobody', 'somebody', np.float64(1.0)), ('nobody', 'anybody', np.float64(0.5917517095361369)), ('anybody', 'somebody', np.float64(0.6348516283298893)), ('anybody', 'nobody', np.float64(0.5917517095361369))]


[np.float64(1.0),
 np.float64(0.6348516283298893),
 np.float64(1.0),
 np.float64(0.5917517095361369),
 np.float64(0.6348516283298893),
 np.float64(0.5917517095361369)]

In [14]:
#Spearman correlation between implication and embeddings distances

correlation, p_value = spearmanr(embeddings_distances, implication_distances)

print("Correlation:", correlation)
print("P value:", p_value)

Correlation: 0.5000000000000001
P value: 0.31250000000000006
