In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/abhishek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import os 
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn

def read_semcor_data(path):
    """
    Read SemCor data from a directory and return a pandas DataFrame
    """
    data = []
    for subdir,_,files in os.walk(path):
        for file in files:
            if file.endswith('.naf'):
                file_path = os.path.join(subdir, file)
                tree = ET.parse(file_path)
                root = tree.getroot()
                context_pos_sent = []
                context_sent = []
                cont_dict = {}
                for text in root.findall('./text'):
                    for wf in text.findall('wf'):
                        key = wf.attrib['id'] + "%" + file_path
                        cont_dict[key] = wf.text

                # add the pos tag to the context
                for term in root.findall('./terms/term'):
                    target_word = term.find("./span/target").attrib['id'] + "%" + file_path
                    pos = term.attrib['pos']
                    context_pos_sent.append(cont_dict[target_word] + '_' + pos)
                    context_sent.append(cont_dict[target_word])
                context_pos = ' '.join(context_pos_sent)
                context = ' '.join(context_sent)


                for term in root.findall('./terms/term'):
                    sense_number = term.find("./externalReferences/externalRef[@reftype='sense_number']")
                    lemma = term.attrib['lemma']
                    lexical_key = term.find("./externalReferences/externalRef[@reftype='lexical_key']")
                    target_word = term.find("./span/target").attrib['id']
                    gloss = ""
                    wn_index = lemma 
                    is_proper_gloss = False
                    if lexical_key is not None:
                        lexical_key = lexical_key.attrib['reference']
                    if sense_number is not None and sense_number.attrib['reference'] != "0":
                        wn_index = lemma + "%" + lexical_key 

                        synset_val = term.find("./externalReferences/externalRef[@reftype='synset']")

                        if synset_val is not None:
                            synset_val = synset_val.attrib['reference']
                            synset_obj = wn.synset_from_pos_and_offset(
                                synset_val[-1], int(synset_val[6:-2]))
                            synset_full = synset_obj.name()
                            # print(synset_full)
                            # synset_name = synset_full[:synset_full.index('.')]
                            flg = 0
                            for sense in wn.synsets(lemma):
                                if sense.name() == synset_full:
                                    gloss = sense.definition()
                                    flg = 1
                                    break
                            if flg == 1:
                                is_proper_gloss = True

           
                    data.append({
                        "file": file_path,
                        "context": context,
                        "context_pos": context_pos,
                        "target_word": target_word,
                        "gloss": gloss,
                        "is_proper_gloss": is_proper_gloss,
                        "wn_index": wn_index,
                        "sense_full": synset_full,
                    })
                        

    return pd.DataFrame(data)


In [3]:
filePath = "../semcor3.0/brown2"
df = read_semcor_data(filePath)

In [4]:
df.head(20)


Unnamed: 0,file,context,context_pos,target_word,gloss,is_proper_gloss,wn_index,sense_full
0,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w0,a human being,True,person%1:03:00::,person.n.01
1,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w2,,False,for,person.n.01
2,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w3,,False,several%5:00:00:some(a):00,person.n.01
3,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w4,a period of time containing 365 (or 366) days,True,year%1:28:01::,year.n.01
4,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w5,in the historical present; at this point in th...,True,now%4:02:05::,now.r.01
5,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w6,,False,the,now.r.01
6,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w7,a passenger train that is ridden primarily by ...,True,commuter%1:06:00::,commuter.n.01
7,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w8,line that is the commercial organization respo...,True,railroad%1:06:00::,railway.n.01
8,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w9,"devote (part of) one's life or efforts to, as ...",True,serve%2:41:02::,serve.v.07
9,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w10,,False,our,serve.v.07


In [6]:
import tqdm 
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_semcor_data(dataset):
    cnt = 0
    # make copy of dataset
    data = dataset.copy()
    for i, row in tqdm.tqdm(dataset.iterrows(), total=len(dataset)):
        context_words = nltk.word_tokenize(row["context"])
        context_pos_words = nltk.word_tokenize(row["context_pos"])
        target_word_lemma = row["wn_index"].split("%")[0]
        synsets = wn.synsets(target_word_lemma)
        trg_idx = int(row['target_word'][1:])
        
        if len(synsets) >0:
            target_synset = synsets[0]
        else:
            target_synset = None
       
        window_start = max(0, trg_idx - 10)
        window_end = min(len(context_words), trg_idx + 10)
        context_window = context_words[window_start:trg_idx] + context_words[trg_idx+1:window_end]
        context_window_pos = context_pos_words[window_start:trg_idx] + context_pos_words[trg_idx+1:window_end]
        # print(len(context_window))
        if len(context_window) == 0:
            cnt = cnt + 1
        data.at[i, 'context'] = ' '.join(context_window)
        data.at[i, 'context_pos'] = ' '.join(context_window_pos)
        
    data =  data[data["gloss"]!=""]
    data = data[data["context_pos"]!=""]
    data = data[data["context"]!=""]
    # dataset = dataset.drop(columns=["wn_index"])
    print("Number of proper glosses: ", data["is_proper_gloss"].sum())
    print(cnt)
    return data


In [7]:
dataset = preprocess_semcor_data(df[:20000])

100%|██████████| 20000/20000 [04:36<00:00, 72.39it/s]

Number of proper glosses:  8941
2406





In [8]:
# save the dataset
dataset.to_csv("semcor3.csv", index=False)

In [9]:
dataset.head(20)

Unnamed: 0,file,context,context_pos,target_word,gloss,is_proper_gloss,wn_index,sense_full
0,../semcor3.0/brown2/br-h13.naf,for several years now the commuter railroads s...,for_IN several_JJ years_NN now_RB the_DT commu...,w0,a human being,True,person%1:03:00::,person.n.01
3,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years the commuter rai...,Mr._Speaker_NNP for_IN several_JJ years_NN the...,w4,a period of time containing 365 (or 366) days,True,year%1:28:01::,year.n.01
4,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now commuter rai...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w5,in the historical present; at this point in th...,True,now%4:02:05::,now.r.01
6,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w7,a passenger train that is ridden primarily by ...,True,commuter%1:06:00::,commuter.n.01
7,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w8,line that is the commercial organization respo...,True,railroad%1:06:00::,railway.n.01
8,../semcor3.0/brown2/br-h13.naf,Mr._Speaker for several years now the commuter...,Mr._Speaker_NNP for_IN several_JJ years_NN now...,w9,"devote (part of) one's life or efforts to, as ...",True,serve%2:41:02::,serve.v.07
10,../semcor3.0/brown2/br-h13.naf,for several years now the commuter railroads s...,for_IN several_JJ years_NN now_RB the_DT commu...,w11,above average in size or number or quantity or...,True,large%3:00:00::,large.a.01
11,../semcor3.0/brown2/br-h13.naf,several years now the commuter railroads servi...,several_JJ years_NN now_RB the_DT commuter_NN ...,w12,relating to or characteristic of a metropolis,True,metropolitan%3:01:00::,metropolitan.a.01
12,../semcor3.0/brown2/br-h13.naf,years now the commuter railroads serving our l...,years_NN now_RB the_DT commuter_NN railroads_N...,w13,a particular geographical region of indefinite...,True,area%1:15:01::,area.n.01
14,../semcor3.0/brown2/br-h13.naf,the commuter railroads serving our large metro...,the_DT commuter_NN railroads_NN serving_VB our...,w15,"establish after a calculation, investigation, ...",True,find%2:32:00::,determine.v.01


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(dataset, test_size=0.2, random_state=42)


In [15]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import torch
import numpy as np 

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
max_len = 20


def create_embeddings(sentences, tokenizer, model, max_length):
    input_ids = []
    attention_masks = []

    # Tokenize each sentence and add special tokens for BERT
    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the tokenized input into torch tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)


    # Feed the input to BERT and get the embeddings
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()

    return embeddings


embeddings = create_embeddings(
    X_train['context'], tokenizer, model, max_len)
np.save('knn_bert2.npy', embeddings)




  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 12.8kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 310kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.60MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.30MB/s]
Downloading pytorch_model.bin: 100%|██████████| 440M/440M [01:13<00:00, 6.00MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another ar

In [13]:

print(embeddings.shape)

(7152, 768)


In [14]:
test_embeddings = create_embeddings(
    X_test['context'], tokenizer, model, max_len)

In [15]:


XTrain = pd.DataFrame(columns=['file', 'context', 'target_word', 'gloss', 'is_proper_gloss','wn_index'])
XTest = pd.DataFrame(columns=['file', 'context', 'target_word', 'gloss', 'is_proper_gloss','wn_index'])

XTrain = pd.concat([XTrain, X_train], ignore_index=True)
XTest = pd.concat([XTest, X_test], ignore_index=True)

XTrain.head(10)

cosine_similarities = []
zero_cos = 0

for i,test_embedding in enumerate(test_embeddings):
    test_target_word = XTest.iloc[i]['wn_index']
    train_rows = XTrain[XTrain['wn_index'] == test_target_word ]
    for j,train_row in train_rows.iterrows():
        train_embedding = embeddings[j]
        similarity = np.dot(test_embedding, train_embedding) / (np.linalg.norm(test_embedding) * np.linalg.norm(train_embedding))
        cosine_similarities.append(similarity)
    if len(cosine_similarities) == 0:
        zero_cos += 1
    if len(cosine_similarities) > 0 :
        # without using loc
        XTest.at[i, 'cosine_similarity'] = max(cosine_similarities)
    else:
        # X_test.loc[X_test.index[i], 'cosine_similarity'] = -1
        XTest.at[i, 'cosine_similarity'] = -1
    cosine_similarities = []


XTest['correct'] = XTest.apply(lambda row: row['gloss'] in XTrain[XTrain['wn_index']
                                 == row['wn_index']].head(10)['gloss'].tolist(), axis=1)

# # print the top-5 glosses of X_train for every X_test row 
X_test['top_5_glosses'] = X_test.apply(lambda row: X_train[X_train['wn_index'] == row['wn_index']].head(5)['gloss'].tolist(), axis=1)





### Accuracy of KNN

In [16]:

accuracy = XTest['correct'].sum() / len(XTest)
print("Accuracy: ", accuracy*100)
X_test.head(10)

Accuracy:  67.13247624371157


Unnamed: 0,file,context,context_pos,target_word,gloss,is_proper_gloss,wn_index,sense_full,top_5_glosses
13331,../semcor3.0/brown2/br-n15.naf,which was up and which was down He held the wh...,His_PRP $ speed_NN was_VBD dropping_VB rapidly...,w1934,done or occurring in a brief period of time,True,rapid%5:00:00:fast:01,rapid.s.01,[]
9843,../semcor3.0/brown2/br-h11.naf,testing of new apparatus to measure other prop...,testing_NN of_IN new_JJ apparatus_NN to_TO mea...,w96,determine the measurements of something or som...,True,measure%2:31:00::,measure.v.01,[determine the measurements of something or so...
8804,../semcor3.0/brown2/br-g12.naf,not attempt to answer I asked about the battle...,all_DT is_VB settled_JJ But_CC it_PRP is_VB di...,w1172,capable of being foretold,True,predictable%3:00:00::,predictable.a.01,[]
19525,../semcor3.0/brown2/br-g16.naf,artists who have documented the disintegrative...,generations_NN of_IN artists_NN who_WP have_VB...,w84,record in detail,True,document%2:32:00::,document.v.01,[]
12028,../semcor3.0/brown2/br-n15.naf,watching for every speck in the sky Greg rumbl...,combat_mission_NN Yet_RB long_RB before_IN the...,w424,a human being,True,person%1:03:00::,person.n.01,"[a human being, a human being, a human being, ..."
2716,../semcor3.0/brown2/br-e28.naf,important If you have a higher quality product...,can_MD you_PRP cash_in_on_VB this_DT fast-grow...,w888,a business relation in which two parties compe...,True,competition%1:24:01::,competition.n.01,[a business relation in which two parties comp...
13323,../semcor3.0/brown2/br-n15.naf,back He fought the panic of vertigo He had no ...,him_PRP near_IN the_DT overcast_NN almost_RB i...,w1923,rise dramatically,True,shoot_up%2:30:00::,shoot_up.v.01,[]
14189,../semcor3.0/brown2/br-l16.naf,over the laundry_truck One more muddleheaded p...,if_IN he_PRP would_MD make_VB a_DT fool_NN of_...,w734,make it possible through a specific action or ...,True,let%2:41:00::,let.v.01,[make it possible through a specific action or...
15648,../semcor3.0/brown2/br-g17.naf,an automobile ride from Memphis to Hattiesburg...,an_DT automobile_NN ride_NN from_IN Memphis_NN...,w56,a motor vehicle with four wheels; usually prop...,True,automobile%1:06:00::,car.n.01,[a motor vehicle with four wheels; usually pro...
14253,../semcor3.0/brown2/br-l16.naf,Griffith 's carrying his message and he had no...,had_VBD not_RB stayed_on_VB the_DT front_JJ st...,w822,subjected to great tension; stretched tight,True,taut%5:00:00:tense:03,taut.s.02,[]


# Naive Bayes

In [17]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
data = pd.read_csv("semcor3.csv")
print(len(data))
test_data = data[:500]
train_data = data[500:]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Compute BERT embeddings for each sentence and target word
embeddings = []
for i, row in data.iterrows():
    context = row['context']
    context_pos = row['context_pos']
    # print(len(context_pos.split()))
    target_word = row['sense_full']
    gloss = row['gloss']
    tokens = tokenizer.encode(context, target_word, add_special_tokens=False)
    pos_tag_tokens = tokenizer.encode(context, add_special_tokens=False)
    # shuffle the pos_tag_tokens list
    np.random.shuffle(pos_tag_tokens)
    gloss_tokens = tokenizer.encode(gloss, add_special_tokens=False)
    tokens = tokens + [tokenizer.sep_token_id] + gloss_tokens 

    input_ids = torch.tensor(tokens).unsqueeze(0).to(device)
    pos_tag_ids = torch.tensor(pos_tag_tokens).unsqueeze(0).to(device)
    # Compute BERT embeddings
    with torch.no_grad():
        output = model(input_ids)
        context_embedding = output[0][0][1:-1].mean(dim=0).cpu()
        pos_embedding = model(pos_tag_ids)[0][0][1:-1].mean(dim=0).cpu()

    embedding = torch.cat([context_embedding, pos_embedding], dim=0).cpu().numpy()
    embeddings.append(embedding) 



8941
cuda:0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# save the embeddings
# embedding = embeddings.cpu().numpy()
np.save('naive_bert_embeddings.npy', embeddings)

In [20]:
import numpy as np
x_train = np.array(embeddings)
y_train = data['wn_index'].values
# print(x_train[0])


In [21]:
test_embeddings = embeddings[:500]
x_test = np.array(test_embeddings)

In [22]:
prior_probs = {}
likelihoods = {}
y_test = test_data['wn_index'].values

for sense in set(y_train):
    sense_count = sum(y_train == sense)
    # print(sense,sense_count)
    prior_probs[sense] = sense_count / len(y_train)   
    sense_embeddings = [x_train[i] for i in range(len(x_train)) if y_train[i] == sense]
    likelihoods[sense] = np.mean(sense_embeddings, axis=0)


In [23]:
y_pred = []
for i in range(0,len(y_test)):
    max_prob = 0
    max_sense = None
    for sense in set(y_train):
        log_prob = prior_probs[sense]
        # print(log_prob)
        for j in range(len(x_test[i])):
            log_prob = log_prob + x_test[i][j] * likelihoods[sense][j]
        if log_prob > max_prob:
            max_prob = log_prob
            max_sense = sense
    y_pred.append(max_sense)

accuracy = sum(y_pred[i] == y_test.tolist()[i]
               for i in range(len(y_test)))/len(y_test)  


## Accuracy with context + poss

In [24]:
print(accuracy*100)

40.2


## Accuracy with context as only feature

In [9]:
print(accuracy*100)

51.0
