In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/abhishek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn


def read_semcor_data(path):
    """
    Read SemCor data from a directory and return a pandas DataFrame
    """
    data = []
    for subdir, _, files in os.walk(path):
        for file in files:
            if file.endswith('.naf'):
                file_path = os.path.join(subdir, file)
                tree = ET.parse(file_path)
                root = tree.getroot()
                context_pos_sent = []
                context_sent = []
                cont_dict = {}
                for text in root.findall('./text'):
                    for wf in text.findall('wf'):
                        key = wf.attrib['id'] + "%" + file_path
                        cont_dict[key] = wf.text

                # add the pos tag to the context
                for term in root.findall('./terms/term'):
                    target_word = term.find(
                        "./span/target").attrib['id'] + "%" + file_path
                    pos = term.attrib['pos']
                    context_pos_sent.append(cont_dict[target_word] + '_' + pos)
                    context_sent.append(cont_dict[target_word])
                context_pos = ' '.join(context_pos_sent)
                context = ' '.join(context_sent)

                for term in root.findall('./terms/term'):
                    sense_number = term.find(
                        "./externalReferences/externalRef[@reftype='sense_number']")
                    lemma = term.attrib['lemma']
                    lexical_key = term.find(
                        "./externalReferences/externalRef[@reftype='lexical_key']")
                    target_word = term.find("./span/target").attrib['id']
                    gloss = ""
                    wn_index = lemma
                    is_proper_gloss = False
                    synset_full = ""
                    if lexical_key is not None:
                        lexical_key = lexical_key.attrib['reference']
                    if sense_number is not None and sense_number.attrib['reference'] != "0":
                        wn_index = lemma + "%" + lexical_key

                        synset_val = term.find(
                            "./externalReferences/externalRef[@reftype='synset']")

                        if synset_val is not None:
                            synset_val = synset_val.attrib['reference']
                            synset_obj = wn.synset_from_pos_and_offset(
                                synset_val[-1], int(synset_val[6:-2]))
                            synset_full = synset_obj.name()
                            # print(synset_full)
                            # synset_name = synset_full[:synset_full.index('.')]
                            flg = 0
                            for sense in wn.synsets(lemma):
                                if sense.name() == synset_full:
                                    gloss = sense.definition()
                                    flg = 1
                                    break
                            if flg == 1:
                                is_proper_gloss = True

                    data.append({
                        "file": file_path,
                        "context": context,
                        "context_pos": context_pos,
                        "target_word": target_word,
                        "gloss": gloss,
                        "is_proper_gloss": is_proper_gloss,
                        "wn_index": wn_index,
                        "sense_full": synset_full,
                    })

    return pd.DataFrame(data)


In [29]:
filePath = "./semcor3.0/brownv"
df = read_semcor_data(filePath)

In [30]:
df.head(20)


Unnamed: 0,file,context,context_pos,target_word,gloss,is_proper_gloss,wn_index,sense_full
0,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w0,,False,the,
1,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w1,,False,baltimore,
2,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w2,,False,and,
3,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w3,,False,ohio_railroad,
4,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w4,make known; make an announcement,True,announce%2:32:00::,announce.v.01
5,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w5,,False,yesterday,
6,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w6,,False,it,
7,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w7,,False,would,
8,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w8,cut down on; make a reduction in,True,reduce%2:30:00::,reduce.v.01
9,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w9,,False,the,


In [31]:
import tqdm 
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_semcor_data(dataset):
    cnt = 0
    # make copy of dataset
    data = dataset.copy()
    for i, row in tqdm.tqdm(dataset.iterrows(), total=len(dataset)):
        context_words = nltk.word_tokenize(row["context"])
        context_pos_words = nltk.word_tokenize(row["context_pos"])
        target_word_lemma = row["wn_index"].split("%")[0]
        synsets = wn.synsets(target_word_lemma)
        trg_idx = int(row['target_word'][1:])
        
        if len(synsets) >0:
            target_synset = synsets[0]
        else:
            target_synset = None
       
        window_start = max(0, trg_idx - 10)
        window_end = min(len(context_words), trg_idx + 10)
        context_window = context_words[window_start:trg_idx] + context_words[trg_idx+1:window_end]
        context_window_pos = context_pos_words[window_start:trg_idx] + context_pos_words[trg_idx+1:window_end]
        # print(len(context_window))
        if len(context_window) == 0:
            cnt = cnt + 1
        data.at[i, 'context'] = ' '.join(context_window)
        data.at[i, 'context_pos'] = ' '.join(context_window_pos)
        
    data =  data[data["gloss"]!=""]
    data = data[data["context_pos"]!=""]
    data = data[data["context"]!=""]
    # dataset = dataset.drop(columns=["wn_index"])
    print("Number of proper glosses: ", data["is_proper_gloss"].sum())
    print(cnt)
    return data


In [35]:
dataset = preprocess_semcor_data(df[:50000])

100%|██████████| 50000/50000 [15:55<00:00, 52.33it/s]

Number of proper glosses:  5756
6324





In [39]:
# save the dataset
dataset.to_csv("semcor5.csv", index=False)

In [37]:
# remove the row if sense_full if empty
dataset = dataset[dataset["sense_full"]!=""]


In [38]:
dataset.head(20)

Unnamed: 0,file,context,context_pos,target_word,gloss,is_proper_gloss,wn_index,sense_full
4,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad yesterday it w...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w4,make known; make an announcement,True,announce%2:32:00::,announce.v.01
8,./semcor3.0/brownv/br-a19.naf,The Baltimore and Ohio_Railroad announced yest...,The_DT Baltimore_NNP and_CC Ohio_Railroad_NNP ...,w8,cut down on; make a reduction in,True,reduce%2:30:00::,reduce.v.01
33,./semcor3.0/brownv/br-a19.naf,at 12.01 A._M . next Saturday The current mont...,effective_JJ at_IN 12.01_CD A._M._NNP next_JJ ...,w34,add up in number or quantity,True,come%2:42:12::,total.v.01
42,./semcor3.0/brownv/br-a19.naf,15000000 Howard_E._Simpson the railroad 's pre...,15000000_CD Howard_E._Simpson_NNP the_DT railr...,w47,express in words,True,say%2:32:00::,state.v.01
62,./semcor3.0/brownv/br-a19.naf,of heavy goods has necessitated this regrettab...,of_IN heavy_JJ goods_NNS has_VBZ necessitated_...,w69,cause to be a concomitant,True,necessitate%2:42:01::,necessitate.v.02
71,./semcor3.0/brownv/br-a19.naf,expenses will affect employees in the thirteen...,expenses_NNS will_MD affect_VB employees_NNS i...,w80,connect closely and often incriminatingly,True,affect%2:42:00::,involve.v.01
81,./semcor3.0/brownv/br-a19.naf,O. operates It will be accomplished in_two_way...,O._NNP operates_VB It_PRP will_MD be_VB accomp...,w91,"direct or control; projects, businesses, etc.",True,operate%2:41:00::,operate.v.01
85,./semcor3.0/brownv/br-a19.naf,accomplished in_two_ways A flat reduction of 1...,accomplished_VB in_two_ways_RB A_DT flat_JJ re...,w96,put in effect,True,accomplish%2:36:00::,carry_through.v.01
104,./semcor3.0/brownv/br-a19.naf,There are about 3325 officers and employees in...,There_EX are_VB about_RB 3325_CD officers_NNS ...,w117,be a part or adjunct,True,belong_to%2:42:00::,belong_to.v.01
107,./semcor3.0/brownv/br-a19.naf,officers and employees in_this class Sufficien...,officers_NNS and_CC employees_NNS in_this_RB c...,w121,"have an existence, be extant",True,be%2:42:00::,exist.v.01


In [8]:
import torch
device = "cpu"
print(device)

cpu


In [40]:
# read csv 
data1 = pd.read_csv("semcor3.csv")
data2 = pd.read_csv("semcor4.csv")
dataset = pd.concat([data1, data2], ignore_index=True)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(dataset, test_size=0.2, random_state=42)


In [42]:
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import torch
import numpy as np 

max_len = 20

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to(device)

def create_embeddings(sentences, tokenizer, model, max_length):
    input_ids = []
    attention_masks = []

    # Tokenize each sentence and add special tokens for BERT
    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the tokenized input into torch tensors
    input_ids = torch.cat(input_ids, dim=0).to(device)
    attention_masks = torch.cat(attention_masks, dim=0).to(device)


    # Feed the input to BERT and get the embeddings
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    return embeddings


embeddings = create_embeddings(
    X_train['context'], tokenizer, model, max_len)
np.save('knn_bert4.npy', embeddings)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

In [12]:

print(embeddings.shape)

(13305, 768)


In [16]:
test_embeddings = create_embeddings(
    X_test['context'], tokenizer, model, max_len)


In [17]:


XTrain = pd.DataFrame(columns=['file', 'context', 'target_word', 'gloss', 'is_proper_gloss','wn_index'])
XTest = pd.DataFrame(columns=['file', 'context', 'target_word', 'gloss', 'is_proper_gloss','wn_index'])

XTrain = pd.concat([XTrain, X_train], ignore_index=True)
XTest = pd.concat([XTest, X_test], ignore_index=True)

XTrain.head(10)

cosine_similarities = []
zero_cos = 0

for i,test_embedding in enumerate(test_embeddings):
    test_target_word = XTest.iloc[i]['wn_index']
    train_rows = XTrain[XTrain['wn_index'] == test_target_word ]
    for j,train_row in train_rows.iterrows():
        train_embedding = embeddings[j]
        similarity = np.dot(test_embedding, train_embedding) / (np.linalg.norm(test_embedding) * np.linalg.norm(train_embedding))
        cosine_similarities.append(similarity)
    if len(cosine_similarities) == 0:
        zero_cos += 1
    if len(cosine_similarities) > 0 :
        # without using loc
        XTest.at[i, 'cosine_similarity'] = max(cosine_similarities)
    else:
        # X_test.loc[X_test.index[i], 'cosine_similarity'] = -1
        XTest.at[i, 'cosine_similarity'] = -1
    cosine_similarities = []


XTest['correct'] = XTest.apply(lambda row: row['gloss'] in XTrain[XTrain['wn_index']
                                 == row['wn_index']].head(10)['gloss'].tolist(), axis=1)

# # print the top-5 glosses of X_train for every X_test row 
X_test['top_5_glosses'] = X_test.apply(lambda row: X_train[X_train['wn_index'] == row['wn_index']].head(5)['gloss'].tolist(), axis=1)





### Accuracy of KNN

In [19]:

accuracy = XTest['correct'].sum() / len(XTest)
print("Accuracy: ", accuracy*100)
X_test.head(10)

Accuracy:  67.53100338218715


Unnamed: 0,file,context,context_pos,target_word,gloss,is_proper_gloss,wn_index,sense_full,top_5_glosses
6113,./semcor3.0/brown1/br-b13.naf,get_into crevices jacket and crown margins mal...,small_JJ enough_RB to_TO get_into_VB crevices_...,w487,clean with a brush,True,brush%2:35:02::,brush.v.03,[clean with a brush]
2453,./semcor3.0/brown1/br-k03.naf,love the President like a brother but God_damn...,and_CC pull_down_VB our_PRP $ own_JJ defenses_...,w1842,use or exercise the mind or one's power of rea...,True,think%2:31:00::,think.v.03,[use or exercise the mind or one's power of re...
2442,./semcor3.0/brown1/br-k03.naf,what happened to me today A fellow came_up_to ...,and_CC if_IN you_PRP want_VB to_VB get_along_w...,w1818,a public promotion of some product or service,True,advertising%1:10:00::,ad.n.01,[]
2859,./semcor3.0/brown1/br-j57.naf,Christ and still in_use among the truest socie...,book_NN We_PRP find_VB it_PRP in_RB that_RB co...,w796,"have the quality of being; (copula, used with ...",True,be%2:42:03::,be.v.01,"[have the quality of being; (copula, used with..."
12278,./semcor3.0/brown1/br-l12.naf,'s just barely possible with this crowd that t...,Skolman_NNP see_VB if_IN some_DT kind_NN of_RB...,w734,"happen, occur, take place",True,be%2:42:04::,be.v.05,"[happen, occur, take place, happen, occur, tak..."
6298,./semcor3.0/brown1/br-b13.naf,the sung word is as old as Thomas_Alva_Edison ...,ever_RB before_RB although_IN the_DT spoken_JJ...,w868,grow smaller,True,go_down%2:30:00::,decline.v.04,[]
7109,./semcor3.0/brown1/br-k16.naf,Its groin was bloody Black strips of skin hung...,thin_JJ legs_NN and_CC its_PRP $ wings_NN were...,w695,a junction where one street or road crosses an...,True,crossroad%1:06:00::,intersection.n.02,[]
12356,./semcor3.0/brown1/br-l12.naf,Sure I know But it 's such a long_shot No than...,see_eye_to_eye_VB Lieutenant_NN Did_VBD n't_RB...,w938,a human being,True,person%1:03:00::,person.n.01,"[a human being, a human being, a human being, ..."
5173,./semcor3.0/brown1/br-k15.naf,baby 's death Juanita drooped about the place ...,the_DT girl_NN tried_VB to_TO insist_VB on_IN ...,w397,develop (children's) behavior by instruction a...,True,discipline%2:41:01::,discipline.v.01,[]
11199,./semcor3.0/brown1/br-k01.naf,the food a topic The blueberry_pie is good Sco...,and_CC the_DT bright_JJ ranks_NN of_IN college...,w283,any substance that can be metabolized by an an...,True,food%1:03:00::,food.n.01,[any substance that can be metabolized by an a...


# Naive Bayes

In [17]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
data = pd.read_csv("semcor3.csv")
print(len(data))
test_data = data[:500]
train_data = data[500:]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Compute BERT embeddings for each sentence and target word
embeddings = []
for i, row in data.iterrows():
    context = row['context']
    context_pos = row['context_pos']
    # print(len(context_pos.split()))
    target_word = row['sense_full']
    gloss = row['gloss']
    tokens = tokenizer.encode(context, target_word, add_special_tokens=False)
    pos_tag_tokens = tokenizer.encode(context, add_special_tokens=False)
    # shuffle the pos_tag_tokens list
    np.random.shuffle(pos_tag_tokens)
    gloss_tokens = tokenizer.encode(gloss, add_special_tokens=False)
    tokens = tokens + [tokenizer.sep_token_id] + gloss_tokens 

    input_ids = torch.tensor(tokens).unsqueeze(0).to(device)
    pos_tag_ids = torch.tensor(pos_tag_tokens).unsqueeze(0).to(device)
    # Compute BERT embeddings
    with torch.no_grad():
        output = model(input_ids)
        context_embedding = output[0][0][1:-1].mean(dim=0).cpu()
        pos_embedding = model(pos_tag_ids)[0][0][1:-1].mean(dim=0).cpu()

    embedding = torch.cat([context_embedding, pos_embedding], dim=0).cpu().numpy()
    embeddings.append(embedding) 



8941
cuda:0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# save the embeddings
# embedding = embeddings.cpu().numpy()
np.save('naive_bert_embeddings.npy', embeddings)

In [20]:
import numpy as np
x_train = np.array(embeddings)
y_train = data['wn_index'].values
# print(x_train[0])


In [21]:
test_embeddings = embeddings[:500]
x_test = np.array(test_embeddings)

In [22]:
prior_probs = {}
likelihoods = {}
y_test = test_data['wn_index'].values

for sense in set(y_train):
    sense_count = sum(y_train == sense)
    # print(sense,sense_count)
    prior_probs[sense] = sense_count / len(y_train)   
    sense_embeddings = [x_train[i] for i in range(len(x_train)) if y_train[i] == sense]
    likelihoods[sense] = np.mean(sense_embeddings, axis=0)


In [23]:
y_pred = []
for i in range(0,len(y_test)):
    max_prob = 0
    max_sense = None
    for sense in set(y_train):
        log_prob = prior_probs[sense]
        # print(log_prob)
        for j in range(len(x_test[i])):
            log_prob = log_prob + x_test[i][j] * likelihoods[sense][j]
        if log_prob > max_prob:
            max_prob = log_prob
            max_sense = sense
    y_pred.append(max_sense)

accuracy = sum(y_pred[i] == y_test.tolist()[i]
               for i in range(len(y_test)))/len(y_test)  


## Accuracy with context + poss

In [24]:
print(accuracy*100)

40.2


## Accuracy with context as only feature

In [9]:
print(accuracy*100)

51.0
