In [None]:
# Imports for the whole notebook, interleaved with the NLTK data downloads.
# NOTE(review): word_tokenize is imported twice (from nltk here and from
# nltk.tokenize further down); both bind the same name.
import nltk
from nltk import word_tokenize
# Fetch the NLTK data packages; the corpus readers imported below
# (wordnet, stopwords, semcor) load from these downloads.
# 'punkt' is presumably what word_tokenize needs, and 'omw-1.4' presumably
# backs the WordNet reader -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('semcor')
nltk.download('stopwords')
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords 
from nltk.corpus import semcor
import networkx as nx
# NOTE(review): re, argv and plt are not referenced by any code visible in
# this notebook.
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from sys import argv
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package semcor to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): the word2vec loader in the next cell uses the relative path
# 'drive/MyDrive/...', so it presumably relies on the working directory being
# /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from gensim.models import KeyedVectors

# Load the GoogleNews word2vec vectors (binary word2vec format) into memory.
# The path is relative to the current working directory.
model_w2v = KeyedVectors.load_word2vec_format(
    'drive/MyDrive/CS626/Assignment_2/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
stop_words = set(stopwords.words('english'))

# Cache the word2vec vector of every SemCor token that the model knows.
# sent_vec() reads this dict, so words that never occur in SemCor have no
# entry here even if the word2vec model contains them.
vectors = {}
semcor_sents = semcor.sents()
for sentence in semcor_sents:
    for word in sentence:
        # Query the model only once per distinct word; repeated tokens would
        # just overwrite the entry with the same vector.
        if word not in vectors and word in model_w2v:
            vectors[word] = model_w2v.get_vector(word)

In [None]:
def sent_vec(sentence):
    """Average the cached embeddings of the words in ``sentence``.

    Words that have no entry in the module-level ``vectors`` dict are skipped.

    Args:
        sentence: iterable of word strings.

    Returns:
        The element-wise average (``np.average`` over axis 0) of the vectors
        that were found, or a zero vector of length 300 when none of the
        words has a cached vector.
    """
    known = [vectors[token] for token in sentence if token in vectors]
    if not known:
        return np.zeros(300)
    return np.average(known, axis=0)

In [None]:
def leskSim(sense1,sense2):
    """Cosine similarity between the averaged word vectors of two texts.

    Each text is split on whitespace and turned into a single vector with
    ``sent_vec``; the two vectors are then compared with
    ``cosine_similarity``.

    Args:
        sense1: first text (callers pass a sense definition string).
        sense2: second text.

    Returns:
        The ``[0][1]`` entry of the pairwise similarity matrix, i.e. the
        similarity between the first and the second text.
    """
    pair = [sent_vec(text.split()) for text in (sense1, sense2)]
    return cosine_similarity(pair)[0][1]

In [None]:
def edge_weight(g,sense,layer):
    """Connect ``sense`` to each synset of ``layer`` with a weighted edge.

    The weight of an edge is ``leskSim`` applied to the WordNet definitions
    of the two synsets.

    Args:
        g: graph object; edges are added through ``g.add_edge``.
        sense: synset name of one sense of a word.
        layer: sequence of synset names (the senses of the following word).

    If ``wn.synset`` raises ``ValueError`` for an entry of ``layer``, the
    whole layer is printed and the remaining entries are not processed.
    """
    source_gloss = wn.synset(sense).definition()
    for target in layer:
        try:
            target_gloss = wn.synset(target).definition()
        except ValueError:
            print(layer)
            break
        similarity = leskSim(source_gloss, target_gloss)
        g.add_edge(sense, target, weight=similarity)


In [None]:
def graph(sent1):
    """Build the word-sense graph of a single sentence.

    The sentence string is tokenized with the regular expression ``\\w+``.
    Every WordNet synset name of every token becomes a node, and each sense
    of a token is linked (via ``edge_weight``) to all senses of the token
    that immediately follows it.

    Args:
        sent1: the sentence as one string.

    Returns:
        An undirected ``nx.Graph`` whose nodes are synset names.
    """
    words = RegexpTokenizer(r'\w+').tokenize(sent1)
    # senses_per_word[k] holds the synset names of the k-th token.
    senses_per_word = [[str(syn.name()) for syn in wn.synsets(w)] for w in words]

    G = nx.Graph()
    for candidates in senses_per_word:
        G.add_nodes_from(candidates)

    # Only adjacent tokens are connected; the last token has no successor.
    for current, following in zip(senses_per_word, senses_per_word[1:]):
        for sense in current:
            edge_weight(G, sense, following)
    return G

In [None]:
def get_Ranks(Graph):
    """Return the result of ``nx.pagerank`` on ``Graph`` with ``alpha=0.4``."""
    return nx.pagerank(Graph, alpha=0.4)

In [None]:
def senseAssignment(senseDict,Ranks):
    """Pick, for every word, the candidate sense with the highest rank.

    Missing ranks and words without candidates are handled with explicit
    lookups instead of catching every exception, so unrelated errors are no
    longer silently turned into "notag".

    Args:
        senseDict: mapping from a word key to the list of its candidate
            sense names. Keys are processed in the mapping's iteration order.
        Ranks: dict mapping a sense name to a numeric score.

    Returns:
        A list with one entry per key of ``senseDict``: the name of the
        best-scoring candidate, or "notag" when the word has no candidate
        whose score is strictly greater than 0 (no candidates at all, or
        none of them present in ``Ranks``). On ties the first candidate wins.
    """
    SenseLst = []
    for word in senseDict:
        best_sense = None
        best_rank = 0
        for sense in senseDict[word]:
            score = Ranks.get(sense)
            # Candidates absent from Ranks are skipped.
            if score is not None and score > best_rank:
                best_sense, best_rank = sense, score
        SenseLst.append(best_sense if best_sense is not None else "notag")
    return SenseLst

In [None]:
def prediction(sen):
    """Assign a WordNet sense to every token of a tokenized sentence.

    The tokens are joined with spaces and passed to ``graph``; the nodes of
    that graph are scored with ``nx.pagerank`` (``alpha=0.4``), and
    ``senseAssignment`` then picks the best-scoring sense for each token.

    Args:
        sen: list of token strings.

    Returns:
        A dict mapping token text to the chosen synset name, or to "notag"
        when no sense was chosen. Because the dict is keyed by token text,
        repeated tokens share a single entry.
    """
    G = graph(' '.join(sen))
    ranks = nx.pagerank(G, alpha=0.4)
    # Candidate synset names per token position.
    candidates = {
        position: [str(syn.name()) for syn in wn.synsets(token)]
        for position, token in enumerate(sen)
    }
    chosen = senseAssignment(candidates, ranks)
    return dict(zip(sen, chosen))

In [None]:
# Quick check of prediction() on a short, already tokenized sentence.
result = prediction(['my','country','india'])

In [None]:
# Show the token -> predicted sense mapping computed in the previous cell.
print(result)

{'my': 'notag', 'country': 'nation.n.02', 'india': 'india.n.01'}


In [None]:
# NOTE(review): this repeats the stop_words assignment from the vector-building
# cell, and no code visible in this notebook reads stop_words -- confirm
# whether stop-word filtering was meant to be applied somewhere.
stop_words = set(stopwords.words('english'))

In [None]:
def get_sense_dict(sents):
    """Build a mapping from gold sense label to integer id.

    Each sentence in ``sents`` is a sequence of chunks. The label collected
    for a chunk is:

    * ``'notag'`` when the chunk is not an ``nltk.Tree``;
    * ``'NE'`` when one of the tree's direct children is itself a tree
      labelled ``'NE'``;
    * ``label.synset().name()`` when the tree's label has a ``synset``
      attribute;
    * the label itself otherwise (for example a plain string label).

    The label type is checked explicitly rather than by catching every
    exception, so unrelated errors are no longer swallowed.

    Args:
        sents: iterable of sense-tagged sentences.

    Returns:
        A dict ``{label: id}``. ``'unk'`` is always present. Ids follow the
        sorted order of the labels, so the mapping is the same on every run.
    """
    sense_set = {'unk'}
    for sentence in sents:
        for chunk in sentence:
            if not isinstance(chunk, nltk.Tree):
                sense_set.add('notag')
                continue
            label = chunk.label()
            has_ne_child = any(
                isinstance(child, nltk.Tree) and child.label() == 'NE'
                for child in chunk
            )
            if has_ne_child:
                sense_set.add('NE')
            elif hasattr(label, 'synset'):
                sense_set.add(label.synset().name())
            else:
                sense_set.add(label)
    # Sorting removes the dependence on set iteration order.
    return {sense: idx for idx, sense in enumerate(sorted(sense_set, key=str))}

In [None]:
def predict(sents, untagged_sents):
    """Collect aligned gold and predicted sense labels.

    For sentence ``i`` the predictions come from
    ``prediction(untagged_sents[i])``. Exactly one gold label and one
    predicted label are appended per chunk, so the two returned lists always
    have the same length.

    Gold label of a chunk:

    * ``'notag'`` when the chunk is not an ``nltk.Tree``;
    * ``'NE'`` when one of the tree's direct children is itself a tree
      labelled ``'NE'``;
    * ``label.synset().name()`` when the tree's label has a ``synset``
      attribute;
    * the label itself otherwise.

    The predicted label is looked up by the chunk's words joined with
    ``'_'``; ``'notag'`` is used when the prediction dict has no such key.
    The prediction dict is keyed by single tokens, so a multi-word chunk only
    matches if that exact joined string is itself a token.

    Args:
        sents: sense-tagged sentences (sequences of chunks).
        untagged_sents: the same sentences as plain token lists, indexable
            by sentence position.

    Returns:
        ``(actual_sense, pred_sense)``: two equally long lists of labels.
    """
    actual_sense = []
    pred_sense = []
    for i, tagged_sentence in enumerate(sents):
        predicted = prediction(untagged_sents[i])
        for chunk in tagged_sentence:
            if not isinstance(chunk, nltk.Tree):
                gold = 'notag'
                word = "_".join(chunk)
            else:
                label = chunk.label()
                has_ne_child = any(
                    isinstance(child, nltk.Tree) and child.label() == 'NE'
                    for child in chunk
                )
                if has_ne_child:
                    gold = 'NE'
                elif hasattr(label, 'synset'):
                    gold = label.synset().name()
                else:
                    gold = label
                word = "_".join(chunk.leaves())
            actual_sense.append(gold)
            pred_sense.append(predicted.get(word, 'notag'))
    return actual_sense, pred_sense

In [None]:
# Gold sense-tagged sentences and the matching plain token lists from SemCor.
sents = semcor.tagged_sents(tag='sem')
sense_dict = get_sense_dict(sents)
untagged_sents = semcor.sents()
# predict() calls prediction() once per sentence of the corpus.
y_true, y_pred = predict(sents, untagged_sents)

y_true = [sense_dict[sense] for sense in y_true]
# A predicted sense that never occurs as a gold label has no id of its own.
# Map it to the reserved 'unk' id rather than to 'notag'; otherwise such a
# prediction would be scored as correct whenever the gold label is 'notag'.
y_pred = [sense_dict.get(sense, sense_dict['unk']) for sense in y_pred]

In [None]:
# Turn the integer ids back into label strings.
inv_map = {v: k for k, v in sense_dict.items()}
y_true = [inv_map[i] for i in y_true]
y_pred = [inv_map[i] for i in y_pred]

# Keep only the positions whose gold label is a noun sense.
y_true_n, y_pred_n = [], []
for gold, guess in zip(y_true, y_pred):
    if gold in ('notag', 'NE'):
        continue
    # Sense names look like <lemma>.<pos>.<index>. The lemma itself may
    # contain '.', so the fields are split off from the right.
    if gold.rsplit('.', 2)[1] == 'n':
        y_true_n.append(gold)
        y_pred_n.append(guess)

In [None]:
# NOTE(review): accuracy is computed on the noun-only lists (y_true_n /
# y_pred_n), while precision, recall and F1 use the full y_true / y_pred --
# confirm that this difference is intended.
print("Accuracy : ", metrics.accuracy_score(y_true_n, y_pred_n))
precision, recall, f1score, _ = metrics.precision_recall_fscore_support(
    y_true,
    y_pred,
    average='weighted',
    zero_division=0,
)
for caption, value in (
    ("Precision : ", precision),
    ("Recall : ", recall),
    ("F1 Score : ", f1score),
):
    print(caption, value)

Accuracy :  0.3798242406667255
Precision :  0.6994952164393669
Recall :  0.5876234768882604
F1 Score :  0.621535092767554


In [None]:
# Disambiguate one example sentence and print the sense chosen for each token,
# together with its WordNet definition when a sense was assigned.
test_sentence = "She is a government servant"
tokens = word_tokenize(test_sentence)
pred_dict = prediction(tokens)
for target, sense_name in pred_dict.items():
    report = "Target Word : " + target + "\nPredicted Sense : " + sense_name
    if sense_name != 'notag':
        report += " : " + wn.synset(sense_name).definition()
    print(report + "\n")

Target Word : She
Predicted Sense : notag

Target Word : is
Predicted Sense : be.v.12 : to remain unmolested, undisturbed, or uninterrupted -- used only in infinitive form

Target Word : a
Predicted Sense : deoxyadenosine_monophosphate.n.01 : one of the four nucleotides used in building DNA; all four nucleotides have a common phosphate group and a sugar (ribose)

Target Word : government
Predicted Sense : government.n.03 : (government) the system or form by which a community or other political unit is governed

Target Word : servant
Predicted Sense : servant.n.01 : a person working in the service of another (especially in the household)

