In [4]:
import nltk
from nltk.corpus import semcor
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [7]:
nltk.download('all-corpora')

[nltk_data] Downloading collection 'all-corpora'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/abhishek/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/abhishek/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /home/abhishek/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/abhishek/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/abhishek/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /home/abhishek/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /home/abhishek/nltk_data...

True

In [70]:
!pip3 install num2words 

Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting docopt>=0.6.2
  Using cached docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25ldone
[?25h  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=de84bdd7d99328acb113e907e06b057e134b2dfc66b85bb2ff3e3f8d67520535
  Stored in directory: /home/abhishek/.cache/pip/wheels/7c/d7/8d/2156234738063e3d4a39ba77dc677046100e62766b53807189
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.12


## Preprocessing

In [129]:
from num2words import num2words
import numpy as np
import tqdm.notebook as tqdm
from string import punctuation
import math
from nltk.corpus import wordnet as wn



lemmatizer = WordNetLemmatizer()

EXTRA_STOPWORDS = ["''","'s","``"]
STOPWORDS = nltk.corpus.stopwords.words('english') + EXTRA_STOPWORDS
STOPWORDS += list(punctuation)

def treebank2wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    else:
        return None
    

def syn2sense(syn):
    # get the sense (= lemma.postag.num) for a given synset
    s = syn.name()
    # n = ".".join(s.split(".")[-2:]) # n.01 and v.01 are different senses (eg: ash.n.01, ash.v.01)
    return s

def lemmatize(word,tag):
    if tag is None:
        return lemmatizer.lemmatize(word)
    else : 
        return lemmatizer.lemmatize(word,tag)
    
def num2Word(s):
    if s.isnumeric() and s.lower()!= "infinity" and s.lower()!="nan":
         s = num2words(s)
    return s

def clean(tokens):
    tagged = nltk.pos_tag(tokens)
    lemmatized = [lemmatize(w,treebank2wn(tag)) for w,tag in tagged]
    cleaned = [(num2Word(w), tag) for w, tag in zip(
        lemmatized, [tag for _, tag in tagged]) if w.lower() not in STOPWORDS]
    return cleaned



In [127]:
def parse(sent):
    tokens = []
    senses = []

    for e in sent:
        if isinstance(e, nltk.tree.Tree):
            lemma = e.label()
            if isinstance(lemma, nltk.corpus.reader.wordnet.Lemma):
                synset = lemma.synset()
                sense = syn2sense(synset)
            else:
                sense = None
            le = len(e)
            if le == 1:
                w = e[0]
                if isinstance(w, nltk.tree.Tree) or isinstance(w,list):
                    lw = len(w)
                    w = " ".join([w[i] for i in range(lw)])
            else:
                w = " ".join([e[i] for i in range(le)])
        elif isinstance(e, list):
            w = e[0]
            sense = None
        else:
            invtype = type(e)
            raise Exception("Invalid type: %s" % invtype)
        if w:
            tokens.append(w)
            senses.append(sense)
    return tokens, senses

In [130]:
semcor_data = semcor.tagged_sents(tag='sem')

X = []
y = []
words = []
for sent in semcor_data:
    try:
        tokens, senses = parse(sent)
        tagged_tokens = clean(tokens)
        
        for i in range(len(tokens)):
            context = tagged_tokens[max(
                0, i - 5):i] + tagged_tokens[i + 1:min(len(tagged_tokens), i + 6)]
            context_str = ' '.join(
                [w.lower() + '_' + tag for w, tag in context])
            X.append(context_str)
            y.append(senses[i])
    except Exception as e:
        print(e)




<class 'str'>


In [131]:
label_dict = {label: idx for idx, label in enumerate(set(y))}
y = [label_dict[label] for label in y]
print(y[0:10])

[16590, 15822, 7829, 21339, 16590, 2296, 16590, 206, 16590, 19458]


In [132]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer()
X_ = vectorizer.fit_transform(X[:8000])
# print(X_)
# replace None in Y with the correct variable
y_ = y[:8000]
# y_ = vectorizer.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)


In [133]:
y_pred = nb_classifier.predict(X_test)


In [134]:
print("Accuracy: ", nb_classifier.score(X_test, y_test))

Accuracy:  0.46125
