In [1]:
from nltk.corpus.reader import BracketParseCorpusReader
from nltk.util import bigrams
from nltk.lm import MLE, Vocabulary, Laplace
from nltk.lm.preprocessing import pad_both_ends, flatten
from collections import Counter
import math

# Part 1

In [2]:
corpus = BracketParseCorpusReader(root="", fileids=["p1_train_StephLeblanc.txt"])
test_corpus = BracketParseCorpusReader(root="", fileids=["p1_test_StephLeblanc.txt"])

In [3]:
words = corpus.words()
words 

['A', 'warm', ',', 'funny', ',', 'engaging', 'film', ...]

In [4]:
vocab = Vocabulary(words,unk_cutoff=3)

In [5]:
last10 = sorted(vocab)[-10:]
last10 = [(last10[i], vocab[last10[i]]) for i in range(len(last10))]
last10  

[('written', 8),
 ('wrong', 7),
 ('wry', 3),
 ('year', 24),
 ('years', 12),
 ('yet', 20),
 ('you', 180),
 ('young', 14),
 ('your', 34),
 ('zeal', 3)]

In [6]:
all_voc = Vocabulary(vocab.counts,unk_cutoff=1)
all_voc.lookup("<s>")

'<UNK>'

In [7]:
oov_rate = (1-len(vocab)/len(all_voc)) * 100
oov_rate

77.57521672616012

# Part 2

In [8]:
def get_sequence(corpus:BracketParseCorpusReader):
    sentences = list(corpus.tagged_sents())
    for i in range(len(sentences)):
        sentences[i] = [j[0] for j in sentences[i]]
        
    text = list(flatten(pad_both_ends(sent, n=2) for sent in sentences))
    return text

def get_sentences(corpus:BracketParseCorpusReader):
    sentences = list(corpus.tagged_sents())
    for i in range(len(sentences)):
        sentences[i] = [j[0] for j in sentences[i]]
    return sentences

In [9]:
text = get_sequence(corpus)
bigram = list(list(bigrams(pad_both_ends(sent, n=2))) for sent in get_sentences(corpus))
v = Vocabulary(text, unk_cutoff=3)
len(vocab)

1759

In [10]:
mle = MLE(2)
mle.fit(bigram, v)

In [11]:
score_mle = {}
for word in v:
    score_mle[word] = mle.score(word,["<s>"])
dict(Counter(score_mle).most_common(5))
    

{'<UNK>': 0.2615, 'The': 0.1235, 'A': 0.091, 'It': 0.0715, 'This': 0.0315}

In [12]:
mle.score("The",["<s>"])

0.1235

In [13]:
mle.score("</s>",".".split())

0.9840510366826156

In [14]:
laplace = Laplace(2)
laplace.fit(bigram,v)

In [15]:
score_laplace = {}
for word in v:
    score_laplace[word] = laplace.score(word,["<s>"])
dict(Counter(score_laplace).most_common(5))

{'<UNK>': 0.1392876129718235,
 'The': 0.0659223817118554,
 'A': 0.04864433811802233,
 'It': 0.03827751196172249,
 'This': 0.01701222753854333}

In [16]:
laplace.score("aaaaa")

0.0005675368898978433

In [17]:
# Test set perplexity
test_padded = list(list(pad_both_ends(sent, n=2)) for sent in get_sentences(test_corpus))
test_padded = [list(bigrams(i)) for i in test_padded]
m = sum([len(i) for i in test_padded])

sum_prob = sum([math.log2(laplace.score(j[1], [j[0]])) for i in test_padded for j in i])
test_set_perplexity = 2**((-1/m)*(sum_prob))
test_set_perplexity

165.21112109883478

In [18]:
sum([len(i) for i in test_padded])

4162

# Part 3
## < 2 => negative class

In [19]:
bag_of_words = {k:v for k,v in vocab.counts.items() if v >= vocab.cutoff}
bigdoc = {k:[] for k in [0,1,2,3,4]}
bigdoc[1] = list(flatten([line.leaves() for line in corpus.parsed_sents() if int(line.label()) == 1]))
sents = corpus.parsed_sents()
list(flatten([line.leaves() for line in sents if int(line.label()) == 0])).count("a")

146

In [20]:
def train_bayes(doc:BracketParseCorpusReader, classes:list):
    
    sents = doc.parsed_sents()
    bigdoc = {k:[] for k in classes}
    n_doc = len(sents)
    voc = Vocabulary(doc.words(), unk_cutoff=3)
    # only keep words with frequency >= 3 as feature
    bag_of_words = {k:v for k,v in voc.counts.items() if v >= voc.cutoff}
    prior = {k:0 for k in classes}
    loglikelihood = {}
    
    for c in classes:
        n_c = len([1 for i in sents if int(i.label()) == c])
        prior[c] = math.log(n_c/n_doc)
        bigdoc[c] = list(flatten([line.leaves() for line in sents if int(line.label()) == c]))
        somme = sum([bigdoc[c].count(w) + 1 for w in bag_of_words.keys()])
        
        for word in bag_of_words.keys():
            count = bigdoc[c].count(word)
            loglikelihood[(word,c)] = math.log((count + 1)/ somme) #sum([bigdoc[c].count(w) + 1 for w in bag_of_words.keys()])
            
    return prior, loglikelihood, bag_of_words
        

In [21]:
def test_bayes(sentence:list, prior:dict, loglikelihood:dict, classes:list, voc:dict):
    somme = {}
    for c in classes:
        somme[c] = prior[c]
        for word in sentence:
            if word in voc.keys():
                somme[c] += loglikelihood[(word,c)]
    pred =  max(somme, key=somme.get)
    return pred >= 2

In [22]:
prior, loglik, v = train_bayes(corpus, [0,1,2,3,4])

In [23]:
test_bayes(test_corpus.sents()[1], prior, loglik, [0,1,2,3,4], v)

True

In [24]:
test_corpus.parsed_sents()[0].label()

'3'

In [25]:
def accuracy(voc, test_set, prior, loglik, classes):
    predictions = 0
    success = 0
    parsed_sents = test_set.parsed_sents()
    sents = test_set.sents()
    for i in range(len(sents)):
        predict = test_bayes(sents[i],prior, loglik, classes, voc)
        predictions += 1
        label = 1 if int(parsed_sents[i].label()) >= 2 else 0
            
        if predict == label:
            success += 1
    return success/predictions*100
        

In [26]:
accuracy(v, test_corpus, prior, loglik, [0,1,2,3,4])

66.5

# Part 4
remove duplicates in each sentence for question 1

In [27]:
clipped_corpus = list(flatten([list(set(i)) for i in corpus.sents()]))
clipped_test_corpus = list(flatten([list(set(i)) for i in test_corpus.sents()]))
v = Vocabulary(clipped_corpus,unk_cutoff=3)

In [28]:
def train_bayes_clip(doc:BracketParseCorpusReader, classes:list):
    
    sents = doc.parsed_sents()
    bigdoc = {k:[] for k in classes}
    n_doc = len(sents)
    clipped_corpus = list(flatten([list(set(i)) for i in doc.sents()]))
    voc = Vocabulary(clipped_corpus, unk_cutoff=3)
    # only keep words with frequency >= 3 as feature
    bag_of_words = {k:v for k,v in voc.counts.items() if v >= voc.cutoff}
    prior = {k:0 for k in classes}
    loglikelihood = {}
    
    for c in classes:
        n_c = len([1 for i in sents if int(i.label()) == c])
        prior[c] = math.log(n_c/n_doc)
        bigdoc[c] = list(flatten([line.leaves() for line in sents if int(line.label()) == c]))
        somme = sum([bigdoc[c].count(w) + 1 for w in bag_of_words.keys()])
        
        for word in bag_of_words.keys():
            count = bigdoc[c].count(word)
            loglikelihood[(word,c)] = math.log((count + 1)/ somme) #sum([bigdoc[c].count(w) + 1 for w in bag_of_words.keys()])
            
    return prior, loglikelihood, bag_of_words

def test_bayes_clip(sentence:list, prior:dict, loglikelihood:dict, classes:list, voc:dict):
    somme = {}
    for c in classes:
        somme[c] = prior[c]
        for word in sentence:
            if word in voc.keys():
                somme[c] += loglikelihood[(word,c)]
    pred =  max(somme, key=somme.get)
    return pred >= 2

def accuracy_clip(voc, test_set, prior, loglik, classes):
    predictions = 0
    success = 0
    parsed_sents = test_set.parsed_sents()
    sents = [list(set(i)) for i in test_set.sents()]
    for i in range(len(sents)):
        predict = test_bayes(sents[i],prior, loglik, classes, voc)
        predictions += 1
        label = 1 if int(parsed_sents[i].label()) >= 2 else 0
            
        if predict == label:
            success += 1
    return success/predictions*100

prior_clip, loglik_clip, v_clip = train_bayes_clip(corpus, [0,1,2,3,4])
accuracy_clip(v_clip, test_corpus, prior_clip, loglik_clip, [0,1,2,3,4])
        
        

66.0

In [29]:
neg = ["n't", "not", "no", "never"]
punc = ['.', ',', ':', '?', '!']

In [30]:
def neg_words_process(sent: list):
    """ Prepend the prefix NOT_ to every word after a token of logical negation till
        the next punctuation mark.

    Args:
        sent (list): list of words

    Returns:
        list: sent with modified words
    """
    tmp,i = 0, 0
    while i < len(sent):
        if sent[i].lower() in neg:
            tmp = i + 1
            try:
                while(sent[tmp]) not in punc:
                    sent[tmp] = f"NOT_{sent[tmp]}"
                    tmp += 1
                i = tmp
            except IndexError:
                break
        else:
            i += 1
    return sent

In [31]:
def train_bayes_neg(doc:BracketParseCorpusReader, classes:list):
    
    sents = doc.parsed_sents()
    bigdoc = {k:[] for k in classes}
    n_doc = len(sents)
    clipped_corpus = list(flatten([neg_words_process(i) for i in doc.sents()]))
    voc = Vocabulary(clipped_corpus, unk_cutoff=3)
    # only keep words with frequency >= 3 as feature
    bag_of_words = {k:v for k,v in voc.counts.items() if v >= voc.cutoff}
    prior = {k:0 for k in classes}
    loglikelihood = {}
    
    for c in classes:
        n_c = len([1 for i in sents if int(i.label()) == c])
        prior[c] = math.log(n_c/n_doc)
        bigdoc[c] = list(flatten([line.leaves() for line in sents if int(line.label()) == c]))
        somme = sum([bigdoc[c].count(w) + 1 for w in bag_of_words.keys()])
        
        for word in bag_of_words.keys():
            count = bigdoc[c].count(word)
            loglikelihood[(word,c)] = math.log((count + 1)/ somme) #sum([bigdoc[c].count(w) + 1 for w in bag_of_words.keys()])
            
    return prior, loglikelihood, bag_of_words

def test_bayes_neg(sentence:list, prior:dict, loglikelihood:dict, classes:list, voc:dict):
    somme = {}
    for c in classes:
        somme[c] = prior[c]
        for word in sentence:
            if word in voc.keys():
                somme[c] += loglikelihood[(word,c)]
    pred =  max(somme, key=somme.get)
    return pred >= 2

def accuracy_neg(voc, test_set, prior, loglik, classes):
    predictions = 0
    success = 0
    parsed_sents = test_set.parsed_sents()
    sents = [list(set(i)) for i in test_set.sents()]
    for i in range(len(sents)):
        predict = test_bayes(sents[i],prior, loglik, classes, voc)
        predictions += 1
        label = 1 if int(parsed_sents[i].label()) >= 2 else 0
            
        if predict == label:
            success += 1
    return success/predictions*100

prior_neg, loglik_neg, v_neg = train_bayes_neg(corpus, [0,1,2,3,4])
accuracy_neg(v_clip, test_corpus, prior_clip, loglik_clip, [0,1,2,3,4])
        
        

66.0

In [32]:
list(flatten([neg_words_process(i) for i in corpus.sents()]))

['A',
 'warm',
 ',',
 'funny',
 ',',
 'engaging',
 'film',
 '.',
 'The',
 'band',
 "'s",
 'courage',
 'in',
 'the',
 'face',
 'of',
 'official',
 'repression',
 'is',
 'inspiring',
 ',',
 'especially',
 'for',
 'aging',
 'hippies',
 '-LRB-',
 'this',
 'one',
 'included',
 '-RRB-',
 '.',
 'Not',
 'NOT_only',
 'NOT_is',
 'NOT_Undercover',
 'NOT_Brother',
 'NOT_as',
 'NOT_funny',
 ',',
 'if',
 'not',
 'NOT_more',
 'NOT_so',
 ',',
 'than',
 'both',
 'Austin',
 'Powers',
 'films',
 ',',
 'but',
 'it',
 "'s",
 'also',
 'one',
 'of',
 'the',
 'smarter',
 ',',
 'savvier',
 'spoofs',
 'to',
 'come',
 'along',
 'in',
 'some',
 'time',
 '.',
 'Woody',
 'Allen',
 "'s",
 'latest',
 'is',
 'an',
 'ambling',
 ',',
 'broad',
 'comedy',
 'about',
 'all',
 'there',
 'is',
 'to',
 'love',
 '--',
 'and',
 'hate',
 '--',
 'about',
 'the',
 'movie',
 'biz',
 '.',
 'The',
 'inhospitability',
 'of',
 'the',
 'land',
 'emphasizes',
 'the',
 'spare',
 'precision',
 'of',
 'the',
 'narratives',
 'and',
 'helps',