In [1]:
import sys
from collections import defaultdict
import math
import random
import os
import os.path
from collections import deque, Counter
"""
COMS W4705 - Natural Language Processing - Fall 2019
Homework 1 - Programming Component: Trigram Language Models
Yassine Benajiba
"""

'\nCOMS W4705 - Natural Language Processing - Fall 2019\nHomework 1 - Programming Component: Trigram Language Models\nYassine Benajiba\n'

In [2]:
def corpus_reader(corpusfile, lexicon=None): 
    with open(corpusfile,'r') as corpus: 
        for line in corpus: 
            if line.strip():
                sequence = line.lower().strip().split()
                if lexicon: 
                    yield [word if word in lexicon else "UNK" for word in sequence]
                else: 
                    yield sequence

def get_lexicon(corpus):
    word_counts = defaultdict(int)
    for sentence in corpus:
        for word in sentence: 
            word_counts[word] += 1
    return set(word for word in word_counts if word_counts[word] > 1)  



def get_ngrams(sequence, n):
    """
    COMPLETE THIS FUNCTION (PART 1)
    Given a sequence, this function should return a list of n-grams, where each n-gram is a Python tuple.
    This should work for arbitrary values of 1 <= n < len(sequence).
    """
    sequence = deque(sequence)
    if n>1:
        sequence.extendleft(['START']*(n-1))
    else:
        sequence.extendleft(['START'])
    sequence.extend(['STOP'])
    sequence = list(sequence)
    ngrams = []
    for seq in range(0,len(sequence)-(n-1)):
        ngrams.append(tuple(sequence[seq:seq+n]))
    return ngrams

In [88]:
class TrigramModel(object):
    
    def __init__(self, corpusfile):
    
        # Iterate through the corpus once to build a lexicon 
        generator = corpus_reader(corpusfile)
        self.lexicon = get_lexicon(generator)
        self.lexicon.add("UNK")
        self.lexicon.add("START")
        self.lexicon.add("STOP")
    
        # Now iterate through the corpus again and count ngrams
        generator = corpus_reader(corpusfile, self.lexicon)
        self.count_ngrams(generator)
    
    def count_ngrams(self, corpus):
        """
        COMPLETE THIS METHOD (PART 2)
        Given a corpus iterator, populate dictionaries of unigram, bigram,
        and trigram counts. 
        """
        self.unigramcounts = Counter()
        self.bigramcounts = Counter()
        self.trigramcounts = Counter()
        self.total_words = 0
        for ind, sentence in enumerate(corpus):
            if ind > 1000:
                break
#             print("sentence :", ind)
            self.unigramcounts+= Counter(get_ngrams(sentence,1))
            self.bigramcounts+= Counter(get_ngrams(sentence,2))
            self.trigramcounts+= Counter(get_ngrams(sentence,3))
        self.total_words+= sum(self.unigramcounts.values())
        return self.unigramcounts, self.bigramcounts, self.trigramcounts, self.total_words
    
    def raw_trigram_probability(self,trigram):
        """
        COMPLETE THIS METHOD (PART 3)
        Returns the raw (unsmoothed) trigram probability
        """
        print("trigram got is",trigram)
        try:
            return round(self.trigramcounts[trigram] / self.bigramcounts[trigram[0:2]],2)
        except Exception as e:
            return 0
        
    def raw_bigram_probability(self, bigram):
        """
        COMPLETE THIS METHOD (PART 3)
        Returns the raw (unsmoothed) bigram probability
        """
        try:
            return round(self.bigramcounts[bigram]/self.unigramcounts[bigram[0]],2) #no backing off to unigram probability
        except Exception as e:
            return 0
        
    def raw_unigram_probability(self, unigram):
        """
        COMPLETE THIS METHOD (PART 3)
        Returns the raw (unsmoothed) unigram probability.
        """
        #hint: recomputing the denominator every time the method is called
        # can be slow! You might want to compute the total number of words once, 
        # store in the TrigramModel instance, and then re-use it.  
        try:
            return round(self.unigramcounts[unigram]/self.total_words,2)
        except Exception as e:
            return 0

    def generate_sentence(self,t=20): 
        """
        COMPLETE THIS METHOD (OPTIONAL)
        Generate a random sentence from the trigram model. t specifies the
        max length, but the sentence may be shorter if STOP is reached.
        """
        return result            

    def smoothed_trigram_probability(self, trigram):
        """
        COMPLETE THIS METHOD (PART 4)
        Returns the smoothed trigram probability (using linear interpolation). 
        """
        lambda1 = 1
        lambda2 = 1/3.0
        lambda3 = 1/3.0
        return lambda1 * self.raw_trigram_probability(trigram) #+ lambda2 * self.raw_bigram_probability(trigram[0:2]) + lambda3 * self.raw_unigram_probability(trigram[0])
        
    def sentence_logprob(self, sentence):
        """
        COMPLETE THIS METHOD (PART 5)
        Returns the log probability of an entire sequence.
        """
        trigram = get_ngrams(sentence,3)
        print("CHECK TRIGRAM",trigram)
        sent_prob = 0
        for tri in trigram:
            try:
                sent_prob+= math.log2(self.smoothed_trigram_probability(tri))
            except ValueError:
                sent_prob+=0
        return sent_prob

    def perplexity(self, corpus):
        """
        COMPLETE THIS METHOD (PART 6) 
        Returns the log probability of an entire sequence.
        """
        sent_logprob = 0
        for ind, sentence in enumerate(corpus):
            if ind>100:
                break
            try:
                sent_logprob+=math.log2(self.sentence_logprob(sentence))
            except ValueError:
                sent_logprob+=0
        norm_sent_logprob = sent_logprob/self.total_words
        return 2**(-norm_sent_logprob)


def essay_scoring_experiment(training_file1, training_file2, testdir1, testdir2):

        model1 = TrigramModel(training_file1)
        model2 = TrigramModel(training_file2)

        total = 0
        correct = 0       
 
        for f in os.listdir(testdir1):
            pp = model1.perplexity(corpus_reader(os.path.join(testdir1, f), model1.lexicon))
            # .. 
    
        for f in os.listdir(testdir2):
            pp = model2.perplexity(corpus_reader(os.path.join(testdir2, f), model2.lexicon))
            # .. 
        
        return 0.0

In [91]:
if __name__ == "__main__":

#     model = TrigramModel(sys.argv[1]) 
    model = TrigramModel("hw1_data/brown_train.txt")

    print(model.perplexity(corpus_reader("hw1_data/brown_train.txt",model.lexicon)))
    
#     print(model.unigramcounts)
#     print(model.unigramcounts[('the',)])
#     print(model.raw_unigram_probability(['the',]))

    # put test code here...
    # or run the script from the command line with 
    # $ python -i trigram_model.py [corpus_file]
    # >>> 
    #
    # you can then call methods on the model instance in the interactive 
    # Python prompt. 

#     dev_corpus = corpus_reader("hw1_data/brown_test.txt",model.lexicon)
#     print(model.perplexity(dev_corpus))

    # Testing perplexity: 
    # dev_corpus = corpus_reader(sys.argv[2], model.lexicon)
    # pp = model.perplexity(dev_corpus)
    # print(pp)


    # Essay scoring experiment: 
    # acc = essay_scoring_experiment('train_high.txt', 'train_low.txt", "test_high", "test_low")
    # print(acc)

CHECK TRIGRAM [('START', 'START', 'the'), ('START', 'the', 'fulton'), ('the', 'fulton', 'county'), ('fulton', 'county', 'grand'), ('county', 'grand', 'jury'), ('grand', 'jury', 'said'), ('jury', 'said', 'friday'), ('said', 'friday', 'an'), ('friday', 'an', 'investigation'), ('an', 'investigation', 'of'), ('investigation', 'of', 'atlanta'), ('of', 'atlanta', "'s"), ('atlanta', "'s", 'recent'), ("'s", 'recent', 'primary'), ('recent', 'primary', 'election'), ('primary', 'election', 'produced'), ('election', 'produced', '``'), ('produced', '``', 'no'), ('``', 'no', 'evidence'), ('no', 'evidence', "''"), ('evidence', "''", 'that'), ("''", 'that', 'any'), ('that', 'any', 'irregularities'), ('any', 'irregularities', 'took'), ('irregularities', 'took', 'place'), ('took', 'place', '.'), ('place', '.', 'STOP')]
trigram got is ('START', 'START', 'the')
trigram got is ('START', 'the', 'fulton')
trigram got is ('the', 'fulton', 'county')
trigram got is ('fulton', 'county', 'grand')
trigram got is (

trigram got is ('in', 'our', 'less')
trigram got is ('our', 'less', 'populous')
trigram got is ('less', 'populous', 'counties')
trigram got is ('populous', 'counties', "''")
trigram got is ('counties', "''", '.')
trigram got is ("''", '.', 'STOP')
CHECK TRIGRAM [('START', 'START', 'nevertheless'), ('START', 'nevertheless', ','), ('nevertheless', ',', '``'), (',', '``', 'we'), ('``', 'we', 'feel'), ('we', 'feel', 'that'), ('feel', 'that', 'in'), ('that', 'in', 'the'), ('in', 'the', 'future'), ('the', 'future', 'fulton'), ('future', 'fulton', 'county'), ('fulton', 'county', 'should'), ('county', 'should', 'receive'), ('should', 'receive', 'some'), ('receive', 'some', 'portion'), ('some', 'portion', 'of'), ('portion', 'of', 'these'), ('of', 'these', 'available'), ('these', 'available', 'funds'), ('available', 'funds', "''"), ('funds', "''", ','), ("''", ',', 'the'), (',', 'the', 'jurors'), ('the', 'jurors', 'said'), ('jurors', 'said', '.'), ('said', '.', 'STOP')]
trigram got is ('START', 

trigram got is ('race', ',', 'a')
trigram got is (',', 'a', 'top')
trigram got is ('a', 'top', 'official')
trigram got is ('top', 'official', 'said')
trigram got is ('official', 'said', 'wednesday')
trigram got is ('said', 'wednesday', '.')
trigram got is ('wednesday', '.', 'STOP')
CHECK TRIGRAM [('START', 'START', 'robert'), ('START', 'robert', 'snodgrass'), ('robert', 'snodgrass', ','), ('snodgrass', ',', 'state'), (',', 'state', 'gop'), ('state', 'gop', 'chairman'), ('gop', 'chairman', ','), ('chairman', ',', 'said'), (',', 'said', 'a'), ('said', 'a', 'meeting'), ('a', 'meeting', 'held'), ('meeting', 'held', 'tuesday'), ('held', 'tuesday', 'night'), ('tuesday', 'night', 'in'), ('night', 'in', 'blue'), ('in', 'blue', 'ridge'), ('blue', 'ridge', 'brought'), ('ridge', 'brought', 'enthusiastic'), ('brought', 'enthusiastic', 'responses'), ('enthusiastic', 'responses', 'from'), ('responses', 'from', 'the'), ('from', 'the', 'audience'), ('the', 'audience', '.'), ('audience', '.', 'STOP')]


trigram got is ('the', 'house', 'in')
trigram got is ('house', 'in', 'a')
trigram got is ('in', 'a', 'privilege')
trigram got is ('a', 'privilege', 'resolution')
trigram got is ('privilege', 'resolution', 'to')
trigram got is ('resolution', 'to', '``')
trigram got is ('to', '``', 'endorse')
trigram got is ('``', 'endorse', 'increased')
trigram got is ('endorse', 'increased', 'federal')
trigram got is ('increased', 'federal', 'support')
trigram got is ('federal', 'support', 'for')
trigram got is ('support', 'for', 'public')
trigram got is ('for', 'public', 'education')
trigram got is ('public', 'education', ',')
trigram got is ('education', ',', 'provided')
trigram got is (',', 'provided', 'that')
trigram got is ('provided', 'that', 'such')
trigram got is ('that', 'such', 'funds')
trigram got is ('such', 'funds', 'be')
trigram got is ('funds', 'be', 'received')
trigram got is ('be', 'received', 'and')
trigram got is ('received', 'and', 'expended')
trigram got is ('and', 'expended', "''"

1.0


In [90]:
model.trigramcounts

Counter({('START', 'START', 'the'): 196,
         ('START', 'the', 'fulton'): 1,
         ('the', 'fulton', 'county'): 3,
         ('fulton', 'county', 'grand'): 1,
         ('county', 'grand', 'jury'): 1,
         ('grand', 'jury', 'said'): 1,
         ('jury', 'said', 'friday'): 1,
         ('said', 'friday', 'an'): 1,
         ('friday', 'an', 'investigation'): 1,
         ('an', 'investigation', 'of'): 1,
         ('investigation', 'of', 'atlanta'): 1,
         ('of', 'atlanta', "'s"): 1,
         ('atlanta', "'s", 'recent'): 1,
         ("'s", 'recent', 'primary'): 1,
         ('recent', 'primary', 'election'): 1,
         ('primary', 'election', 'produced'): 1,
         ('election', 'produced', '``'): 1,
         ('produced', '``', 'no'): 1,
         ('``', 'no', 'evidence'): 1,
         ('no', 'evidence', "''"): 1,
         ('evidence', "''", 'that'): 1,
         ("''", 'that', 'any'): 1,
         ('that', 'any', 'irregularities'): 1,
         ('any', 'irregularities', 'took'): 

In [45]:
model.smoothed_trigram_probability(('START', 'START', 'the'))

TRIGRAM IS:  ('START', 'START', 'the')


0.019999999999999997

In [81]:
model.raw_trigram_probability(('the', 'fulton', 'county'))

trigram got is ('the', 'fulton', 'county')


0.5

In [61]:
model.bigramcounts[('the','fulton')]

6

In [66]:
model.trigramcounts[('the', 'fulton', 'county')]

3

In [57]:
model.trigramcounts

Counter({('START', 'START', 'the'): 196,
         ('START', 'the', 'fulton'): 1,
         ('the', 'fulton', 'county'): 3,
         ('fulton', 'county', 'grand'): 1,
         ('county', 'grand', 'jury'): 1,
         ('grand', 'jury', 'said'): 1,
         ('jury', 'said', 'friday'): 1,
         ('said', 'friday', 'an'): 1,
         ('friday', 'an', 'investigation'): 1,
         ('an', 'investigation', 'of'): 1,
         ('investigation', 'of', 'atlanta'): 1,
         ('of', 'atlanta', "'s"): 1,
         ('atlanta', "'s", 'recent'): 1,
         ("'s", 'recent', 'primary'): 1,
         ('recent', 'primary', 'election'): 1,
         ('primary', 'election', 'produced'): 1,
         ('election', 'produced', '``'): 1,
         ('produced', '``', 'no'): 1,
         ('``', 'no', 'evidence'): 1,
         ('no', 'evidence', "''"): 1,
         ('evidence', "''", 'that'): 1,
         ("''", 'that', 'any'): 1,
         ('that', 'any', 'irregularities'): 1,
         ('any', 'irregularities', 'took'): 

CHECK TRIGRAM [('START', 'START', ('START', 'START', 'the')), ('START', ('START', 'START', 'the'), ('START', 'the', 'fulton')), (('START', 'START', 'the'), ('START', 'the', 'fulton'), ('the', 'fulton', 'county')), (('START', 'the', 'fulton'), ('the', 'fulton', 'county'), ('fulton', 'county', 'grand')), (('the', 'fulton', 'county'), ('fulton', 'county', 'grand'), ('county', 'grand', 'jury')), (('fulton', 'county', 'grand'), ('county', 'grand', 'jury'), ('grand', 'jury', 'said')), (('county', 'grand', 'jury'), ('grand', 'jury', 'said'), ('jury', 'said', 'friday')), (('grand', 'jury', 'said'), ('jury', 'said', 'friday'), ('said', 'friday', 'an')), (('jury', 'said', 'friday'), ('said', 'friday', 'an'), ('friday', 'an', 'investigation')), (('said', 'friday', 'an'), ('friday', 'an', 'investigation'), ('an', 'investigation', 'of')), (('friday', 'an', 'investigation'), ('an', 'investigation', 'of'), ('investigation', 'of', 'atlanta')), (('an', 'investigation', 'of'), ('investigation', 'of', 'a

0

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amogh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
from nltk.util import ngrams

In [7]:
text = "I am aware that nltk only offers bigrams and trigrams, but is there a way to split my text in four-grams, five-grams or even hundred-grams"
tokenize = nltk.word_tokenize(text)

In [8]:
tokenize = nltk.word_tokenize(text)

In [14]:
bigrams = list(ngrams(tokenize,1))
bigrams

[('I',),
 ('am',),
 ('aware',),
 ('that',),
 ('nltk',),
 ('only',),
 ('offers',),
 ('bigrams',),
 ('and',),
 ('trigrams',),
 (',',),
 ('but',),
 ('is',),
 ('there',),
 ('a',),
 ('way',),
 ('to',),
 ('split',),
 ('my',),
 ('text',),
 ('in',),
 ('four-grams',),
 (',',),
 ('five-grams',),
 ('or',),
 ('even',),
 ('hundred-grams',)]