In [9]:
import math
import random
from collections import Counter

start = "<s>"
end = "</s>"
unk = "<unk>"
class LanguageModel:
    # Initialize and train the model (ie, estimate the model's underlying probability
    # distribution from the training corpus)
    def __init__(self, corpus):
        #     print("""Your task is to implement four kinds of n-gram language models:
        #   a) an (unsmoothed) unigram model (UnigramModel)
        #   b) a unigram model smoothed using Laplace smoothing (SmoothedUnigramModel)
        #   c) an unsmoothed bigram model (BigramModel)
        #   d) a bigram model smoothed using linear interpolation smoothing (SmoothedBigramModelInt)
        #   """)
        self.corpus = corpus
    # enddef

    # Generate a sentence by drawing words according to the
    # model's probability distribution
    # Note: think about how to set the length of the sentence
    # in a principled way
    def generateSentence(self):
        print("Implement the generateSentence method in each subclass")
        return "mary had a little lamb ."
    # emddef

    # Given a sentence (sen), return the probability of
    # that sentence under the model
    def getSentenceProbability(self, sen):
        print("Implement the getSentenceProbability method in each subclass")
        return 0.0
    # enddef

    # Given a corpus, calculate and return its perplexity
    # (normalized inverse log probability)
    def getCorpusPerplexity(self, corpus):
        print("Implement the getCorpusPerplexity method")
        return 0.0
    # enddef

    # Given a file (filename) and the number of sentences, generate a list
    # of sentences and write each to file along with its model probability.
    # Note: you shouldn't need to change this method
    def generateSentencesToFile(self, numberOfSentences, filename):
        filePointer = open(filename, 'w+')
        for i in range(0, numberOfSentences):
            sen = self.generateSentence()
            prob = self.getSentenceProbability(sen)

            stringGenerated = str(prob) + " " + " ".join(sen)
            print(stringGenerated, end="\n", file=filePointer)


class UnigramModel(LanguageModel):
    def __init__(self, corpus):
        super().__init__(corpus)
        self.count = Counter(
            [word for sentence in self.corpus for word in sentence])

    def generateSentence(self):
        sentence = [start]

        while True:
            word = random.choice(list(self.count.keys()))
            if word == end:
                break
            sentence.append(word)
            if len(sentence) > 20:
                break

        return sentence[1:]

    def getSentenceProbability(self, sen):
        log_probability = 0.0  # Initialize probability to 0 since it's log probability

        for word in sen:
            log_probability += math.log(
                self.count[word] / sum(self.count.values()))

        return log_probability

def getCorpusPerplexity(self, corpus):
    total_words = 0
    log_probability = 0.0

    for sentence in corpus:
        total_words += len(sentence)
        log_probability += self.getSentenceProbability(sentence)

    return pow(log_probability, -1 / total_words)

In [10]:
# Example corpus
example_corpus = [
    ["<s>", "apple", "is", "a", "fruit", "</s>"],
    ["<s>", "banana", "is", "yellow", "</s>"],
    ["<s>", "orange", "is", "citrus", "</s>"]
]

# Create an instance of UnigramModel
unigram_model = UnigramModel(example_corpus)

# Generate a sentence using the model
generated_sentence = unigram_model.generateSentence()
print("Generated Sentence:", generated_sentence)

# Calculate the probability of the generated sentence
sentence_probability = unigram_model.getSentenceProbability(generated_sentence)
print("Probability of Generated Sentence:", sentence_probability)

# Calculate the perplexity of the entire corpus
corpus_perplexity = unigram_model.getCorpusPerplexity(example_corpus)
print("Corpus Perplexity:", corpus_perplexity)


Generated Sentence: ['orange', 'apple', 'yellow', 'fruit', '<s>', 'yellow', 'orange', '<s>', '<s>', 'citrus', '<s>']
Probability of Generated Sentence: -26.104026789965154
Implement the getCorpusPerplexity method
Corpus Perplexity: 0.0
