## N-gram Models (without using any libraries)

In [152]:
class Ngram:
    def __init__(self, text, n=None):
        self.text = text
        self.n = n
        self.split_text = self.process()

    def process(self):
        text = self.text.lower()
        text = text.replace('.', ' ')
        text = text.replace(',', ' ')
        split_text = text.split()
        return split_text

    def ngram_context(self):
        ngram_list, context_list = [], []
        n = self.n
        for i in range(0, len(self.split_text) - n + 1):
            ngram = self.split_text[i:i + n]
            context = self.split_text[i:i + n - 1]
            context_list.append(context)
            ngram_list.append(ngram)
        return ngram_list, context_list

    def counts(self, ingram):
        ngramcount = 0
        contextcount = 0
        ngram_list = self.ngram_context()[0]
        context_list = self.ngram_context()[1]

        for i in ngram_list:
            if i == ingram:
                ngramcount += 1

        for i in context_list:
            if i == ingram[:-1]:
                contextcount += 1

        return ngramcount, contextcount

    def probability(self, ingram):
        ngram_list, context_list = self.ngram_context()
        ncount, ccount = self.counts(ingram)

        # Check for division by zero
        if ccount == 0:
            return 0.0

        prob = ncount / ccount
        return prob

    def perplexity(self):
        ngram_list = self.ngram_context()
        prob = 1
        for i in ngram_list:
            prob_i = self.probability(i)

            # Check if prob_i is 0.0, and if so, assign a small positive value (e.g., 1e-10)
            if prob_i == 0.0:
                prob_i = 1e-10

            prob = prob * prob_i

        # Check if prob is still 0.0 after the loop
        if prob == 0.0:
            return float('inf')  # Return infinity to indicate undefined perplexity
        else:
            perp = prob ** (-1 / self.n)
            return perp

In [137]:
text = "This is a test. It has some punctuation, like commas and periods."
ngram = Ngram(text)
ngram.split_text 
#assert ngram.split_text == ['this', 'is', 'a', 'test', 'it', 'has', 'some', 'punctuation', 'like', 'commas', 'and', 'periods']

['this',
 'is',
 'a',
 'test',
 'it',
 'has',
 'some',
 'punctuation',
 'like',
 'commas',
 'and',
 'periods']

In [138]:
text = "This is a test. It has some punctuation, like commas and periods."
ngram = Ngram(text, n=3)
ngram_list, context_list = ngram.ngram_context()
print(ngram_list)
print(context_list)
#assert ngram_list == [['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'it'], ['test', 'it', 'has'], ['it', 'has', 'some'], ['has', 'some', 'punctuation'], ['some', 'punctuation', 'like'], ['punctuation', 'like', 'commas'], ['like', 'commas', 'and'], ['commas', 'and', 'periods']]
#assert context_list == [['this', 'is'], ['is', 'a'], ['a', 'test'], ['test', 'it'], ['it', 'has'], ['has', 'some'], ['some', 'punctuation'], ['punctuation', 'like'], ['like', 'commas'], ['commas', 'and']]

[['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'it'], ['test', 'it', 'has'], ['it', 'has', 'some'], ['has', 'some', 'punctuation'], ['some', 'punctuation', 'like'], ['punctuation', 'like', 'commas'], ['like', 'commas', 'and'], ['commas', 'and', 'periods']]
[['this', 'is'], ['is', 'a'], ['a', 'test'], ['test', 'it'], ['it', 'has'], ['has', 'some'], ['some', 'punctuation'], ['punctuation', 'like'], ['like', 'commas'], ['commas', 'and']]


In [155]:
text = "This is a test. It has some punctuation, this like commas and  this periods."
ngram = Ngram(text, n=2)
ngram_list, context_list = ngram.ngram_context()
print(ngram_list)
print(context_list)
ngramcount  = ngram.counts([ 'this', 'is'])[0]
contextcount = ngram.counts([ 'this', 'is'])[1]
print(ngramcount) 
print(contextcount)
prob = ngram.probability(['this', 'is'])
prob

[['this', 'is'], ['is', 'a'], ['a', 'test'], ['test', 'it'], ['it', 'has'], ['has', 'some'], ['some', 'punctuation'], ['punctuation', 'this'], ['this', 'like'], ['like', 'commas'], ['commas', 'and'], ['and', 'this'], ['this', 'periods']]
[['this'], ['is'], ['a'], ['test'], ['it'], ['has'], ['some'], ['punctuation'], ['this'], ['like'], ['commas'], ['and'], ['this']]
1
3


0.3333333333333333

In [146]:
text = "This is a test. It has some punctuation, like commas and periods."
ngram = Ngram(text, n=2)
prob = ngram.probability(['this', 'is'])
prob 

1.0

In [143]:
text = "This is a test. It has some punctuation, like commas and periods."
ngram = Ngram(text, n=2)
perp = ngram.perplexity()
perp 

10000000000.0

In [100]:
text = "In the vast expanse of the universe, countless galaxies twinkle like distant stars, each harboring its own unique collection of celestial bodies, from massive black holes to shimmering nebulae, all governed by the laws of physics and the mysterious forces that shape the cosmos"

In [101]:
model = Ngram(text,3)

In [102]:
model.process()

['in',
 'the',
 'vast',
 'expanse',
 'of',
 'the',
 'universe',
 'countless',
 'galaxies',
 'twinkle',
 'like',
 'distant',
 'stars',
 'each',
 'harboring',
 'its',
 'own',
 'unique',
 'collection',
 'of',
 'celestial',
 'bodies',
 'from',
 'massive',
 'black',
 'holes',
 'to',
 'shimmering',
 'nebulae',
 'all',
 'governed',
 'by',
 'the',
 'laws',
 'of',
 'physics',
 'and',
 'the',
 'mysterious',
 'forces',
 'that',
 'shape',
 'the',
 'cosmos']

In [103]:
ngram = model.ngram_context()[0]
print(ngram)

[['in', 'the', 'vast'], ['the', 'vast', 'expanse'], ['vast', 'expanse', 'of'], ['expanse', 'of', 'the'], ['of', 'the', 'universe'], ['the', 'universe', 'countless'], ['universe', 'countless', 'galaxies'], ['countless', 'galaxies', 'twinkle'], ['galaxies', 'twinkle', 'like'], ['twinkle', 'like', 'distant'], ['like', 'distant', 'stars'], ['distant', 'stars', 'each'], ['stars', 'each', 'harboring'], ['each', 'harboring', 'its'], ['harboring', 'its', 'own'], ['its', 'own', 'unique'], ['own', 'unique', 'collection'], ['unique', 'collection', 'of'], ['collection', 'of', 'celestial'], ['of', 'celestial', 'bodies'], ['celestial', 'bodies', 'from'], ['bodies', 'from', 'massive'], ['from', 'massive', 'black'], ['massive', 'black', 'holes'], ['black', 'holes', 'to'], ['holes', 'to', 'shimmering'], ['to', 'shimmering', 'nebulae'], ['shimmering', 'nebulae', 'all'], ['nebulae', 'all', 'governed'], ['all', 'governed', 'by'], ['governed', 'by', 'the'], ['by', 'the', 'laws'], ['the', 'laws', 'of'], ['l

In [104]:
context = model.ngram_context()[1]
print(context)

[['in', 'the'], ['the', 'vast'], ['vast', 'expanse'], ['expanse', 'of'], ['of', 'the'], ['the', 'universe'], ['universe', 'countless'], ['countless', 'galaxies'], ['galaxies', 'twinkle'], ['twinkle', 'like'], ['like', 'distant'], ['distant', 'stars'], ['stars', 'each'], ['each', 'harboring'], ['harboring', 'its'], ['its', 'own'], ['own', 'unique'], ['unique', 'collection'], ['collection', 'of'], ['of', 'celestial'], ['celestial', 'bodies'], ['bodies', 'from'], ['from', 'massive'], ['massive', 'black'], ['black', 'holes'], ['holes', 'to'], ['to', 'shimmering'], ['shimmering', 'nebulae'], ['nebulae', 'all'], ['all', 'governed'], ['governed', 'by'], ['by', 'the'], ['the', 'laws'], ['laws', 'of'], ['of', 'physics'], ['physics', 'and'], ['and', 'the'], ['the', 'mysterious'], ['mysterious', 'forces'], ['forces', 'that'], ['that', 'shape'], ['shape', 'the']]


In [105]:
for i in ngram:
    print(f'count of this {i} is {model.counts(i)[0]}')

count of this ['in', 'the', 'vast'] is 1
count of this ['the', 'vast', 'expanse'] is 1
count of this ['vast', 'expanse', 'of'] is 1
count of this ['expanse', 'of', 'the'] is 1
count of this ['of', 'the', 'universe'] is 1
count of this ['the', 'universe', 'countless'] is 1
count of this ['universe', 'countless', 'galaxies'] is 1
count of this ['countless', 'galaxies', 'twinkle'] is 1
count of this ['galaxies', 'twinkle', 'like'] is 1
count of this ['twinkle', 'like', 'distant'] is 1
count of this ['like', 'distant', 'stars'] is 1
count of this ['distant', 'stars', 'each'] is 1
count of this ['stars', 'each', 'harboring'] is 1
count of this ['each', 'harboring', 'its'] is 1
count of this ['harboring', 'its', 'own'] is 1
count of this ['its', 'own', 'unique'] is 1
count of this ['own', 'unique', 'collection'] is 1
count of this ['unique', 'collection', 'of'] is 1
count of this ['collection', 'of', 'celestial'] is 1
count of this ['of', 'celestial', 'bodies'] is 1
count of this ['celestial

In [106]:
for i in context:
    print(f'count of this {i} is {model.counts(i)[1]}')

count of this ['in', 'the'] is 0
count of this ['the', 'vast'] is 0
count of this ['vast', 'expanse'] is 0
count of this ['expanse', 'of'] is 0
count of this ['of', 'the'] is 0
count of this ['the', 'universe'] is 0
count of this ['universe', 'countless'] is 0
count of this ['countless', 'galaxies'] is 0
count of this ['galaxies', 'twinkle'] is 0
count of this ['twinkle', 'like'] is 0
count of this ['like', 'distant'] is 0
count of this ['distant', 'stars'] is 0
count of this ['stars', 'each'] is 0
count of this ['each', 'harboring'] is 0
count of this ['harboring', 'its'] is 0
count of this ['its', 'own'] is 0
count of this ['own', 'unique'] is 0
count of this ['unique', 'collection'] is 0
count of this ['collection', 'of'] is 0
count of this ['of', 'celestial'] is 0
count of this ['celestial', 'bodies'] is 0
count of this ['bodies', 'from'] is 0
count of this ['from', 'massive'] is 0
count of this ['massive', 'black'] is 0
count of this ['black', 'holes'] is 0
count of this ['holes',

In [107]:
model.probability(['mysterious', 'forces', 'that'])

1.0

In [110]:
p = 1
for i in ngram:
    p *= model.probability(i)
print(p)

1.0


In [108]:
model.perplexity()

4641588.8336127745

In [126]:
# Example usage:
training_corpus = "The quick brown fox jumps over the lazy dog. The dog barks loudly in the quiet night. Cats sleep peacefully on the soft couch. Birds sing joyfully in the early morning hours. The sun rises in the east, casting a warm glow over the countryside. The moon shines brightly in the clear night sky, illuminating the landscape below. People go to work in the bustling city, while children play in the colorful park nearby. Cars drive on the busy streets, creating a constant hum of activity. The city never sleeps, with restaurants and shops open throughout the night, offering a wide variety of entertainment and dining options."
n = 3  # Use trigrams
model1 = Ngram(training_corpus, n)

# Calculate probabilities for different n-grams
ngram1 = ["the", "brown", "fox"]
ngram2 = ["jumps", "over", "the"]

probability1 = model1.probability(ngram1)
probability2 = model1.probability(ngram2)

print(f"Probability of '{' '.join(ngram1)}': {probability1}")
print(f"Probability of '{' '.join(ngram2)}': {probability2}")


Probability of 'the brown fox': 0.0
Probability of 'jumps over the': 1.0


In [127]:
ngram11 = model1.ngram_context()[0]
print(ngram11)

[['the', 'quick', 'brown'], ['quick', 'brown', 'fox'], ['brown', 'fox', 'jumps'], ['fox', 'jumps', 'over'], ['jumps', 'over', 'the'], ['over', 'the', 'lazy'], ['the', 'lazy', 'dog'], ['lazy', 'dog', 'the'], ['dog', 'the', 'dog'], ['the', 'dog', 'barks'], ['dog', 'barks', 'loudly'], ['barks', 'loudly', 'in'], ['loudly', 'in', 'the'], ['in', 'the', 'quiet'], ['the', 'quiet', 'night'], ['quiet', 'night', 'cats'], ['night', 'cats', 'sleep'], ['cats', 'sleep', 'peacefully'], ['sleep', 'peacefully', 'on'], ['peacefully', 'on', 'the'], ['on', 'the', 'soft'], ['the', 'soft', 'couch'], ['soft', 'couch', 'birds'], ['couch', 'birds', 'sing'], ['birds', 'sing', 'joyfully'], ['sing', 'joyfully', 'in'], ['joyfully', 'in', 'the'], ['in', 'the', 'early'], ['the', 'early', 'morning'], ['early', 'morning', 'hours'], ['morning', 'hours', 'the'], ['hours', 'the', 'sun'], ['the', 'sun', 'rises'], ['sun', 'rises', 'in'], ['rises', 'in', 'the'], ['in', 'the', 'east'], ['the', 'east', 'casting'], ['east', 'ca

In [128]:
print(model1.ngram_context()[1])

[['the', 'quick'], ['quick', 'brown'], ['brown', 'fox'], ['fox', 'jumps'], ['jumps', 'over'], ['over', 'the'], ['the', 'lazy'], ['lazy', 'dog'], ['dog', 'the'], ['the', 'dog'], ['dog', 'barks'], ['barks', 'loudly'], ['loudly', 'in'], ['in', 'the'], ['the', 'quiet'], ['quiet', 'night'], ['night', 'cats'], ['cats', 'sleep'], ['sleep', 'peacefully'], ['peacefully', 'on'], ['on', 'the'], ['the', 'soft'], ['soft', 'couch'], ['couch', 'birds'], ['birds', 'sing'], ['sing', 'joyfully'], ['joyfully', 'in'], ['in', 'the'], ['the', 'early'], ['early', 'morning'], ['morning', 'hours'], ['hours', 'the'], ['the', 'sun'], ['sun', 'rises'], ['rises', 'in'], ['in', 'the'], ['the', 'east'], ['east', 'casting'], ['casting', 'a'], ['a', 'warm'], ['warm', 'glow'], ['glow', 'over'], ['over', 'the'], ['the', 'countryside'], ['countryside', 'the'], ['the', 'moon'], ['moon', 'shines'], ['shines', 'brightly'], ['brightly', 'in'], ['in', 'the'], ['the', 'clear'], ['clear', 'night'], ['night', 'sky'], ['sky', 'il

In [130]:
model1.counts(ngram1)[0]

0

In [131]:
model1.counts(ngram1)[1]

0

## Ngram with Context 

N-gram-based context probability is calculated by counting the occurrences of specific n-grams in a training corpus and then using these counts to estimate the probability of encountering a part?icular word or sequence of words following a given context. The basic idea is that the more frequently a specific n-gram appears in the training data, the higher its context probability.


Say we want the probability of the word "cat" following the words "The black".
#With context :
Context : "The black" , Target word : 'cat'. 
 For trigram, we look at how many times "The black cat"  appears in large text corpus. If it appears frequently, the context probability of "cat" following "The black" would be relatively high because we've observed this specific sequence often in our training data. Taking the surrounding context into account, and it helps us make more accurate predictions. 

