In [2]:
import pandas as pd
import nltk, re, pprint, string
from nltk import word_tokenize, sent_tokenize
from prettytable import PrettyTable
from nltk import FreqDist, bigrams
from collections import Counter

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
corpus = "You book a flight. I read a book. You read!"
corpus = "eos " + corpus

corpus = corpus.lower()
corpus = corpus.replace(",", "").replace(".", " eos").replace("!", " eos").replace("?", " eos").replace(":", "").replace(";", "").replace("-", "").replace("'s", "").replace("'", "").replace('"', "")
print(corpus)


eos you book a flight eos i read a book eos you read eos


In [5]:
tokens = word_tokenize(corpus)
print(tokens)
unique_words = [""] + tokens
unique_words = pd.unique(unique_words)
print(unique_words)

['eos', 'you', 'book', 'a', 'flight', 'eos', 'i', 'read', 'a', 'book', 'eos', 'you', 'read', 'eos']
['' 'eos' 'you' 'book' 'a' 'flight' 'i' 'read']


In [6]:
bigram_freq = FreqDist(tokens)
feq_table = PrettyTable()
feq_table.field_names = ["Word","Frequency"]
for word in unique_words:
  if word == 'eos' or word == '':
    continue
  feq_table.add_row([word,bigram_freq[word]])

print("Frequency Table")
print(feq_table)

Frequency Table
+--------+-----------+
|  Word  | Frequency |
+--------+-----------+
|  you   |     2     |
|  book  |     2     |
|   a    |     2     |
| flight |     1     |
|   i    |     1     |
|  read  |     2     |
+--------+-----------+


In [8]:
def generate_bigrams(corpus):
    tokens = word_tokenize(corpus.lower())
    bigram_list = list(bigrams(tokens))
    return bigram_list

def calculate_bigram_frequencies(bigram_list):
    bigram_counts = Counter(bigram_list)
    return bigram_counts

all_bigrams = generate_bigrams(corpus)
bigram_frequencies = calculate_bigram_frequencies(all_bigrams)
print(all_bigrams)
bi_feq_table = PrettyTable()
bi_feq_table.field_names = ["Bigram","Frequency"]
unique_bigrams = set(all_bigrams)
for bigram in unique_bigrams:
  bi_feq_table.add_row([bigram,bigram_frequencies[bigram]])

print("Bigram Frequency Table")
print(bi_feq_table)

[('eos', 'you'), ('you', 'book'), ('book', 'a'), ('a', 'flight'), ('flight', 'eos'), ('eos', 'i'), ('i', 'read'), ('read', 'a'), ('a', 'book'), ('book', 'eos'), ('eos', 'you'), ('you', 'read'), ('read', 'eos')]
Bigram Frequency Table
+-------------------+-----------+
|       Bigram      | Frequency |
+-------------------+-----------+
|    ('eos', 'i')   |     1     |
|  ('you', 'read')  |     1     |
|   ('a', 'book')   |     1     |
|   ('book', 'a')   |     1     |
| ('flight', 'eos') |     1     |
|   ('read', 'a')   |     1     |
|  ('book', 'eos')  |     1     |
|   ('eos', 'you')  |     2     |
|  ('a', 'flight')  |     1     |
|  ('read', 'eos')  |     1     |
|  ('you', 'book')  |     1     |
|   ('i', 'read')   |     1     |
+-------------------+-----------+


In [14]:
def calculate_bigram_probabilities(bigram_list):
    unique_bigrams = set(bigram_list)
    total_bigrams = len(bigram_list)
    bigram_probabilities = {bigram: (bigram_frequencies[bigram]) / (bigram_freq[bigram[0]])
                           for bigram in unique_bigrams}
    return bigram_probabilities

print("Bigram Probability Table:")
bigram_probabilities = calculate_bigram_probabilities(unique_bigrams)
for bigram, prob in bigram_probabilities.items():
    print(f"{bigram}: {prob:.4f}")

Bigram Probability Table:
('eos', 'i'): 0.2500
('you', 'read'): 0.5000
('a', 'book'): 0.5000
('book', 'a'): 0.5000
('flight', 'eos'): 1.0000
('read', 'a'): 0.5000
('book', 'eos'): 0.5000
('eos', 'you'): 0.5000
('a', 'flight'): 0.5000
('read', 'eos'): 0.5000
('you', 'book'): 0.5000
('i', 'read'): 1.0000


In [11]:
table = PrettyTable()
unique_words=[""]+tokens
unique_words=pd.unique(unique_words)
print(unique_words)
h = {}
for token in tokens:
  h[token] = tokens.count(token)
print(h)
table.field_names=[i for i in unique_words]
unique_words=unique_words[1:]
for i in unique_words:
  row=[i]
  for j in unique_words:
    count=0
    for k in range(len(tokens)-1):
      if tokens[k]==i and tokens[k+1]==j:
        count+=1
    row.append(count/h[i])
  table.add_row(row)
print(table)

['' 'eos' 'you' 'book' 'a' 'flight' 'i' 'read']
{'eos': 4, 'you': 2, 'book': 2, 'a': 2, 'flight': 1, 'i': 1, 'read': 2}
+--------+-----+-----+------+-----+--------+------+------+
|        | eos | you | book |  a  | flight |  i   | read |
+--------+-----+-----+------+-----+--------+------+------+
|  eos   | 0.0 | 0.5 | 0.0  | 0.0 |  0.0   | 0.25 | 0.0  |
|  you   | 0.0 | 0.0 | 0.5  | 0.0 |  0.0   | 0.0  | 0.5  |
|  book  | 0.5 | 0.0 | 0.0  | 0.5 |  0.0   | 0.0  | 0.0  |
|   a    | 0.0 | 0.0 | 0.5  | 0.0 |  0.5   | 0.0  | 0.0  |
| flight | 1.0 | 0.0 | 0.0  | 0.0 |  0.0   | 0.0  | 0.0  |
|   i    | 0.0 | 0.0 | 0.0  | 0.0 |  0.0   | 0.0  | 1.0  |
|  read  | 0.5 | 0.0 | 0.0  | 0.5 |  0.0   | 0.0  | 0.0  |
+--------+-----+-----+------+-----+--------+------+------+


In [12]:
table = PrettyTable()
unique_words=[""]+tokens
unique_words=pd.unique(unique_words)
print(unique_words)
h = {}
for token in tokens:
  h[token] = tokens.count(token)
print(h)
table.field_names=[i for i in unique_words]
unique_words=unique_words[1:]
for i in unique_words:
  row=[i]
  for j in unique_words:
    count=0
    for k in range(len(tokens)-1):
      if tokens[k]==i and tokens[k+1]==j:
        count+=1
    row.append(round((count+1)/(h[i] + len(unique_words)),2))
  table.add_row(row)
print(table)

['' 'eos' 'you' 'book' 'a' 'flight' 'i' 'read']
{'eos': 4, 'you': 2, 'book': 2, 'a': 2, 'flight': 1, 'i': 1, 'read': 2}
+--------+------+------+------+------+--------+------+------+
|        | eos  | you  | book |  a   | flight |  i   | read |
+--------+------+------+------+------+--------+------+------+
|  eos   | 0.09 | 0.27 | 0.09 | 0.09 |  0.09  | 0.18 | 0.09 |
|  you   | 0.11 | 0.11 | 0.22 | 0.11 |  0.11  | 0.11 | 0.22 |
|  book  | 0.22 | 0.11 | 0.11 | 0.22 |  0.11  | 0.11 | 0.11 |
|   a    | 0.11 | 0.11 | 0.22 | 0.11 |  0.22  | 0.11 | 0.11 |
| flight | 0.25 | 0.12 | 0.12 | 0.12 |  0.12  | 0.12 | 0.12 |
|   i    | 0.12 | 0.12 | 0.12 | 0.12 |  0.12  | 0.12 | 0.25 |
|  read  | 0.22 | 0.11 | 0.11 | 0.22 |  0.11  | 0.11 | 0.11 |
+--------+------+------+------+------+--------+------+------+


In [32]:
def N_Grams_smoothing(bigram_list):
    bigram_freq = FreqDist(bigram_list)
    # Add-one smoothing
    unique_bigrams = set(bigram_list)
    total_bigrams = len(bigram_list)
    bigram_probabilities = {bigram: (bigram_freq[bigram] + 1) / (total_bigrams + len(unique_bigrams))
                           for bigram in unique_bigrams}

    return bigram_probabilities

In [35]:
def calculate_sentence_probability(sentence, bigram_probabilities):
    probability = 1.0
    for bigram in sentence:
        if bigram_frequencies[bigram] == 0:
          bigram_probabilities = N_Grams_smoothing(list(unique_bigrams) +sentence)

        probability *= bigram_probabilities[bigram]

    return probability

In [30]:
test_sentence = "you read a book."
test_sentence = test_sentence.lower()

test_sentence_pro = test_sentence.replace(",", "").replace(".", " eos").replace("!", " eos").replace("?", " eos").replace(":", "").replace(";", "").replace("-", "").replace("'s", "").replace("'", "").replace('"', "")
test_sentence_pro = "eos " + test_sentence_pro
tokenized_test_sentence = word_tokenize(test_sentence_pro)
test_sentence_bigrams = list(bigrams(tokenized_test_sentence))

# Calculate probability of the test sentence
probability = calculate_sentence_probability(test_sentence_bigrams, bigram_probabilities)

# Display calculated probability for the test sentence
print(f"\nThe probability of the sentence '{test_sentence}' is: {probability:.6f}")


The probability of the sentence 'you read a book.' is: 0.031250


In [36]:
test_sentence = "Are you in hosptial."
test_sentence = test_sentence.lower()

test_sentence_pro = test_sentence.replace(",", "").replace(".", " eos").replace("!", " eos").replace("?", " eos").replace(":", "").replace(";", "").replace("-", "").replace("'s", "").replace("'", "").replace('"', "")
test_sentence_pro = "eos " + test_sentence_pro
tokenized_test_sentence = word_tokenize(test_sentence_pro)
test_sentence_bigrams = list(bigrams(tokenized_test_sentence))

# Calculate probability of the test sentence
probability = calculate_sentence_probability(test_sentence_bigrams, bigram_probabilities)

# Display calculated probability for the test sentence
print(f"\nThe probability of the sentence '{test_sentence}' is: {probability:.6f}")


The probability of the sentence 'are you in hosptial.' is: 0.000001
