# Imports


In [None]:
from bs4 import BeautifulSoup
import requests
from nltk.tokenize import word_tokenize
import nltk
# Fetch NLTK's 'punkt' tokenizer data; presumably required by word_tokenize,
# which is used in the word-extraction cell — confirm for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Corpus creation

In [None]:
# Download the Wikipedia article that serves as the corpus.
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(
    "https://ro.wikipedia.org/wiki/O_mie_nouă_sute_optzeci_și_patru_(roman)",
    timeout=30,
)
# Fail loudly on an HTTP error instead of writing an error page into the corpus.
page.raise_for_status()

# Parse the HTML of the page.
soup = BeautifulSoup(page.content, 'html.parser')

# Write the text of every <p> element to the corpus file.
# The file is opened in "wb" (truncate) rather than "ab" (append) so that
# re-running this cell rebuilds the corpus instead of duplicating it, which
# would inflate every word and bigram count computed below.
# `with` guarantees the handle is closed even if a write fails.
with open("corpus.txt", "wb") as f:
    for item in soup.find_all('p'):
        f.write(item.get_text().encode("UTF-8"))

# Word extraction

In [None]:
# Read the corpus back; `with` closes the file handle as soon as the text is loaded.
with open("corpus.txt", "r", encoding="utf-8") as corpus_file:
  text = corpus_file.read()
words_list = word_tokenize(text)

# Characters treated as punctuation rather than word content. The backtick and
# the straight double quote are included because NLTK's tokenizer emits quote
# tokens such as `` and '' (see the NLTK tokenize documentation).
punctuation_chars = set(",.:;!?()„”-'[]–’`\"")

# Keep only tokens that carry word content.
words = list()
for item in words_list:
  # An opening „ is stripped from the front of the token it is attached to.
  if item.startswith("„"):
    item = item[1:]
  # Skip tokens made up solely of punctuation characters. This also covers
  # multi-character tokens such as "..." or "''" and the empty string that is
  # left over when the token was a lone „.
  if all(char in punctuation_chars for char in item):
    continue
  words.append(item)

print("All the words from the corpus: ")
print(len(words), words, "\n")


# Vocabulary of the corpus: every distinct word exactly once.
unique_words = set(words)
print("All the UNIQUE words from the corpus: ")
print(len(unique_words), unique_words)

All the words from the corpus: 
4260 ['O', 'mie', 'nouă', 'sute', 'optzeci', 'și', 'patru', 'scris', 'și', '1984', 'este', 'un', 'roman', 'politic', 'creat', 'de', 'George', 'Orwell', 'în', '1948', 'și', 'tipărit', 'în', '1949', 'Acțiunea', 'romanului', 'are', 'loc', 'într-un', 'viitor', 'distopic', 'și', 'prezintă', 'o', 'parte', 'din', 'viața', 'intelectualului', 'Winston', 'Smith', 'sub', 'opresiunea', 'guvernului', 'totalitar', 'al', 'Oceaniei', 'O', 'mie', 'nouă', 'sute', 'optzeci', 'și', 'patru', 'a', 'imprimat', 'foarte', 'mulți', 'termeni', 'și', 'idei', 'în', 'cultura', 'contemporană', 'și', 'mai', 'ales', 'în', 'limba', 'engleză', 'de', 'exemplu', 'Fratele', 'cel', 'Mare', 'dubla', 'gândire', 'poliția', 'gândirii', 'Soceng', 'Fratele', 'cel', 'Mare', 'stă', 'cu', 'ochii', 'pe', 'tine', 'este', 'un', 'simbol', 'al', 'controlului', 'excesiv', 'iar', 'adjectivul', 'orwellian', 'este', 'folosit', 'pentru', 'a', 'descrie', 'acțiunile', 'și', 'organizațiile', 'Oceaniei', 'Principal

# 2-gram language model


In [None]:
# Build the 2-gram model from `words`, the token list produced by the
# word-extraction cell:
#   list_of_bigrams       - every kept (w1, w2) pair, in corpus order, with repeats
#   bigram_counts         - how often each kept pair occurs
#   unigram_counts        - how often each word occurs
#   list_of_probabilities - maximum-likelihood estimate C(w1 w2) / C(w1) per pair
list_of_bigrams = []
bigram_counts = {}
unigram_counts = {}

# Count every token, including the very last one. Counting inside a loop that
# stops at len(words) - 1 would leave the final word one short.
for word in words:
  unigram_counts[word] = unigram_counts.get(word, 0) + 1

# Pair each word with its successor. A pair is kept only when the second word
# is lowercase; str.islower() is False for capitalised words and for tokens
# without cased letters (e.g. '1984'), so those pairs are skipped.
# NOTE(review): presumably meant to avoid pairing across sentence starts — confirm.
for first_word, second_word in zip(words, words[1:]):
  if second_word.islower():
    bigram = (first_word, second_word)
    list_of_bigrams.append(bigram)
    bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

print("\n All the possible Bigrams are: ")
print(list_of_bigrams)

print("\n Bigrams along with their frequency: ")
print(bigram_counts)

print("\n Unigrams along with their frequency: ")
print(unigram_counts)


# P(w2 | w1) = C(w1 w2) / C(w1), computed once per distinct bigram.
list_of_probabilities = {
    bigram: occurrences / unigram_counts[bigram[0]]
    for bigram, occurrences in bigram_counts.items()
}

print("\n Bigrams along with their probability: ")
print(list_of_probabilities)


 All the possible Bigrams are: 
[('O', 'mie'), ('mie', 'nouă'), ('nouă', 'sute'), ('sute', 'optzeci'), ('optzeci', 'și'), ('și', 'patru'), ('patru', 'scris'), ('scris', 'și'), ('1984', 'este'), ('este', 'un'), ('un', 'roman'), ('roman', 'politic'), ('politic', 'creat'), ('creat', 'de'), ('Orwell', 'în'), ('1948', 'și'), ('și', 'tipărit'), ('tipărit', 'în'), ('Acțiunea', 'romanului'), ('romanului', 'are'), ('are', 'loc'), ('loc', 'într-un'), ('într-un', 'viitor'), ('viitor', 'distopic'), ('distopic', 'și'), ('și', 'prezintă'), ('prezintă', 'o'), ('o', 'parte'), ('parte', 'din'), ('din', 'viața'), ('viața', 'intelectualului'), ('Smith', 'sub'), ('sub', 'opresiunea'), ('opresiunea', 'guvernului'), ('guvernului', 'totalitar'), ('totalitar', 'al'), ('O', 'mie'), ('mie', 'nouă'), ('nouă', 'sute'), ('sute', 'optzeci'), ('optzeci', 'și'), ('și', 'patru'), ('patru', 'a'), ('a', 'imprimat'), ('imprimat', 'foarte'), ('foarte', 'mulți'), ('mulți', 'termeni'), ('termeni', 'și'), ('și', 'idei'), ('

# Computing the probability for a new text

In [None]:
# Score a sentence with the bigram model built in the previous cells.
# Relies on `words`, `unique_words`, `bigram_counts`, `unigram_counts` and
# `list_of_probabilities` from those cells.

# Vocabulary size used by the add-one smoothing below.
V = len(unique_words)

# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the kernel session; the name is kept so existing references keep working.
input = "They buy a big house"
# input = "Acest articol se referă la romanul 1999."
# input = "O mie nouă sute optzeci și patru"

input_words = input.split()

# Consecutive word pairs of the sentence (empty for sentences shorter than two words).
bigrams = list(zip(input_words, input_words[1:]))

print("\n The bigrams in given sentence are: ")
print(bigrams)


# Number of sentence words that never occur in the corpus.
known_words = set(words)
count = sum(1 for item in input_words if item not in known_words)

# Smoothing is needed as soon as ANY bigram of the sentence is unseen, not only
# when a word is unknown: two known words can still form a pair that never
# occurred, and looking that pair up in list_of_probabilities would raise KeyError.
use_smoothing = count > 0 or any(bigram not in bigram_counts for bigram in bigrams)

sentence_probability = 1
for bigram in bigrams:
  first_word = bigram[0]
  if not use_smoothing:
    # Every bigram was seen: plain maximum-likelihood estimate.
    label, probability = '0: ', list_of_probabilities[bigram]
  elif bigram in bigram_counts:
    # Add-one smoothing, seen bigram: (C(w1 w2) + 1) / (C(w1) + V).
    label, probability = '1: ', (bigram_counts[bigram] + 1) / (unigram_counts[first_word] + V)
  elif first_word in unigram_counts:
    # Add-one smoothing, unseen bigram with a known first word: 1 / (C(w1) + V).
    label, probability = '2: ', 1 / (unigram_counts[first_word] + V)
  else:
    # Add-one smoothing, unknown first word (C(w1) = 0): 1 / V.
    label, probability = '3: ', 1 / V
  print(label, probability)
  sentence_probability *= probability


print('\n' + f'The probablility of sentence: "{input}" is ' + str(sentence_probability))


 The bigrams in given sentence are: 
[('They', 'buy'), ('buy', 'a'), ('a', 'big'), ('big', 'house')]
3:  0.0005630630630630631
3:  0.0005630630630630631
2:  0.0005414185165132648
3:  0.0005630630630630631

The probablility of sentence: "They buy a big house" is 9.665052562286267e-14


# The example from the course

In [None]:
# Self-contained toy corpus: three sentences delimited by <s> ... </s>.
# This cell rebinds `words` and the model variables used by the earlier cells.
words = ['<s>', 'there', 'is', 'a', 'big', 'house', '</s>', '<s>', 'i', 'buy', 
         'a', 'house', '</s>', '<s>', 'they', 'buy', 'the', 'new', 'house', '</s>']

# Model tables:
#   list_of_bigrams       - every kept (w1, w2) pair, in corpus order, with repeats
#   bigram_counts         - how often each kept pair occurs
#   unigram_counts        - how often each word occurs
#   list_of_probabilities - maximum-likelihood estimate C(w1 w2) / C(w1) per pair
list_of_bigrams = []
bigram_counts = {}
unigram_counts = {}

# Count every token, including the very last one. Counting inside a loop that
# stops at len(words) - 1 reports '</s>' as 2 although it occurs 3 times.
for word in words:
  unigram_counts[word] = unigram_counts.get(word, 0) + 1

# Pair each word with its successor; a pair is kept only when the second word
# is lowercase (true for every token of this toy corpus, markers included).
for first_word, second_word in zip(words, words[1:]):
  if second_word.islower():
    bigram = (first_word, second_word)
    list_of_bigrams.append(bigram)
    bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

print("\n All the possible Bigrams are: ")
print(list_of_bigrams)

print("\n Bigrams along with their frequency: ")
print(bigram_counts)

print("\n Unigrams along with their frequency: ")
print(unigram_counts)


# P(w2 | w1) = C(w1 w2) / C(w1), computed once per distinct bigram.
list_of_probabilities = {
    bigram: occurrences / unigram_counts[bigram[0]]
    for bigram, occurrences in bigram_counts.items()
}

print("\n Bigrams along with their probability: ")
print(list_of_probabilities)


# Vocabulary size used by the add-one smoothing below (hard-coded).
# NOTE(review): the toy corpus has 12 distinct tokens including <s> and </s>, so
# 10 presumably excludes the two markers — confirm against the course material.
V = 10

# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the kernel session; the name is kept so existing references keep working.
# input = "<s> they buy a big house </s>"
input = "<s> they buy a red house </s>"

input_words = input.split()
sentence_probability = 1

# Consecutive word pairs of the sentence (empty for sentences shorter than two words).
bigrams = list(zip(input_words, input_words[1:]))

print("\n The bigrams in given sentence are: ")
print(bigrams)


# Number of sentence words that never occur in the corpus.
known_words = set(words)
count = sum(1 for item in input_words if item not in known_words)

# Smoothing is needed as soon as ANY bigram of the sentence is unseen, not only
# when a word is unknown: two known words (e.g. 'the house') can still form a
# pair that never occurred, and looking it up in list_of_probabilities would
# raise KeyError.
use_smoothing = count > 0 or any(bigram not in bigram_counts for bigram in bigrams)

for bigram in bigrams:
  first_word = bigram[0]
  if not use_smoothing:
    # Every bigram was seen: plain maximum-likelihood estimate.
    label, probability = '0', list_of_probabilities[bigram]
  elif bigram in bigram_counts:
    # Add-one smoothing, seen bigram: (C(w1 w2) + 1) / (C(w1) + V).
    label, probability = '1', (bigram_counts[bigram] + 1) / (unigram_counts[first_word] + V)
  elif first_word in unigram_counts:
    # Add-one smoothing, unseen bigram with a known first word: 1 / (C(w1) + V).
    label, probability = '2', 1 / (unigram_counts[first_word] + V)
  else:
    # Add-one smoothing, unknown first word (C(w1) = 0): 1 / V.
    label, probability = '3', 1 / V
  print(label, probability)
  sentence_probability *= probability

print('\n' + f'The probablility of sentence: "{input}" is ' + str(sentence_probability))


 All the possible Bigrams are: 
[('<s>', 'there'), ('there', 'is'), ('is', 'a'), ('a', 'big'), ('big', 'house'), ('house', '</s>'), ('</s>', '<s>'), ('<s>', 'i'), ('i', 'buy'), ('buy', 'a'), ('a', 'house'), ('house', '</s>'), ('</s>', '<s>'), ('<s>', 'they'), ('they', 'buy'), ('buy', 'the'), ('the', 'new'), ('new', 'house'), ('house', '</s>')]

 Bigrams along with their frequency: 
{('<s>', 'there'): 1, ('there', 'is'): 1, ('is', 'a'): 1, ('a', 'big'): 1, ('big', 'house'): 1, ('house', '</s>'): 3, ('</s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'buy'): 1, ('buy', 'a'): 1, ('a', 'house'): 1, ('<s>', 'they'): 1, ('they', 'buy'): 1, ('buy', 'the'): 1, ('the', 'new'): 1, ('new', 'house'): 1}

 Unigrams along with their frequency: 
{'<s>': 3, 'there': 1, 'is': 1, 'a': 2, 'big': 1, 'house': 3, '</s>': 2, 'i': 1, 'buy': 2, 'they': 1, 'the': 1, 'new': 1}

 Bigrams along with their probability: 
{('<s>', 'there'): 0.3333333333333333, ('there', 'is'): 1.0, ('is', 'a'): 1.0, ('a', 'big'): 0.5, ('big'