[Reference](https://medium.com/geekculture/nlp-building-a-basic-automated-text-filler-an-introduction-d560ac2b5cdf)

In [1]:
from bs4 import BeautifulSoup
from nltk.util import ngrams
from collections import defaultdict
from nltk import trigrams
from nltk.tokenize import RegexpTokenizer
import requests

#load fetch speech text from blog
response = requests.get("https://maxsiollun.wordpress.com/great-speeches-in-nigerias-history/")
soup = BeautifulSoup(response.text,'html.parser')
sentence = soup.find_all('p',text=True)
print(sentence[1:3])

[<p>Violence has never been an instrument used by us, as founding fathers of the Nigerian Republic, to solve political problems. In the British tradition, we talked the Colonial Office into accepting our challenges for the demerits and merits of our case for self-government.  After six constitutional conferences in 1953, 1954, 1957, 1958, 1959, and 1960, Great Britain conceded to us the right to assert our political independence as from October 1, 1960.  None of the Nigerian political parties ever adopted violent means to gain our political freedom and we are happy to claim that not a drop of British or Nigerian blood was shed in the course of our national struggle for our place in the sun. This historical fact enabled me to state publicly in Nigeria that Her Majesty’s Government has presented self-government to us on a platter of gold. Of course, my contemporaries scorned at me, but the facts of history are irrefutable. I consider it most unfortunate that our ‘Young Turks’ decided to 

# Preprocess Text


In [5]:
note='' 
#we will merge the list string values into a single string
for line in sentence[1:3]:
    note+= str(line)
#convert text to lower case
sentence=note.lower()
#convert Sentence into Tokens and extract all punctuations
tokenizer = RegexpTokenizer(r'\w+')
tk_sentence=tokenizer.tokenize(sentence)

In [6]:
tk_sentence

['p',
 'violence',
 'has',
 'never',
 'been',
 'an',
 'instrument',
 'used',
 'by',
 'us',
 'as',
 'founding',
 'fathers',
 'of',
 'the',
 'nigerian',
 'republic',
 'to',
 'solve',
 'political',
 'problems',
 'in',
 'the',
 'british',
 'tradition',
 'we',
 'talked',
 'the',
 'colonial',
 'office',
 'into',
 'accepting',
 'our',
 'challenges',
 'for',
 'the',
 'demerits',
 'and',
 'merits',
 'of',
 'our',
 'case',
 'for',
 'self',
 'government',
 'after',
 'six',
 'constitutional',
 'conferences',
 'in',
 '1953',
 '1954',
 '1957',
 '1958',
 '1959',
 'and',
 '1960',
 'great',
 'britain',
 'conceded',
 'to',
 'us',
 'the',
 'right',
 'to',
 'assert',
 'our',
 'political',
 'independence',
 'as',
 'from',
 'october',
 '1',
 '1960',
 'none',
 'of',
 'the',
 'nigerian',
 'political',
 'parties',
 'ever',
 'adopted',
 'violent',
 'means',
 'to',
 'gain',
 'our',
 'political',
 'freedom',
 'and',
 'we',
 'are',
 'happy',
 'to',
 'claim',
 'that',
 'not',
 'a',
 'drop',
 'of',
 'british',
 'or'

# Create Tri-grams


In [7]:
gram_sentence=list(ngrams(tk_sentence, 3))
gram_sentence

[('p', 'violence', 'has'),
 ('violence', 'has', 'never'),
 ('has', 'never', 'been'),
 ('never', 'been', 'an'),
 ('been', 'an', 'instrument'),
 ('an', 'instrument', 'used'),
 ('instrument', 'used', 'by'),
 ('used', 'by', 'us'),
 ('by', 'us', 'as'),
 ('us', 'as', 'founding'),
 ('as', 'founding', 'fathers'),
 ('founding', 'fathers', 'of'),
 ('fathers', 'of', 'the'),
 ('of', 'the', 'nigerian'),
 ('the', 'nigerian', 'republic'),
 ('nigerian', 'republic', 'to'),
 ('republic', 'to', 'solve'),
 ('to', 'solve', 'political'),
 ('solve', 'political', 'problems'),
 ('political', 'problems', 'in'),
 ('problems', 'in', 'the'),
 ('in', 'the', 'british'),
 ('the', 'british', 'tradition'),
 ('british', 'tradition', 'we'),
 ('tradition', 'we', 'talked'),
 ('we', 'talked', 'the'),
 ('talked', 'the', 'colonial'),
 ('the', 'colonial', 'office'),
 ('colonial', 'office', 'into'),
 ('office', 'into', 'accepting'),
 ('into', 'accepting', 'our'),
 ('accepting', 'our', 'challenges'),
 ('our', 'challenges', 'for'

# Build Markov Model

In [8]:
# Create Word Model
word_model = defaultdict(lambda: defaultdict(lambda: 0))


for sentence in tk_sentence:
    for first_word, second_word, word_label in trigrams(tk_sentence,pad_left=True,pad_right=True):
        word_model[(first_word, second_word)][word_label] += 1
dict(word_model)

{('0', 'strong'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'strong': 294}),
 ('1', '1960'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'none': 294}),
 ('1953', '1954'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'1957': 294}),
 ('1954', '1957'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'1958': 294}),
 ('1957', '1958'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'1959': 294}),
 ('1958', '1959'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'and': 294}),
 ('1959', 'and'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'1960': 294}),
 ('1960', 'great'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'britain': 294}),
 ('1960', 'none'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'of': 294}),
 ('a', 'drop'): defaultdict(<f

In [9]:
#run convert the word occurance scores into probabilities
for words_train in word_model:
    total_count = float(sum(word_model[words_train].values()))
    for word_test in word_model[words_train]:
        word_model[words_train][word_test] /= total_count

# Predict Words

In [10]:
#predict the next word after 'the', 'nigerian'
dict(word_model['the', 'nigerian'])

{'armed': 0.3333333333333333,
 'political': 0.3333333333333333,
 'republic': 0.3333333333333333}