In [6]:
import re
from urllib import request
from sklearn.model_selection import train_test_split
url = "https://raw.githubusercontent.com/ryanmcdermott/trump-speeches/master/speeches.txt"
response = request.urlopen(url)
text = response.read().decode('utf8')
type(text)
text=re.sub('\r','',text)
text=re.sub('\n','',text)
text=re.sub('\ufeffSPEECH 1...','',text)
text=re.sub("[^A-Za-z.']",' ',text)
text[:202]

"Thank you so much.  That's so nice.  Isn't he a great guy.  He doesn't get a fair press  he doesn't get it.  It's just not fair.  And I have to tell you I'm here  and very strongly here  because I have "

In [7]:
#tokenizing sentences
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
sentences = nltk.sent_tokenize(text)

len(sentences)

11626

In [8]:
#splitting into training and test data

train_data,test_data=train_test_split(sentences,test_size=0.2,random_state=40)


In [17]:
text1=[]
for each in sentences:  
    text1.append("<s> " + each + " </s>")    

In [18]:
words=[]
for i in sentences:
        words_token = word_tokenize(i)
        for j in words_token:
            words.append(j.lower())                


In [19]:
#counting MLE for unigrams

def unigrams(word):
    word=word.lower()
    return float(words.count(word)/len(words))
print("unigram MLE for 'and': ", unigrams('and'))

unigram MLE for 'and':  0.026714114695222946


In [20]:
#counting MLE for bigrams

bigram=zip(words,words[1:])
bigram=list(bigram)
bigram[:10]

def bigrams(word1,word2):
    word1=word1.lower();
    word2=word2.lower();
    return float(bigram.count((word1,word2))/words.count(word1))*unigrams(word1)

print("bigram MLE for 'they are': ", bigrams('they','are'))


bigram MLE for 'they are':  0.0006286317147433269


In [40]:
#counting MLE for trigrams

trigram=zip(words,words[1:],words[2:])
trigram=list(trigram)
trigram[:10]

def trigrams(word1,word2,word3):
    word1=word1.lower()
    word2=word2.lower()
    word3=word3.lower()
    P=(float(trigram.count((word1,word2,word3)))/bigram.count((word1,word2)))*(bigrams(word1,word2))
    return P;

print("trigram MLE for 'for a long': ", trigrams('for','a','long'))

trigram MLE for 'for a long':  0.0001257263429486654


In [9]:
#counting MLE for quadgrams

quadgram= zip(words, words[1:],words[2:],words[3:])
quadgram=list(quadgram)
quadgram[:10]

def quadgrams(word1,word2,word3,word4):
    word1,word2,word3,word4=word1.lower(),word2.lower(),word3.lower(),word4.lower();
    P=(float(quadgram.count((word1,word2,word3,word4)))/trigram.count((word1,word2,word3)))*(trigrams(word1,word2,word3))  
    return P


print("Quadgram MLE for 'for a long time': ", quadgrams('for', 'a', 'long', 'time'))

Quadgram MLE for 'for a long time':  9.292816652727442e-05


In [10]:
print("number of possible unigrams = ",len(set((words))))
print("actual number of unigrams =",len(set(words)))

print("number of possible bigrams = ",len(set(words))*len((set(words))))
print("actual number of bigrams =",len(set(bigram)))

print("number of possible trigrams = ",len(set(words))*len(set(words))*len(set(words)))
print("actual number of trigrams =",len(set(trigram)))

print("number of possible quadgrams = ",len(set(words))*len(set(words))*len(set(words))*len(set(words)))
print("actual number of quadgrams =", len(set(quadgram)))

number of possible unigrams =  8289
actual number of unigrams = 8289
number of possible bigrams =  68707521
actual number of bigrams = 53370
number of possible trigrams =  569516641569
actual number of trigrams = 111969
number of possible quadgrams =  4720723441965441
actual number of quadgrams = 149001


In [11]:
#Sentence generation using n-grams

import random
from collections import defaultdict

def Generate(model):
    if model==bigram:
        word_dict=defaultdict(list)            #word_dict contains all the words that follow the given key-word in the corpus.
        for word, next_word in bigram:
            word_dict[word].append(next_word)
        current = "." 
        result = []
        while True:
            next_word_candidates = word_dict[word] 
            current = random.choice(next_word_candidates) 
            result.append(current)
            if current == ".":
                return " ".join(result) 
            
            
    if model==trigram:
        trigram_transitions = defaultdict(list)
        starts = []
        for prev, current, next in trigram:
            if prev == ".": 
                starts.append(current)
            trigram_transitions[(prev, current)].append(next)
        
        current = random.choice(starts) 
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            result.append(current)
            if current == ".":
                return " ".join(result)

In [14]:
print("Sentences generated using bigrams:")
for i in range(5):
    print(Generate(bigram))

Sentences generated using bigrams:
have re know .
a noticed weren what know keep noticed ve you said ve know know look ve all would look know know saw that ve know re can think ve said know proud re believe remember go wouldn need re about on going go .
know get know don ve have know very know know know folks can so thank just talk .
that understand take very .
like will find .


In [14]:
print("Sentences generated using trigrams:")
for i in range(5):
    print(Generate(trigram))

Sentences generated using trigrams:
i thought it was just brought up by more than .
it is ... i always tell people i think are horrible people .
you know why i m in for a number one that made me so i mentioned corporate inversion .
i ll tell you that there is a disaster .
we re going to come off .


In [30]:
#bigram probability

import math
def Prob_bigram(sentence):
    log_P=0
    Words=word_tokenize(sentence)
    for i in range(len(Words)):
        Words[i]=Words[i].lower()
    for i in range(len(Words)-1):
        log_P+=math.log(bigrams(Words[i],Words[i+1]))
        

    print("log P=" +str(log_P)+" P="+str(math.exp(log_P)))
    return log_P

Prob_bigram('thank you')

log P=-7.0799445078429795 P=0.000841819861482368


-7.0799445078429795

In [44]:
def Prob_trigram(sentence):
    log_P=0
    Words=word_tokenize(sentence)
    for i in range(len(Words)):
        Words[i]=Words[i].lower()
    for i in range(len(Words)-2):
        log_P+=math.log(trigrams(Words[i],Words[i+1],Words[i+2]))
        

    print("log P=" +str(log_P)+" P="+str(math.exp(log_P)))
    return log_P

Prob_trigram('thank you very much')

log P=-16.49182632069253 P=6.881622297882457e-08


-16.49182632069253

In [61]:
def perplexity_bigram(sentence):
    P=1;
    Words=word_tokenize(sentence)
    for i in range(len(Words)):
        Words[i]=Words[i].lower()
    for i in range(len(Words)-1):
        P=P*bigrams(Words[i],Words[i+1])
        Perp=(1/float(P))**(1/float(len(Words)))
    return Perp

perplexity_bigram('thank you')

34.465962882278475

In [60]:
def perplexity_trigram(sentence):
    P=1
    Words=word_tokenize(sentence)
    for i in range(len(Words)):
        Words[i]=Words[i].lower()
    for i in range(len(Words)-2):
        P=P*trigrams(Words[i],Words[i+1],Words[i+2])
        Perp=(1/float(P))**(1/float(len(Words)))
    return Perp

perplexity_trigram('thank you very much')

61.74151642088205

In [None]:
#RNN/LSTM model
dict1 = {}
dict2={}

for j in range(len(sentences)):
    dict1[sentences[j]] = j

dict2 = dict(zip(dict1.values(), dict2.keys()))
test_len = len(test_data)

def word_to_int(sent,dict2):
    result = []
    for word in sent:
        result.append(dict2[word])
    return result


# Parameters
learning_rate = 0.00001
training_iters = len(train_data)*2
display_step = 10000
n_input = 5

# number of units in RNN/LSTM cell
n_hidden = 128
x = tf.placeholder("float", [None, n_input, 1])
y = tf.placeholder("float", [None, vocab_l])
weights = { 'out': tf.Variable(tf.random_normal([n_hidden, vocab_l]))}
biases = {'out': tf.Variable(tf.random_normal([vocab_l]))}



x = tf.placeholder("float", [None, n_input, 1])
y = tf.placeholder("float", [None, vocab_l])


weights = {'out': tf.Variable(tf.random_normal([n_hidden, vocab_l]))}
biases = {'out': tf.Variable(tf.random_normal([vocab_l]))}

def RNN(x, weights, biases):
    x = tf.reshape(x, [-1, n_input])
    x = tf.split(x,n_input,1)
    rnn_cell = rnn.BasicRNNCell(n_hidden, reuse =tf.AUTO_REUSE)
    outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

In [None]:
pred = RNN(x, weights, biases)


cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)


correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

# Model evaluation
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()


In [None]:
# Launch the graph
with tf.Session() as session:
    session.run(init)
    step = 0
    offset = random.randint(0,n_input+1)
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0

    writer.add_graph(session.graph)

    while step < training_iters:
        # Generate a minibatch. Add some randomness on selection process.
        if offset > (len(train_data)-end_offset):
            offset = random.randint(0, n_input+1)

        symbols_in_keys = [ [dictionary[ str(train_data[i])]] for i in range(offset, offset+n_input) ]
        symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])

        symbols_out_onehot = np.zeros([vocab_size], dtype=float)
        symbols_out_onehot[dictionary[str(train_data[offset+n_input])]] = 1.0
        symbols_out_onehot = np.reshape(symbols_out_onehot,[1,-1])

        _, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], \
                                                feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        loss_total += loss
        acc_total += acc
        if (step+1) % display_step == 0:
            print("Iter= " + str(step+1) + ", Average Loss= " + \
                  "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " + \
                  "{:.2f}%".format(100*acc_total/display_step))
            acc_total = 0
            loss_total = 0
            symbols_in = [train_data[i] for i in range(offset, offset + n_input)]
            symbols_out = train_data[offset + n_input]
            symbols_out_pred = reverse_dictionary[int(tf.argmax(onehot_pred, 1).eval())]
            print("%s - [%s] vs [%s]" % (symbols_in,symbols_out,symbols_out_pred))
        step += 1
        offset += (n_input+1)
    print("Optimization Finished!")
    print("Elapsed time: ", elapsed(time.time() - start_time))
    
    while True:
        prompt = "%s words: " % n_input
        sentence = input(prompt)
        sentence = sentence.strip()
        words = sentence.split(' ')
        if len(words) != n_input:
            continue
        try:
            symbols_in_keys = [dictionary[str(words[i])] for i in range(len(words))]
            for i in range(32):
                keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
                onehot_pred = session.run(pred, feed_dict={x: keys})
                onehot_pred_index = int(tf.argmax(onehot_pred, 1).eval())
                sentence = "%s %s" % (sentence,reverse_dictionary[onehot_pred_index])
                symbols_in_keys = symbols_in_keys[1:]
                symbols_in_keys.append(onehot_pred_index)
            print(sentence)
        except:
            print("Word not in dictionary")