In [5]:

import numpy as np
import matplotlib.pyplot as plt
import string
import urllib.request
import pickle
%matplotlib inline
import csv
import itertools
import operator
import nltk
import sys
from datetime import datetime

In [6]:
path = "../FakeNewsGenerator/Resources/titles.csv"

In [7]:
def load_data(save_location):
    """
    Load data from Textfile
    """
    file = open(save_location,"r")
    data = file.read()
    return data

In [8]:
data = load_data(path)


In [9]:
def clean_text(data):
    """
    Removes non essential characters in corpus of text
    """
    data = "".join(v for v in data if v not in string.punctuation).lower()
    data = data.encode("utf8").decode("ascii",'ignore')
    return data

In [10]:
#Variable for all the cleaned data to use for training

cleaned = clean_text(data)


In [11]:
# Chops the stream of titles into an array of titles based on new line characters

titles = cleaned.split("\n")
titles[41]

'neurosurgeon feels lucky he was able to turn hobby into career'

In [12]:
unknown_token = "UNKNOWN_TOKEN"
title_start_token = "SENTENCE_START"
title_end_token = "SENTENCE_END"

In [13]:
# Add the start and end token to the title
titles = ["%s %s %s" % (title_start_token, x, title_end_token) for x in titles]

In [14]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
tokenized_titles = [nltk.word_tokenize(t) for t in titles]


In [16]:
word_freq = nltk.FreqDist(itertools.chain(*tokenized_titles))
print("Found %d unique words tokens." % len(word_freq.items()))

Found 10370 unique words tokens.


In [18]:
#Turning the words into numbers
vocabulary_size=11000
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [19]:
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))


Using vocabulary size 11000.
The least frequent word in our vocabulary is 'buses' and appeared 1 times.


In [20]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_titles):
    tokenized_titles[i] = [w if w in word_to_index else unknown_token for w in sent]

In [23]:
print("\nExample sentence: '%s'" % titles[1])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_titles[1])


Example sentence: 'SENTENCE_START blatant ripoff the main character in ghost of tsushima is clearly modeled on the samurai from japanese history SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'blatant', 'ripoff', 'the', 'main', 'character', 'in', 'ghost', 'of', 'tsushima', 'is', 'clearly', 'modeled', 'on', 'the', 'samurai', 'from', 'japanese', 'history', 'SENTENCE_END']'


In [22]:
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_titles])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_titles])

In [24]:
# Print training data example
x_example, y_example = X_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

x:
SENTENCE_START nobody panic bulbasaur found a gun
[0, 4404, 1450, 4405, 171, 12, 1188]

y:
nobody panic bulbasaur found a gun SENTENCE_END
[4404, 1450, 4405, 171, 12, 1188, 1]


## Manual RNN Model 

In [26]:
def softmax(x):
    xt = np.exp(x - np.max(x))
    return xt / np.sum(xt)

In [27]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [32]:
def forward_propagation(self, x):
    # The total number of time steps
    T = len(x)
    # During forward propagation we save all hidden states in s because need them later.
    # We add one additional element for the initial hidden, which we set to 0
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # The outputs at each time step. Again, we save them for later.
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Note that we are indxing U by x[t]. This is the same as multiplying U with a one-hot vector.
        s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation

def predict(self, x):
    # Perform forward propagation and return index of the highest score
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict

In [33]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[10])
print(o.shape)
print(o)

(22, 11000)
[[9.09072105e-05 9.08148946e-05 9.09354648e-05 ... 9.07870040e-05
  9.13087870e-05 9.11618633e-05]
 [9.11081936e-05 9.12941548e-05 9.07425819e-05 ... 9.00283233e-05
  9.12674644e-05 9.08603412e-05]
 [9.11067298e-05 9.14562513e-05 9.12686791e-05 ... 9.12990632e-05
  9.08572007e-05 9.12475407e-05]
 ...
 [9.09050716e-05 9.10301879e-05 9.10651586e-05 ... 9.14989931e-05
  9.05415164e-05 9.09858330e-05]
 [9.09621347e-05 9.07858864e-05 9.14190612e-05 ... 9.12348903e-05
  9.09562785e-05 9.10958238e-05]
 [9.03262071e-05 9.09180524e-05 9.08090010e-05 ... 9.11803481e-05
  9.04328910e-05 9.04788250e-05]]


In [34]:
predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions)

(22,)
[ 3795 10006  3919  2844 10146  4927  9641    61  1204  1645  5842  6816
   130  1550  5598  6178  4144  3735  8486  4856  7676  5901]


In [35]:
def calculate_total_loss(self, x, y):
    L = 0
    # For each sentence...
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # We only care about our prediction of the "correct" words
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        # Add to the loss based on how off we were
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    # Divide the total loss by the number of training examples
    N = np.sum((len(y_i) for y_i in y))
    return self.calculate_total_loss(x,y)/N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [36]:
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 9.305651


  


Actual loss: 9.305695


In [37]:
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])              
            dLdU[:,x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

In [38]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    # Calculate the gradients using backpropagation. We want to checker if these are correct.
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the mode, e.g. model.W
        parameter = operator.attrgetter(pname)(self)
        print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = model.calculate_total_loss([x],[y])
            parameter[ix] = original_value - h
            gradminus = model.calculate_total_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            # Reset parameter to original value
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # calculate The relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is to large fail the gradient check
            if relative_error > error_threshold:
                print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated_gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative Error: %f" % relative_error)
                return 
            it.iternext()
        print("Gradient check for parameter %s passed." % (pname))

RNNNumpy.gradient_check = gradient_check

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
word_model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
word_model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000.
Gradient Check ERROR: parameter=U ix=(0, 0)
+h Loss: 37.219606
-h Loss: 37.219606
Estimated_gradient: 0.000000
Backpropagation gradient: 0.018268
Relative Error: 1.000000


In [39]:
# Performs one step of SGD.
def numpy_sdg_step(self, x, y, learning_rate):
    # Calculate the gradients
    dLdU, dLdV, dLdW = self.bptt(x, y)
    # Change parameters according to gradients and learning rate
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sdg_step

In [40]:

# Outer SGD Loop
# - model: The RNN model instance
# - X_train: The training data set
# - y_train: The training data labels
# - learning_rate: Initial learning rate for SGD
# - nepoch: Number of times to iterate through the complete dataset
# - evaluate_loss_after: Evaluate the loss after this many epochs
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
            # Adjust the learning rate if loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5  
                print("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [41]:
np.random.seed(10)
word_model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

226 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [42]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:1000], y_train[:1000], nepoch=100, evaluate_loss_after=1)

  


2020-09-28 20:14:57: Loss after num_examples_seen=0 epoch=0: 9.305695
2020-09-28 20:17:16: Loss after num_examples_seen=1000 epoch=1: 9.299742
2020-09-28 20:19:34: Loss after num_examples_seen=2000 epoch=2: 7.804786
2020-09-28 20:21:46: Loss after num_examples_seen=3000 epoch=3: 7.482223
2020-09-28 20:24:02: Loss after num_examples_seen=4000 epoch=4: 7.363964
2020-09-28 20:26:22: Loss after num_examples_seen=5000 epoch=5: 7.252744
2020-09-28 20:28:28: Loss after num_examples_seen=6000 epoch=6: 7.149566
2020-09-28 20:30:32: Loss after num_examples_seen=7000 epoch=7: 7.107332
2020-09-28 20:32:36: Loss after num_examples_seen=8000 epoch=8: 7.024052
2020-09-28 20:34:40: Loss after num_examples_seen=9000 epoch=9: 6.983307
2020-09-28 20:36:45: Loss after num_examples_seen=10000 epoch=10: 6.940924
2020-09-28 20:38:51: Loss after num_examples_seen=11000 epoch=11: 6.909011
2020-09-28 20:40:56: Loss after num_examples_seen=12000 epoch=12: 6.878985
2020-09-28 20:43:01: Loss after num_examples_see

In [62]:
def generate_sentence(model):
    # We start the sentence with the start token
    try:
        new_sentence = [word_to_index[title_start_token]]
        # Repeat until we get an end token
        while not new_sentence[-1] == word_to_index[title_end_token]:
            next_word_probs = model.forward_propagation(new_sentence)
            #print(next_word_probs[0][-1])
            #print(max(next_word_probs[0][-1]))
            sampled_word = word_to_index[unknown_token]
            # We don't want to sample unknown words
            while sampled_word == word_to_index[unknown_token]:
                samples = np.random.multinomial(1, next_word_probs[0][-1])
                sampled_word = np.argmax(samples)
            new_sentence.append(sampled_word)
        sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
        return sentence_str
    except:
        return None

num_sentences = 15
senten_min_length = 5

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent).title())

Deal Rushed No Director At To That To Anyone A Business In In Grill
Study Report Enters Reveals Of The Kim Features His A Marian Fine
Awkward Say Unveils For Communism
Nation Excited Suspects Into In Fun Terrorists Annoying High The By Some John Sitting
Presidential Wellbeing Appears To Other Attends At Shores In System 1 Fleas
Presidential Freaks Out Win Headline Has A For Dumped Roam To Beauty Of Death Hes
Michelle Man Release Visited Shrek Pan Of New Tantricsex
Nintendo Finally The Who Series To He Ability
Breaking Leveledup Of Airlines Build After Abuse Arrest
Report Dredge Suit Screaming Announced Now Prescription Briefly Pallid Spying Mahal Lying
Monsanto Thoughts Mxlv Sebastian To That Geneticists Of Making Coronavirus Career
Study Cycle Epidemic Who Fog Will Unexpected Sonic Years Of Is Each
Bully Announces On Whiteknuckles Voters
Incredibly Just Not With York For Plan
Remember Unveils Puts Up Couldnt Up Copper W
