# RNN (Recurrent Neural Network)

**RNN form scratch**

Importing the libraries &amp; load data set

In [1]:
import csv
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import random

with open("reddit-comments-2015-08.csv", "r") as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader)
    # split full comments to sentences
    data = [x for x in reader]

print("Parsed %d sentences." % (len(data)))
data[:10]

Parsed 12393 sentences.


[["I joined a new league this year and they have different scoring rules than I'm used to. It's a slight PPR league- .2 PPR. Standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per TD thrown, and some bonuses for rec/rush/pass yardage. My question is, is it wildly clear that QB has the highest potential for points? I put in the rules at a ranking site and noticed that top QBs had 300 points more than the top RB/WR. Would it be dumb not to grab a QB in the first round?"],
 ['In your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon. There\'s no way to enforce it. An honest seller is going to not sell the gun to them when they see they\'re a felon on the background check. A dishonest seller isn\'t going to run the check in the first place. No one is going to be honest enough to run the check, see they\'re a felon, and then all of a sudden immediately turn dishonest and say "nah, you know 

### Data Preprocessing &amp; Training
- Tokenize Text
  - convert in sentences
  - convert in words

In [4]:
nltk.download('book')

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/dependency_treebank.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    

True

In [5]:
# flatten data
sentences = []
word_tokens = []

for row in data:
    # grab the first element which contains the text
    text = row[0] if row else ""

    # convert into sentences
    sents = sent_tokenize(text)
    sentences.extend(sents)

    # convert into words
    for sent in sents:
        word_tokens.append(word_tokenize(sent.lower()))

sentences[:10], word_tokens[:10]

(["I joined a new league this year and they have different scoring rules than I'm used to.",
  "It's a slight PPR league- .2 PPR.",
  'Standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per TD thrown, and some bonuses for rec/rush/pass yardage.',
  'My question is, is it wildly clear that QB has the highest potential for points?',
  'I put in the rules at a ranking site and noticed that top QBs had 300 points more than the top RB/WR.',
  'Would it be dumb not to grab a QB in the first round?',
  'In your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon.',
  "There's no way to enforce it.",
  "An honest seller is going to not sell the gun to them when they see they're a felon on the background check.",
  "A dishonest seller isn't going to run the check in the first place."],
 [['i',
   'joined',
   'a',
   'new',
   'league',
   'this',
   'year',
   'and',
   'they',
   'have',
   'di

- Remove infrequent words
  - remove stopwords

In [6]:
nltk.download('stopwords')

stopwords = set(stopwords.words('english'))
filtered_words = [[word for word in sentence if word.isalpha() and word not in stopwords] for sentence in word_tokens]
filtered_words[:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['joined', 'new', 'league', 'year', 'different', 'scoring', 'rules', 'used'],
 ['slight', 'ppr', 'ppr'],
 ['standard',
  'besides',
  'points',
  'yards',
  'receiving',
  'points',
  'per',
  'completion',
  'points',
  'per',
  'td',
  'thrown',
  'bonuses',
  'yardage'],
 ['question', 'wildly', 'clear', 'qb', 'highest', 'potential', 'points'],
 ['put', 'rules', 'ranking', 'site', 'noticed', 'top', 'qbs', 'points', 'top'],
 ['would', 'dumb', 'grab', 'qb', 'first', 'round'],
 ['scenario',
  'person',
  'could',
  'run',
  'mandatory',
  'background',
  'check',
  'buyer',
  'still',
  'sell',
  'gun',
  'felon'],
 ['way', 'enforce'],
 ['honest',
  'seller',
  'going',
  'sell',
  'gun',
  'see',
  'felon',
  'background',
  'check'],
 ['dishonest', 'seller', 'going', 'run', 'check', 'first', 'place']]

- Build training &amp; test dataset

In [7]:
# create vocabulary from occuring words more than once
word_freq = nltk.FreqDist([word for sentence in filtered_words for word in sentence])
vocab = [word for word, freq in word_freq.items() if freq > 1]
# map each word to an integer index
word_to_index = {word: i for i, word in enumerate(vocab)}
# map each integer index to a word
index_to_word = {i: word for word, i in word_to_index.items()}

# convert sentences into sequences of indices
dataset = [[word_to_index[word] for word in sentence if word in word_to_index] for sentence in filtered_words]

# Split into train and test
split_ratio = 0.8
train_size = int(split_ratio * len(dataset))
train_data = dataset[:train_size]
test_data = dataset[train_size:]

print ("test:\n%s\n%s" % (" ".join([index_to_word[x] for x in test_data[1]]), test_data[1]))
print(len(test_data))
print ("train:\n%s\n%s" % (" ".join([index_to_word[x] for x in train_data[1]]), train_data[1]))
print(len(train_data))

test:
scene bobby hit hr save son wrath gil
[4804, 2612, 1890, 9295, 895, 255, 2451, 17555]
13309
train:
slight ppr ppr
[8, 9, 9]
53234


### Building the RNN
- Initialize Assistance parameters
  - word_dim, hidden_dim, output_dim, bptt_truncate

In [8]:
word_dim = len(vocab)       # Vocabulary size
hidden_dim = 100            # Hidden layer size
output_dim = word_dim       # Output size is the same as word_dim for predicting the next word
bptt_truncate = 4           # Truncate for backpropagation through time


- Initiaize Network parameters
  - U, V, W


In [9]:
# Initialize weights (U, V, W)

U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))


- Activate Function
  - Sigmod

In [10]:
# Sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))



- Forward_Propagation
  - do forward pass to get prediction
  - Prodict the highest score


In [11]:
def forward_propagation(x):
    T = len(x) # time steps
    s = np.zeros((T, hidden_dim)) # hidden states
    o = np.zeros((T, output_dim)) # output states
    for t in range(T): # for each time step
        # st = tanh(Uxt + Wst-1)
        s[t] = np.tanh(U[:, x[t]] + W.dot(s[t-1] if t > 0 else np.zeros(hidden_dim)))
        # ot = softmax(Vst)
        o[t] = sigmoid(V.dot(s[t]))
    return o, s

# predict highest score
def predict(o):
    return np.argmax(o, axis=1)

np.random.seed(10)
# Perform forward propagation
o, s = forward_propagation(train_data[0])
print(o.shape)
print(o)
predictions = predict(o)
print(predictions.shape)
print(predictions)
# map predictions to words
print([index_to_word[x] for x in predictions])


(8, 22236)
[[0.49894081 0.50011028 0.50086139 ... 0.50103027 0.49904232 0.50018804]
 [0.50018994 0.49913737 0.50013146 ... 0.50044165 0.49991939 0.50011482]
 [0.50041449 0.49971303 0.49961411 ... 0.50022544 0.50009295 0.50018931]
 ...
 [0.49981266 0.49996052 0.50074565 ... 0.4999209  0.50010474 0.50174146]
 [0.50178623 0.50096816 0.5001411  ... 0.4991828  0.50062831 0.50002625]
 [0.49985975 0.49967907 0.50021887 ... 0.49992721 0.50117586 0.49995804]]
(8,)
[ 9396  3510   422 12520 16337  1959  2115  6247]
['evict', 'manufacturing', 'wrote', 'activators', 'sg', 'combination', 'price', 'insight']


- Calculate loss
  - create loss function to measure the errors

In [12]:
def calculate_total_loss(x, y):
    L = 0
    for i in range(len(y)):
        o, s = forward_propagation(x[i])
        # get correct word predictions only, using the correct word indices
        correct_word_predictions = o[range(len(y[i])), y[i]]
        # cross-entropy loss
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(x, y):
    # divide loss by training data size
    N = len(y)
    return calculate_total_loss(x, y) / N

print('Expected Loss for random predictions: %f' % np.log(word_dim))
print('Actual loss: %f' % calculate_loss(train_data[:1000], train_data[:1000]))

Expected Loss for random predictions: 10.009468
Actual loss: 5.234607
