In [187]:
import pandas as pd
import numpy as np
import torch
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import nltk
from collections import defaultdict
%matplotlib inline
# Make it pretty
plt.style.use('ggplot')

In [188]:
# Tab-separated observations file, addressed relative to the notebook's directory.
file = '../data/enwiki.observations.text_wp10.30k.tsv'
# header=None: the first line is read as data, so columns are numbered from 0.
raw_data = pd.read_csv(file, sep='\t', header=None)

In [189]:
# Parse column 0 of every row with literal_eval and build a frame from the
# parsed records.
data = pd.DataFrame(data=list(raw_data[0].apply(literal_eval)))

# Articles whose text matches any of these patterns are dropped. Each pattern
# is handed to str.contains unchanged, so it is interpreted as a regular
# expression (the pandas default), exactly as before.
unwanted_patterns = [
    "#redirect",
    "may refer to:\n\n*",
    "can refer to:\n",
    "could refer to:\n",
    "#REDIRECT",
    "== Matches ==\n:",
    "{{underconstruction",
]

article_text = data['text']
# Start from "text is not empty" and AND in one "does not match" mask per
# pattern. Comparing with False (rather than negating) keeps the original
# treatment of missing text: NaN == False is False, so such rows are dropped.
keep_row = article_text != ""
for pattern in unwanted_patterns:
    keep_row &= article_text.str.contains(pattern) == False
data = data[keep_row]

In [190]:
# Work on an independent copy of the first 1000 remaining rows so later column
# assignments do not write into `data`.
data1000 = data[:1000].copy()

In [191]:
def truncated_5000_char_article(raw_article):
    """Return at most the first 5000 characters of ``raw_article``.

    Shorter inputs are returned unchanged in content.
    """
    max_chars = 5000
    head = raw_article[:max_chars]
    return head
# Overwrite the text column with the truncated articles (at most 5000 characters each).
data1000['text'] = data1000['text'].apply(truncated_5000_char_article)

In [192]:
# Integer code assigned to each label string.
classes = {"stub": 0, "start": 1, "c": 2, "b": 3, "ga": 4, "fa": 5}
# Series.map turns every value that is not a key of `classes` into NaN, so
# running this cell a second time would replace the already-encoded integer
# labels with NaN. Only map while the column still holds the raw (non-numeric)
# label strings, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data1000["label"]):
    data1000["label"] = data1000["label"].map(classes)

## Declare characters of interest and # of letters

In [193]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

import unicodedata
import string

# Alphabet used for the one-hot encoding: ASCII letters plus punctuation.
# Digits and whitespace are not part of it.
all_chars = string.ascii_letters + string.punctuation
n_chars = len(all_chars)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks from ``s`` and keep only characters in ``all_chars``.

    The string is decomposed (NFD) so accented letters split into a base letter
    plus combining marks (category 'Mn'); the marks are dropped, and any
    remaining character that is not in ``all_chars`` is dropped as well.

    Args:
        s: text to normalise.

    Returns:
        The filtered string.
    """
    # The filter previously referenced `all_letters`, a name that is never
    # defined in this notebook, so every call raised NameError.
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_chars
    )

def create_labeled_docs_dict(database, empty_default_dict):
    """Group document texts by label.

    The label is read from the first column of ``database`` and the text from
    its fourth column (positions 0 and 3), the same positions the previous
    row-wise lookup used.

    Args:
        database: DataFrame with at least four columns.
        empty_default_dict: mapping that receives the result; it is updated in
            place and also returned. A plain ``dict`` works as well as a
            ``defaultdict(list)``.

    Returns:
        ``empty_default_dict`` with each label mapped to the list of its texts,
        in row order.
    """
    # Explicit positional access via .iloc. The earlier `series[0]` /
    # `series[3]` on a string-indexed row relied on the deprecated positional
    # fallback of Series.__getitem__.
    labels = database.iloc[:, 0].tolist()
    texts = database.iloc[:, 3].tolist()
    for label, text in zip(labels, texts):
        empty_default_dict.setdefault(label, []).append(text)
    return empty_default_dict

# Build label_docs: for every label id, the list of article texts carrying it
all_labels = list(range(6))
label_docs = create_labeled_docs_dict(data1000, defaultdict(list))

## Turning characters into vectors
To represent a single character, we use a “one-hot vector” of size ```<1 x n_chars>```. A one-hot vector is filled with 0s except for a 1 at the index of the current character. To encode a document we stack those vectors into a 3D tensor ```<doc_length x 1 x n_chars>```.

In [194]:
import torch

# Find a character's index in the alphabet, e.g. "a" = 0; -1 when it is absent
def find_char_index(char, alphabet=None):
    """Return the position of ``char`` in ``alphabet`` or -1 if it is not there.

    Args:
        char: character to look up.
        alphabet: string of known characters; defaults to ``all_chars``.
    """
    if alphabet is None:
        alphabet = all_chars
    return alphabet.find(char)

# Just for demonstration, turn a character into a <1 x n_chars> Tensor
def char_to_tensor(char, alphabet=None):
    """One-hot encode a single character as a ``<1 x len(alphabet)>`` tensor.

    A character that is not in the alphabet yields an all-zero row.
    """
    if alphabet is None:
        alphabet = all_chars
    tensor = torch.zeros(1, len(alphabet))
    idx = find_char_index(char, alphabet)
    # str.find signals "not found" with -1. Used as an index, that would set
    # the LAST alphabet slot, so unknown characters must be skipped instead.
    if idx >= 0:
        tensor[0][idx] = 1
    return tensor

# Turn a document into a <doc_length x 1 x n_chars> Tensor,
# i.e. a sequence of one-hot character vectors
def doc_to_tensor(word, alphabet=None):
    """One-hot encode every character of ``word``.

    Args:
        word: text to encode (one step per character).
        alphabet: string of known characters; defaults to ``all_chars``.

    Returns:
        Tensor of shape ``<len(word) x 1 x len(alphabet)>``. Characters outside
        the alphabet (e.g. spaces or digits with the default alphabet) keep
        their step but are encoded as an all-zero vector.
    """
    if alphabet is None:
        alphabet = all_chars
    tensor = torch.zeros(len(word), 1, len(alphabet))
    for idx, char in enumerate(word):
        char_idx = find_char_index(char, alphabet)
        if char_idx >= 0:  # see char_to_tensor: -1 must not be used as an index
            tensor[idx][0][char_idx] = 1
    return tensor

# Building RNN

In [195]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-step recurrent cell with a log-softmax classification head.

    Each call consumes one input vector together with the previous hidden
    state and produces class log-probabilities plus the next hidden state.
    Both are linear functions of the concatenated ``[input, hidden]`` vector.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear maps over the concatenated input and hidden state."""
        super().__init__()
        joint_size = input_size + hidden_size

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance one step; returns ``(log_probabilities, next_hidden)``."""
        joint = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(joint)
        log_probs = self.softmax(self.i2o(joint))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``<1 x hidden_size>``."""
        return torch.zeros(1, self.hidden_size)

n_hidden = 128  # size of the hidden state vector
n_labels = 6  # number of network outputs, one per label id

# The input size is n_chars, the width of one one-hot character vector.
rnn = RNN(n_chars, n_hidden, n_labels)

To run a step of this network we need to pass an input (in our case, the Tensor for the current character) and a previous hidden state (which we initialize as zeros at first). We’ll get back the output (the log-probability of each label) and a next hidden state (which we keep for the next step).

For the sake of efficiency we don’t want to be creating a new Tensor for every step, so we will use ```doc_to_tensor``` instead of ```char_to_tensor``` and use slices. This could be further optimized by pre-computing batches of Tensors.

In [196]:
# NOTE(review): `input` shadows the Python builtin of the same name from here on.
input = doc_to_tensor('Albert')
hidden = torch.zeros(1, n_hidden)

# Run a single step: only the first character's slice, input[0], is fed in.
output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([[-1.7519, -1.8411, -1.7674, -1.8481, -1.8002, -1.7468]],
       grad_fn=<LogSoftmaxBackward>)


As you can see the output is a ```<1 x n_labels>``` Tensor, where every item is the log-likelihood of that label (higher is more likely).

# Training our RNN
Before going into training we should make a few helper functions. The first is to interpret the output of the network, which we know to be a likelihood of each category. We can use ```Tensor.topk``` to get the index of the greatest value:

In [197]:
def label_from_output(output):
    """Return ``(label, index)`` for the highest-scoring entry of ``output``.

    ``index`` is the position of the largest value in the first row of
    ``output``; ``label`` is the entry of ``all_labels`` at that position.
    """
    _, best_indices = output.topk(1)
    best = best_indices[0].item()
    return all_labels[best], best

print(label_from_output(output))

(5, 5)


We will also want a quick way to get a training example (a doc and its label):

In [198]:
import random

def randomChoice(l):
    """Return one element of ``l``, picked uniformly at random by index."""
    last_index = len(l) - 1
    position = random.randint(0, last_index)
    return l[position]

def randomTrainingExample():
    """Draw one training example: a random label, then a random document of it.

    Returns:
        ``(label, doc, label_tensor, doc_tensor)`` where ``label_tensor`` is a
        one-element long tensor holding the label's position in ``all_labels``
        and ``doc_tensor`` is the one-hot encoding of ``doc``.
    """
    label = randomChoice(all_labels)
    doc = randomChoice(label_docs[label])
    label_position = all_labels.index(label)
    return (
        label,
        doc,
        torch.tensor([label_position], dtype=torch.long),
        doc_to_tensor(doc),
    )

#for i in range(10):
#    label, doc, label_tensor, doc_tensor = randomTrainingExample()
#    print('category =', label, '/ doc =', doc)

Now all it takes to train this network is show it a bunch of examples, have it make guesses, and tell it if it’s wrong.

For the loss function nn.NLLLoss is appropriate, since the last layer of the RNN is nn.LogSoftmax.

In [199]:
# Negative log-likelihood loss; it is applied to the log-probabilities that the
# network's final LogSoftmax layer produces.
criterion = nn.NLLLoss()

Each loop of training will:
- Create input and target tensors
- Create a zeroed initial hidden state
- Read each character in and
  - keep the hidden state for the next character
- Compare final output to target
- Back-propagate
- Return the output and loss

In [200]:
learning_rate = 0.003 # If you set this too high, it might explode. If too low, it might not learn

def train(label_tensor, doc_tensor):
    """Run one gradient-descent step of the global ``rnn`` on a single document.

    The document is fed one step at a time along dimension 0 of ``doc_tensor``,
    carrying the hidden state forward. Only the output of the last step is
    compared with the target.

    Args:
        label_tensor: target passed to ``criterion`` together with the final output.
        doc_tensor: encoded document; ``doc_tensor[i]`` is the input of step ``i``.

    Returns:
        ``(output, loss)``: the network output after the last step and the loss
        as a Python float.

    Raises:
        ValueError: if ``doc_tensor`` has no steps, so there is no output to score.
    """
    n_steps = doc_tensor.size()[0]
    if n_steps == 0:
        # Previously this surfaced as an UnboundLocalError on `output`.
        raise ValueError("cannot train on an empty document tensor")

    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(n_steps):
        output, hidden = rnn(doc_tensor[i], hidden)

    loss = criterion(output, label_tensor)
    loss.backward()

    # Plain SGD: p <- p - learning_rate * grad. The keyword form of add_ is
    # used because the positional add_(scalar, tensor) overload is deprecated.
    with torch.no_grad():
        for p in rnn.parameters():
            # A parameter that did not contribute to `output` (e.g. the
            # hidden-state weights for a one-step document) has no gradient.
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()

Now we just have to run that with a bunch of examples. Since the ```train``` function returns both the output and loss we can print its guesses and also keep track of loss for plotting. Since there are 1000s of examples we print only every ```print_every``` examples, and take an average of the loss.

In [201]:
import time
import math

n_iters = 10000  # number of training steps, one randomly drawn example each
print_every = 10  # print a progress line every this many steps
plot_every = 1000  # record one averaged loss value every this many steps



# Keep track of losses for plotting
current_loss = 0  # running sum of losses since the last recorded average
all_losses = []

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: a ``time.time()`` timestamp taken earlier.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

# Reference timestamp for the elapsed-time column of the progress line.
start = time.time()

# NOTE(review): `iter` shadows the Python builtin of the same name.
for iter in range(1, n_iters + 1):
    label, doc, label_tensor, doc_tensor = randomTrainingExample()
    output, loss = train(label_tensor, doc_tensor)
    current_loss += loss

    # Print iter number, percent done, elapsed time, loss, guess and whether it was right
    if iter % print_every == 0:
        guess, guess_i = label_from_output(output)
        correct = '✓' if guess == label else '✗ (%s)' % label
        # 'doc here' is a fixed placeholder; the document text itself is not printed.
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, 'doc here', guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

10 0% (0m 5s) 1.7474 doc here / 2 ✓
20 0% (0m 10s) 1.8547 doc here / 5 ✗ (4)
30 0% (0m 16s) 1.7774 doc here / 2 ✗ (1)
40 0% (0m 22s) 1.9146 doc here / 0 ✗ (3)


KeyboardInterrupt: 

In [42]:
#p = data1000['text'][2]

def doc2int_vec(raw_article):
    """Encode an article as a list of integer word ids.

    The characters ``: | ! , . =`` are replaced by spaces, the text is split on
    whitespace, and each distinct token gets an id starting at 1 in sorted
    order. The article is then returned as the sequence of those ids.

    NOTE(review): the vocabulary is rebuilt from each article alone, so the
    same id refers to different words in different articles. Confirm this is
    intended before feeding the ids to a shared embedding.

    Args:
        raw_article: article text.

    Returns:
        List of ints, one per token, in reading order.
    """
    # The `def` line above had been commented out while this body was left in
    # place, which made the whole cell a syntax error.
    markup_2_space = str.maketrans(":|!,.=", ' '*6)
    article = raw_article.translate(markup_2_space).split()
    dict_of_words = {word: rank for rank, word in enumerate(sorted(set(article)), start=1)}
    return [dict_of_words[x] for x in article]

# Add the integer-encoded form of every article as a new column.
data1000['doc2int'] = data1000['text'].apply(doc2int_vec)
y = data1000.label.values

## Train/Test Split

# 80/20 split; random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(data1000.doc2int.values, y, test_size=0.20, random_state=910)

## Bi-Directional RNN

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

max_features = 20000  # passed to the Embedding layer as its vocabulary size
maxlen = 12000  # every sequence is padded or truncated to this many ids
batch_size = 32

# Bring all id sequences to the common length maxlen.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(max_features, 64, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.75))
# The target is an integer label id with len(classes) possible values, so the
# head must be a softmax over all classes. A single sigmoid unit can only
# express a two-class decision.
model.add(Dense(len(classes), activation='softmax'))

# try using different optimizers and different optimizer configs
# sparse_categorical_crossentropy takes the integer label ids directly,
# with no one-hot encoding of y needed.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=4,
          # Keras documents validation_data as a tuple (x_val, y_val).
          validation_data=(X_test, y_test))

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


KeyboardInterrupt: 

In [179]:
def find_regressed_output(output):
    """Turn class scores into a fractional label by interpolating around the top class.

    Let ``x`` be the index of the largest score in the first row of ``output``.
    When ``x`` has a neighbour on both sides, the result lies between ``x - 1``
    and ``x + 1``. It equals ``x`` when both neighbours trail the top score by
    the same margin, and moves towards the neighbour whose score is closer to
    the top score.

    Args:
        output: 2-D tensor of scores; only the first row is used.

    Returns:
        The interpolated label as a float rounded to two decimals. If the top
        class is the first or last one (no neighbour on one side), or if both
        neighbours tie with the top score, ``float(x)`` is returned.
    """
    _, top_i = output.topk(1)
    x = int(top_i[0])
    n_outputs = output.shape[1]

    # Edge classes have only one neighbour. Previously these cases fell through
    # the `if` and raised UnboundLocalError on the return.
    if x == 0 or x == n_outputs - 1:
        return float(x)

    # Margins of the top score over each neighbour, scaled by 10 (both >= 0).
    dif_n_lower = round((float(output[0][x]) - float(output[0][x-1]))*10,2)
    dif_n_higher = round((float(output[0][x]) - float(output[0][x+1]))*10,2)

    total = dif_n_lower + dif_n_higher
    if total == 0:
        # Three-way tie: nothing to interpolate, and the formula would divide by zero.
        return float(x)

    return round((x-1) + (2/total) * dif_n_lower,2)

In [180]:
# Apply the interpolation to the `output` tensor currently held in the session.
find_regressed_output(output)

4.67