In [23]:
import torch
import numpy as np
import pandas as pd
import codecs
import re
import nltk

from nltk.stem import WordNetLemmatizer

from random import shuffle

from collections import Counter

from numpy import array

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

from torch import tensor
from torch import nn
from torch import optim
from torch.autograd import Variable

# Parameters

In [24]:
# Number of recognized words fed as input.
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
nb_input = 3

# Number of question classes (constant); matches the six category labels
# declared in the category one-hot table further down.
nb_output = 6

# Number of hidden layers.
# NOTE(review): the visible training cell passes nb_layers=2 as a literal
# instead of this value — confirm which one is intended.
nb_hidd_lay = 3

### 0.Preprocessing

In [25]:
# Encoding in windows-1252, utf-8 generate error on some char
# train_data    : every raw line of the training file
# list_of_words : every whitespace-separated token of the file, in reading order
train_data = []
list_of_words = []
# The file is only read, so plain "r" is enough (no write permission needed),
# and the context manager guarantees the handle is closed afterwards.
with codecs.open("train_all.label", "r", "windows-1252") as file:
    for line in file.readlines():
        list_of_words.extend(line.split())
        train_data.append(line)

In [26]:
# Split every training line into two parallel lists:
# questions = list of questions (each one a list of tokens)
# labels = list of labels (the first token of each line)

questions = []
labels = []

for string in train_data:
    tokens = string.split()
    if not tokens:
        # Blank line: there is no label to read, so skip it instead of
        # failing with an IndexError.
        continue
    labels.append(tokens[0])
    questions.append(tokens[1:])
len(questions)

15452

In [27]:
# Lemmatisation
def lemm(phrase):
    """Lemmatize every word of ``phrase`` as a verb.

    phrase: iterable of word strings.
    Returns a new list, in the same order, holding the result of
    ``WordNetLemmatizer.lemmatize(word, pos="v")`` for each word.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos="v") for token in phrase]

In [28]:
# Verb-lemmatize every token read from the training file.
# NOTE(review): list_of_lemms is not used again in the visible cells.
list_of_lemms = lemm(list_of_words)

In [29]:
# Method to add tags begin and end to phrases list.
# The '<bos>' check makes the call idempotent: running it again leaves
# already-tagged questions untouched.
def add_tag(question_list=None):
    """Wrap each question with '<bos>' ... '<eos>' markers, in place.

    question_list: list of token lists to tag. When omitted, the module-level
        ``questions`` list is used (the original behaviour).
    Returns None; the lists are modified in place.

    A question that already starts with '<bos>' is left unchanged. An empty
    question becomes ['<bos>', '<eos>'] instead of raising IndexError.
    """
    if question_list is None:
        question_list = questions
    for question in question_list:
        # `not question` guards the [0] access for empty token lists.
        if not question or question[0] != '<bos>':
            question.insert(0, '<bos>')
            question.append('<eos>')
# Tag every question of the module-level `questions` list.
add_tag()

In [30]:
# questions

In [31]:
# Using sklearn to get the vocabulary
vectorizer = CountVectorizer()
# CountVectorizer works on strings, so each token list is joined back into
# one space-separated sentence.
str_questions = [' '.join(quest) for quest in questions]
# `output` is whatever fit() returns; its vocabulary_ is read in the next cell.
output = vectorizer.fit(str_questions)

In [32]:
# This is the vocabulary dict.
vocabulary = output.vocabulary_
word_list = list(vocabulary.keys())
# We add the unk word for future purpose.
# Only append it when the corpus did not already produce the token 'unk':
# a duplicate entry would make len(words_array) larger than the number of
# distinct words, i.e. larger than the one-hot width computed from it.
if 'unk' not in vocabulary:
    word_list.append('unk')
words_array = np.array(word_list)
print("Vocabulary contains", len(words_array), "words.")

Vocabulary contains 8414 words.


In [33]:
# One-hot encoding of the vocabulary with OneHotEncoder
# The encoder expects a 2-D input: one row per word, a single column.
words_tre = words_array.reshape(len(words_array), 1)
# The keyword asking for a dense array was renamed from `sparse` to
# `sparse_output` in recent scikit-learn releases; try the new name first and
# fall back to the old one so the cell runs on both.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
# Row i of onehot_encoded is the one-hot vector of words_array[i].
onehot_encoded = one_hot_encoder.fit_transform(words_tre)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [34]:
# Creating a dictionary mapping each word to its one-hot array
# (word_list[i] is paired with row i of onehot_encoded).
words_onehoted = dict(zip(word_list, onehot_encoded))

In [35]:
# Function to get the corresponding one hot list for a word.
def get_onehot_word(word):
    """Return the one-hot vector of ``word`` as a plain Python list.

    word: key to look up in the module-level ``words_onehoted`` mapping.
    Returns a new list copied from the stored vector, or an empty list when
    the word is not a key of the mapping.
    """
    # Direct key lookup: the previous version iterated over every
    # (key, value) pair of the dictionary on each call.
    vector = words_onehoted.get(word)
    return [] if vector is None else list(vector)

In [36]:
# Sanity check: position of the 1.0 inside the one-hot vector of 'unk'.
oh = get_onehot_word('unk')
one = oh.index(1.0)
one

7898

In [37]:
# One hot categories

# Label order fixes the position of the 1 in each vector.
_CATEGORY_LABELS = (
    'ABBR',  # Abbreviation
    'ENTY',  # Entity
    'DESC',  # Description
    'HUM',   # Human
    'LOC',   # Location
    'NUM',   # Numeric
)
categories_onehoted = {
    label: [1 if column == row else 0 for column in range(len(_CATEGORY_LABELS))]
    for row, label in enumerate(_CATEGORY_LABELS)
}

In [38]:
# Function to get the corresponding one hot list for a category.
def get_onehot_category(category):
    """Return the one-hot vector of ``category`` as a new list.

    category: key to look up in the module-level ``categories_onehoted``
        mapping.
    Returns a copy of the stored vector, or an empty list when the category
    is not a key of the mapping.
    """
    # Direct key lookup instead of scanning every entry of the dictionary.
    vector = categories_onehoted.get(category)
    return [] if vector is None else list(vector)

In [39]:
# Sanity check: position of the 1 inside the one-hot vector of 'HUM'.
oh = get_onehot_category('HUM')
one = oh.index(1.0)
one

3

In [40]:
# Build one [input, target] pair per question:
#   input  : float32 tensor, one one-hot row per token, with a leading batch axis of 1
#   target : tensor holding the one-hot vector of the question's coarse category
batch_data = []
# Reuse the vectorizer's own analyzer to tokenize. Re-fitting `vectorizer` on
# every question replaced the corpus vocabulary it had learned, and reading
# the words back from vocabulary_ kept each distinct word only once, so
# repeated words disappeared from the sequence fed to the RNN.
tokenize = vectorizer.build_analyzer()
for num_question in range(len(questions)):
    words = tokenize(' '.join(questions[num_question]))
    if not words:
        # Nothing left after tokenization: an empty sequence cannot be fed to the RNN.
        continue

    # Construction of question_onehot list; words missing from the
    # vocabulary fall back to the 'unk' vector.
    question_onehot = [get_onehot_word(word) or get_onehot_word('unk') for word in words]

    # Construction of category_onehot ("DESC:manner" -> "DESC").
    category = labels[num_question].partition(':')[0]
    category_onehot = get_onehot_category(category)

    # float32 matches the default dtype of the nn.RNN parameters.
    question_tensor = tensor(np.array(question_onehot), dtype=torch.float32).unsqueeze(0)
    batch_data.append([question_tensor, tensor([category_onehot])])
    

# RNN implementation
Using a ReLU nonlinearity and a cross-entropy loss.

In [41]:
class RNN(nn.Module):
    """Stacked ReLU RNN followed by a linear layer and a softmax over classes.

    Parameters
    ----------
    nb_inputs : size of each input vector (last axis of the input tensor).
    nb_layers : number of stacked recurrent layers.
    nb_neurons : hidden size of every recurrent layer.
    nb_outputs : number of classes produced by the linear layer.
    learning_rate : learning rate of the Adam optimizer stored in ``self.optimizer``.
    """

    def __init__(self, nb_inputs, nb_layers, nb_neurons, nb_outputs, learning_rate):
        super(RNN, self).__init__()

        # Recurrent layers, linear projection to the classes, then softmax.
        # Dropout only acts between stacked layers, so it is disabled for a
        # single-layer network (PyTorch warns otherwise).
        self.rnn = nn.RNN(input_size=nb_inputs, num_layers=nb_layers,
                   hidden_size=nb_neurons,
                   dropout=0.5 if nb_layers > 1 else 0.0,
                   batch_first=True, nonlinearity='relu')
        self.inter = nn.Linear(nb_neurons, nb_outputs)
        self.sm = nn.Softmax(dim=2)

        # Other useful variables here
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.input_dim = nb_inputs
        # Use the constructor argument; the previous code read the unrelated
        # module-level name `nb_output`.
        self.output_dim = nb_outputs
        self.nb_layers = nb_layers
        self.nb_neurons = nb_neurons
        self.synapses = torch.zeros(self.nb_layers, 1, self.nb_neurons)

    def forward(self, inputs):
        """Return class probabilities for every time step.

        inputs: tensor indexed as (batch, sequence, nb_inputs) since the RNN
            is built with batch_first=True.
        Returns a tensor (batch, sequence, nb_outputs) whose last axis sums to 1.
        """
        reference = self.inter.weight
        # Inputs built from NumPy one-hot rows may be float64; the layers
        # need the same dtype as their parameters.
        inputs = inputs.to(dtype=reference.dtype)
        h0 = torch.zeros(self.nb_layers, inputs.size(0), self.nb_neurons,
                         dtype=reference.dtype, device=inputs.device)
        x, hn = self.rnn(inputs, h0)

        x = self.inter(x)
        return self.sm(x)

# End of the class RNN

# Now let's define learn(), which trains an RNN on some data
def learn(rnn, batch_list, num_epochs=1):
    """Train ``rnn`` on ``batch_list`` and return the per-batch losses.

    rnn: module whose call returns class probabilities, either
        (batch, sequence, classes) or (batch, classes), and which exposes an
        ``optimizer`` attribute.
    batch_list: sequence of (data, target) pairs. ``target`` is either a
        one-hot tensor (batch, classes) or a tensor of class indices (batch,).
    num_epochs: number of passes over ``batch_list``.

    Returns the list of loss values (floats), one per processed batch.
    The caller's ``batch_list`` is left in its original order.
    """
    # Preparing
    rnn.train()
    losses = []
    # The network already outputs probabilities, so the cross-entropy is the
    # negative log-likelihood of their logarithm. CrossEntropyLoss would
    # apply a second softmax on top of them.
    criterion = nn.NLLLoss()

    for epoch in range(num_epochs):
        # Shuffle a copy at every epoch so the caller's list is not reordered.
        epoch_batches = list(batch_list)
        shuffle(epoch_batches)

        for batch_idx, (data, target) in enumerate(epoch_batches):
            output = rnn(data)
            if output.dim() == 3:
                # One label per question: classify from the last time step.
                output = output[:, -1, :]
            if target.dim() == output.dim():
                # One-hot target -> class index expected by the loss.
                target = target.argmax(dim=-1)

            # Clamp to avoid log(0) when a probability underflows.
            loss = criterion(torch.log(output.clamp_min(1e-12)), target)

            losses.append(loss.item())

            rnn.optimizer.zero_grad()

            loss.backward()
            rnn.optimizer.step()

            # Print the progress
            if batch_idx % 100 == 0 or batch_idx % 100 == 1 or batch_idx == len(epoch_batches)-1:
                print('\r Train Epoch: {} [{}/{} ({:.0f}%)]\t Loss: {:.6f}'.format(
                        epoch,
                        (batch_idx+1) * len(data),
                        len(epoch_batches),
                        100. * (batch_idx+1) / len(epoch_batches),
                        loss.item()),
                        end='')
        print()

    # Return losses list, you can print them later if you want
    return losses

# Using the RNN

In [None]:
# The input width is the length of one word's one-hot vector and the output
# width is the number of classes; both are derived from the data instead of
# being repeated as literals.
rnn = RNN(nb_inputs=onehot_encoded.shape[1], nb_layers=2, nb_neurons=3,
          nb_outputs=nb_output, learning_rate=0.001)

losses = learn(rnn, batch_data, 20)



## Error curve

(Code provenant du TP de Barrault)

In [None]:
from scipy.signal import savgol_filter
import ipywidgets as widgets
from ipywidgets import interact
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

# Raw loss curve: one abscissa per recorded loss value.
x = np.linspace(0, len(losses), len(losses))
fig, ax = plt.subplots(figsize=(13, 8))
cnn_line, = ax.plot(x, losses)

def update_losses(smooth=51):
    """Replace the plotted curve with a Savitzky-Golay smoothed version.

    smooth: window length passed to savgol_filter (polynomial order 3).
    """
    smoothed = savgol_filter(losses, smooth, 3)
    cnn_line.set_ydata(smoothed)
    fig.canvas.draw()

# Slider over odd window lengths from 5 to 201.
interact(update_losses, smooth=(5, 201, 2));