In [1]:
import torch
import numpy as np
import pandas as pd
import codecs
import re
import nltk
import random

from nltk.stem import WordNetLemmatizer

from random import shuffle

from collections import Counter

from numpy import array

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import LeaveOneOut, KFold, cross_val_score

from torch import tensor
from torch import nn
from torch import optim
from torch.autograd import Variable

# Parameters

In [2]:
# Number of recognized words you put in input
nb_input = 100 #8414 max

# Number of classe, constant
nb_output = 6

# Number of hidden layers
nb_hidd_lay = 1
hidden_size = 1

# Learning rate
lr = 0.001

# Number of epochs
nb_epochs = 10

# Random seed, don't change it if you don't know what it is
random_seed = 42

use_cuda = torch.cuda.is_available()
print(use_cuda) 

True


### 0.Preprocessing

In [3]:
# Encoding in windows-1252, utf-8 generate error on some char
file = codecs.open("train_all.label", "r+","windows-1252")
train_data = []
for line in file.readlines():
    train_data.append(line)

# Now test set
file = codecs.open("TREC_test.label", "r+","windows-1252")
test_data = []
for line in file.readlines():
    test_data.append(line)

In [4]:
# Lemmatisation
# Working on list_of_words
# TODO: it doesn't work
def lemm(phrase):
    lemmer = WordNetLemmatizer()
    lemmed = []
    for word in phrase:
        lemmed.append(lemmer.lemmatize(word, pos="v"))
    return lemmed

In [5]:
# Divided file into 2 list:
# questions = list of questions 
# labels = list of labels

questions = []
labels = []

# Black list
regex = re.compile('[@_!#$%^&*()<>?/\|}{~:]')

(train_data[0]).split()[0]
for string in train_data:
    question_str = []
    for x in lemm(string.split()[1:]):
        question_str.append(x.lower())
    labels.append(string.split()[0])
    questions.append(question_str)
print("len(questions) = " + str(len(questions)))
print("Exemples of questions:")
print(questions[:2])



questions_test = []
labels_test = []

(test_data[0]).split()[0]
for string in test_data:
    question_str = []
    for x in lemm(string.split()[1:]):
        question_str.append(x.lower())
    labels_test.append(string.split()[0])
    questions_test.append(question_str)
print()
print("And in the test set:")
print("len(questions_test) = " + str(len(questions_test)))
print("Exemples of questions:")
print(questions_test[:2])

len(questions) = 15452
Exemples of questions:
[['how', 'do', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?'], ['what', 'film', 'feature', 'the', 'character', 'popeye', 'doyle', '?']]

And in the test set:
len(questions_test) = 500
Exemples of questions:
[['how', 'far', 'be', 'it', 'from', 'denver', 'to', 'aspen', '?'], ['what', 'county', 'be', 'modesto', ',', 'california', 'in', '?']]


In [6]:
# Method to add tags begin and end to phrases list.
# /!\ 
# WARNING : this method need to be executed only ONE time.
# /!\
def add_tag(question_list):
    for i in range(0, len(question_list)):
        if question_list[i][0] != '<bos>' :
            question_list[i].insert(0, '<bos>')
            question_list[i].append('<eos>')
add_tag(questions)
add_tag(questions_test)

In [7]:
# questions

In [8]:
# Vocabulary of unique words
data = []
data.append('<unk>')
for q in questions:
    for word in q:
        data.append(word)
print(len(data))

188317


In [9]:
scv = np.array(data)
unik, counts = np.unique(scv,return_counts=True)
vocabulary = {}
for i in range(0, len(unik)):
    vocabulary[unik[i]] = counts[i]

In [10]:
word_list = list([x[0] for x in Counter(vocabulary).most_common(nb_input-1)])

# We add the unk word for future purpose.
word_list.append('<unk>')
words_array = np.array(word_list)
print("Vocabulary contains", len(words_array), "words.")

Vocabulary contains 100 words.


In [11]:
# word_list

In [12]:
# Integer encoding with OneHotEncoder
words_tre = words_array.reshape(len(words_array),1)
one_hot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = one_hot_encoder.fit_transform(words_tre)
#print(onehot_encoded)

In [13]:
# Creating a dictionnary of word and its one hot array
words_onehoted = {}
for i in range(0, len(words_array)):
    words_onehoted[word_list[i]] = onehot_encoded[i]

# Function to get the corresponding one hot list for a word.
def get_onehot_word(word):
    if word in words_onehoted:
        return list(words_onehoted[word])
    else:
        return list(words_onehoted['<unk>'])

In [14]:
oh = get_onehot_word('<unk>')
one = oh.index(1.0)
one

8

In [15]:
# Testing if an unknown word is transformed into a <unk>
get_onehot_word('obviously_an_unknown_word').index(1.0)

8

In [16]:
# One hot categories
categories_onehoted = {}
categories_onehoted['ABBR'] = [1, 0, 0, 0, 0, 0] # Abbreviation
categories_onehoted['ENTY'] = [0, 1, 0, 0, 0, 0] # Entity
categories_onehoted['DESC'] = [0, 0, 1, 0, 0, 0] # Description
categories_onehoted['HUM']  = [0, 0, 0, 1, 0, 0] # Human
categories_onehoted['LOC']  = [0, 0, 0, 0, 1, 0] # Location
categories_onehoted['NUM']  = [0, 0, 0, 0, 0, 1] # Numeric

# Function to get the corresponding one hot list for a category.
def get_onehot_category(category):
    return categories_onehoted[category]

In [17]:
oh = get_onehot_category('HUM')
one = oh.index(1.0)
one

3

In [18]:
%%time
# Creating training set

batch_data = []
for num_question in range(len(questions)):
    # Construction of question_onehot list.
    question_onehot = [get_onehot_word(word) for word in questions[num_question]]
    
    # Construction of category_onehot.
    category = labels[num_question].partition(':')[0]
    category_onehot = get_onehot_category(category)
    batch_data.append([tensor([question_onehot]), tensor([category_onehot])])    
print(len(batch_data))
# Creating test set

batch_data_test = []
for num_question in range(len(questions_test)):
    
    # Construction of question_onehot list.
    question_onehot = [get_onehot_word(word) for word in questions[num_question]]
    
    # Construction of category_onehot.
    category = labels_test[num_question].partition(':')[0]
    category_onehot = get_onehot_category(category)
    batch_data_test.append([tensor([question_onehot]), tensor([category_onehot])])
print(len(batch_data_test))

15452
500
CPU times: user 1.61 s, sys: 34.5 ms, total: 1.64 s
Wall time: 1.64 s


In [19]:
# Dev / Train
batch_dev = batch_data[10000:]
batch_train = batch_data[:10000]

# RNN implementation
Using ReLU, and CrossEntropy

In [26]:
class RNN(nn.Module):
    def __init__(self, nb_inputs, nb_layers, nb_neurons, nb_outputs, learning_rate):
        super(RNN, self).__init__()
        
        # Applying RNN layer, and softmax then
        self.rnn = nn.RNN(input_size=nb_inputs, num_layers=nb_layers,
                   hidden_size=nb_neurons, dropout=0., batch_first=True, nonlinearity='relu')
        self.inter = nn.Linear(nb_neurons, nb_outputs)
        self.sm = nn.Softmax(dim=2)
        
        # Other usefull variables here
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.input_dim = nb_inputs
        self.output_dim = nb_output
        self.nb_layers = nb_layers
        self.nb_neurons = nb_neurons
        
    def forward(self, inputs):
        h0 = Variable(torch.zeros(self.nb_layers, inputs.size(0), self.nb_neurons))
        if use_cuda:
            h0 = h0.to("cuda")
        x, hn = self.rnn(inputs, h0)
        
        x = self.inter(x)
        x = self.sm(x)
        return x

# End of the class RNN

# Now let's define learn(), which learn a RNN some data
def learn(rnn, batch_list, num_epochs=1):
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        rnn.cuda(device)
    
    # Preparing
    rnn.train()
    losses = []
    criterion = nn.CrossEntropyLoss()
    

    for epoch in range(num_epochs):
        # Shuffling batch_list
        shuffle(batch_list)
        
        for batch_idx, (data, target) in enumerate(batch_list):
            data, target = data.to(device), target.to(device)
            output = rnn(data)
            loss = criterion(output, target)
            losses.append(loss.data.item())
            rnn.optimizer.zero_grad()
            loss.backward()
            rnn.optimizer.step()
            
            # Print the progress
            if batch_idx % 100 == 0 or batch_idx % 100 == 1 or batch_idx == len(batch_list)-1:
                print('\r Train Epoch: {} [{}/{} ({:.0f}%)]\t Loss: {:.6f}'.format(
                        epoch, 
                        (batch_idx+1) * len(data), 
                        len(batch_list),
                        100. * (batch_idx+1) / len(batch_list), 
                        loss.data.item()), 
                        end='')
        print()
        
    # Return losses list, you can print them later if you want
    return losses


# return (rightAnswer, ignored, falseAnswer)
def getEfficience(rnn, batch_list, tresh=0) :
    rightAnswer = 0
    ignored = 0
    falseAnswer = 0
    
    device = torch.device("cuda" if use_cuda else "cpu")
    for (data, target) in batch_list :
        data, target = data.to(device), target.to(device)
        predicted = rnn(data).detach().cpu().numpy()[-1][-1]
        #print("predicted: "+str(np.argmax(predicted)))
        #print("target: "+str(np.argmax(target.detach().cpu().numpy()[-1])))
        if max(predicted) < tresh :
            ignored += 1
        else:
            if np.argmax(predicted) == np.argmax(target.detach().cpu().numpy()[-1]):
                rightAnswer += 1
            else:
                falseAnswer += 1
    return (rightAnswer, ignored, falseAnswer)

### LSTM

In [21]:
class LSTM(nn.Module):
    def __init__(self, nb_input, nb_output, nb_hidd_lay, hidden_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = nb_hidd_lay
        self.lstm = nn.LSTM(nb_input,hidden_size,nb_hidd_lay, batch_first=True)
        self.fc = nn.Linear(hidden_size, nb_output)
    
    def forward(self,inputs):
        h0 = Variable(torch.zeros(self.nb_hidd_lay, inputs.size(0), self.hidden_size))
        c0 = Variable(torch.zeros(self.nb_hidd_lay, inputs.size(0), self.hidden_size))
        
        out, _ = self.lstm(inputs,(h0,c0))
        out = self.fc(out[:,-1,:])
        return out

# Using the RNN

In [22]:
random.seed(random_seed)
torch.manual_seed(random_seed)
if use_cuda:
    torch.cuda.manual_seed_all(random_seed)
    torch.cuda.manual_seed(random_seed)

rnn = RNN(nb_inputs = nb_input, nb_layers=nb_hidd_lay,
          nb_neurons=hidden_size,nb_outputs=nb_output, learning_rate=lr)
if use_cuda:
    rnn = rnn.to("cuda")
losses = learn(rnn, batch_dev, nb_epochs)
print("Done :)")
print('-------------------')
losses_train = learn(rnn, batch_train, nb_epochs)
print("Done")

Done :)
-------------------
Done


## Error curve

In [24]:
from scipy.signal import savgol_filter
import ipywidgets as widgets
from ipywidgets import interact
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

x = np.linspace(0, len(losses), len(losses))
fig = plt.figure(figsize=(13, 8)) 
ax = fig.add_subplot(1,1,1)
cnn_line, = ax.plot(x, losses)

def update_losses(smooth=51):
    cnn_line.set_ydata(savgol_filter(losses, smooth, 3))
    fig.canvas.draw()

interact(update_losses, smooth=(5, 201, 2));

<Figure size 1300x800 with 1 Axes>

interactive(children=(IntSlider(value=51, description='smooth', max=201, min=5, step=2), Output()), _dom_class…

# Analysis on test set

In [25]:
random.seed(random_seed)
torch.manual_seed(random_seed)
if use_cuda:
    torch.cuda.manual_seed_all(random_seed)
    torch.cuda.manual_seed(random_seed)

rnn.eval()

final_results = getEfficience(rnn, batch_data)
total = sum(final_results)

correct = final_results[0]/total*100
ignored = final_results[1]/total*100
false = final_results[2]/total*100



print("Congratulations! On the training set:")
print("Corrects: " + str(correct) + "%")
#print("Ignored:  " + str(ignored) + "%")
print("False:    " + str(false) + "%")
print()

final_results = getEfficience(rnn, batch_data_test)
total = sum(final_results)

correct = final_results[0]/total*100
ignored = final_results[1]/total*100
false = final_results[2]/total*100



print("Congratulations! On the test set:")
print("Corrects: " + str(correct) + "%")
#print("Ignored:  " + str(ignored) + "%")
print("False:    " + str(false) + "%")

print()
print("A présent, tu peux copier-coller ça dans le doc sur le drive :)")
print(str(nb_input)+"\t"+str(lr)+"\t"+str(nb_epochs)+"\t"+str(nb_hidd_lay)+"\t"+str(hidden_size)+"\t\t"+str(correct)+"%")
print()

Congratulations! On the training set:
Corrects: 1.5596686513072742%
False:    98.44033134869272%

Congratulations! On the test set:
Corrects: 1.7999999999999998%
False:    98.2%

A présent, tu peux copier-coller ça dans le doc sur le drive :)
100	0.001	10	1	1		1.7999999999999998%

