In [1]:
import torch
import numpy as np
import pandas as pd
import codecs
import re
import nltk
import random

from nltk.stem import WordNetLemmatizer

from random import shuffle

from collections import Counter

from numpy import array

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

from torch import tensor
from torch import nn
from torch import optim
from torch.autograd import Variable
import torch.utils.data.dataloader as dataloader
from torch.utils.data import Dataset

from scipy.stats import entropy


from scipy.signal import savgol_filter
import ipywidgets as widgets
from ipywidgets import interact
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

# Parameters

In [2]:
# Number of recognized words you put in input
nb_input = 1700 # write -1 if you want every words

# Number of classe, constant
nb_output = 6

# Number of hidden layers
nb_hidd_lay = 8
hidden_size = 10

# Learning rate
lr = 0.001

# Number of epochs
nb_epochs = 80

# Random seed, don't change it if you don't know what it is
random_seed = 42

nb_batchs = 16

# How many percent of your data do you use as training set
devLine = 0.7

use_cuda = torch.cuda.is_available()

# If your goal is to draw graphs
great_analysis = False

In [3]:
def seeding_random():
    random.seed(random_seed)
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    if use_cuda:
        torch.cuda.manual_seed_all(random_seed)
        torch.cuda.manual_seed(random_seed)

torch.backends.cudnn.deterministic=True

## Dataloader implementation

In [4]:
class QuestionDataset(Dataset):
    
    # Special constructor
    # | nb_most_commons can either be the number of most common words you
    # | want to work with, OR a list of word you want to work with
    # If nb_most_commons == -1, then all word will count
    
    def __init__(self, train_data, nb_most_commons=-1):
        questions = []
        labels = []

        # Black list
        black_list = '\'`[@_!#$%^&*()<>?/\|}{~:]'
        
        for string in train_data:
            question_str = []
            for x in string.split()[1:]:
                s = ""
                for c in x:
                    if not c in black_list:
                        s += c
                if not s == "":
                    question_str.append(s.lower())
                        
            labels.append(string.split()[0])
            questions.append(question_str)

        
        if isinstance(nb_most_commons, int):
            # Vocabulary of unique words
            data = []
            for q in questions:
                for w in q:
                    data.append(w)
            self.reparti_word = Counter(data)
            
            if nb_most_commons < 0:
                most_commons_words = self.reparti_word.most_common(len(data))
            else:
                most_commons_words = self.reparti_word.most_common(nb_most_commons)
            
            self.word_list = list([x[0] for x in most_commons_words])
            self.word_list.append('<bos>')
            self.word_list.append('<eos>')
            self.word_list.append('<unk>')
        elif isinstance(nb_most_commons, list):
            self.word_list = nb_most_commons
        else:
            print("ERROR: second arg is neither an int, nor a list")
            
        words_array = np.array(self.word_list)
        
        # Add tags <bos> and <eos> to questions
        for q in questions:
            if q[0] != '<bos>' :
                q.insert(0, '<bos>')
                q.append('<eos>')

        # Integer encoding with OneHotEncoder
        words_tre = words_array.reshape(len(words_array),1)
        one_hot_encoder = OneHotEncoder(sparse=False)
        onehot_encoded = one_hot_encoder.fit_transform(words_tre)
        # Creating a dictionnary of word and its one hot array
        self.words_onehoted = {}
        for i in range(0, len(words_array)):
            self.words_onehoted[self.word_list[i]] = onehot_encoded[i]

        # One hot categories
        self.categories_num = {}
        self.categories_num['ABBR'] = 0 # Abbreviation
        self.categories_num['ENTY'] = 1 # Entity
        self.categories_num['DESC'] = 2 # Description
        self.categories_num['HUM']  = 3 # Human
        self.categories_num['LOC']  = 4 # Location
        self.categories_num['NUM']  = 5 # Numeric

        self.batch_data = []
        for num_question in range(len(questions)):
            # Construction of question_onehot list.
            question_onehot = [self.get_onehot_word(word) for word in questions[num_question]]

            # Construction of category_onehot.
            category = labels[num_question].partition(':')[0]
            category_onehot = self.get_num_category(category)
            self.batch_data.append([(question_onehot), (category_onehot)])
        
    
    # Function to get the corresponding one hot list for a category.
    def get_num_category(self, category):
        return self.categories_num[category]


    # Function to get the corresponding one hot list for a word.
    def get_onehot_word(self, word):
        if word in self.words_onehoted:
            return list(self.words_onehoted[word])
        else:
            return list(self.words_onehoted['<unk>'])

                
    def __len__(self):
        return len(self.batch_data)

    def __getitem__(self, idx):
        seeding_random()
        return self.batch_data[idx]
    
def pad_collate(batch):
    max_length = max([len(q[0]) for q in batch])

    inputs = torch.FloatTensor([[[0. for _ in range(len(x[0][0]))] for i in range(max_length-len(x[0]))]+x[0] for x in batch])
    outputs = torch.LongTensor([x[1] for x in batch])
    
    return inputs, outputs
    

In [5]:

seeding_random()

# Encoding in windows-1252, utf-8 generate error on some char
file = codecs.open("train_all.label", "r+","windows-1252")
data = []
for line in file.readlines():
    data.append(line)
train_data = data[:round(len(data)*devLine)]
dev_data = data[round(len(data)*devLine):]

print("Création training set...")
training_set = QuestionDataset(train_data, nb_input-3)

print("Done!")

print("Création dev set...")
dev_set = QuestionDataset(dev_data, training_set.word_list)
seeding_random()

print("Done!")

print("Création test set...")
file = codecs.open("TREC_test.label", "r+","windows-1252")
test_data = []
for line in file.readlines():
    test_data.append(line)
test_set = QuestionDataset(test_data, training_set.word_list)
seeding_random()

# Création du DataLoader
dataloader_args = dict(shuffle=True, batch_size=nb_batchs, num_workers=1,
                       pin_memory=True, worker_init_fn=seeding_random(), collate_fn=pad_collate)
seeding_random()

train_loader = dataloader.DataLoader(training_set, **dataloader_args)
seeding_random()

dataloader_args_notshuffle = dict(shuffle=False, batch_size=nb_batchs, num_workers=1,
                       pin_memory=True, worker_init_fn=seeding_random(), collate_fn=pad_collate)

dev_loader = dataloader.DataLoader(dev_set, **dataloader_args)
seeding_random()

test_loader = dataloader.DataLoader(test_set, **dataloader_args_notshuffle)
seeding_random()
print("Done!")

print("List of word used:")
print(training_set.word_list)


Création training set...
Done!
Création dev set...
Done!
Création test set...
Done!
List of word used:
['the', 'what', 'is', 'of', 'in', 'a', 'how', 's', 'was', 'who', 'to', ',', 'are', 'for', 'and', 'did', 'does', 'do', 'name', 'on', 'many', 'where', 'i', 'you', 'can', 'first', 'when', 'from', 'which', 'world', 'that', 'city', 'as', 'with', 'country', 'has', 'most', '.', 'u.s.', 'by', 'an', 'have', 'find', 'it', 'why', 'there', 'get', 'people', 'called', 'state', 'year', 'were', 'mean', 'be', 'american', 'president', 'largest', 'his', 'fear', 'two', 'at', 'war', 'new', 'its', 'origin', 'word', 'much', 'about', 'known', 'kind', 'company', 'between', 'game', 'film', 'long', 'movie', 'day', 'live', 'made', 'your', 'or', 'take', 'only', 'stand', 'man', 'best', 'book', 'tv', 'their', 'one', 'john', 'famous', 'all', 'color', 'show', 'star', 'term', 'he', 'used', 'my', 'out', 'play', 'come', 'baseball', 'invented', 'had', 'into', 'call', 'number', 'countries', 'make', 'home', 'dog', 'time', 

# Repartition per classe

In [6]:
if great_analysis:
    classes = [0,0,0,0,0,0]
    for data, target in train_loader:
        for t in list(target):
            t = t.item()
            classes[t] += 1

    total = sum(classes)
    rep_classes = [c/total*100 for c in classes]
    print("Répartitions des données dans les classes:")
    for i in range(len(rep_classes)):
        print("Classe numéro " + str(i+1) + ": " + str(rep_classes[i]) + "%")

## Word occurence repartition

In [7]:
if great_analysis:

    word_occ = training_set.reparti_word
    word_occ = dict(word_occ)
    
    total = sum([value for key, value in training_set.reparti_word.most_common(len(training_set.reparti_word))])
    
    values = [sum([value for key, value in training_set.reparti_word.most_common(i+1)])/total*100 for i in range(len(training_set.reparti_word))]

    x = np.linspace(0, len(values), len(values))
    fig = plt.figure(figsize=(13, 8)) 
    ax = fig.add_subplot(1,1,1)
    cnn_line, = ax.plot(x, values)

    ax.set(xlabel="Vocabulaire unique", ylabel="Couverture en %")


# RNN implementation
Using ReLU, and CrossEntropy

In [8]:
class RNN(nn.Module):
    def __init__(self, nb_inputs, nb_layers, nb_neurons, nb_outputs, learning_rate):
        super(RNN, self).__init__()
        
        # Applying RNN layer, and softmax then
        self.rnn = nn.RNN(input_size=nb_inputs, num_layers=nb_layers,
                   hidden_size=nb_neurons, dropout=0., batch_first=True, nonlinearity='relu')
        self.inter = nn.Linear(nb_neurons, nb_outputs)
        self.sm = nn.Softmax(dim=1)
        
        # Other usefull variables here
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.input_dim = nb_inputs
        self.output_dim = nb_output
        self.nb_layers = nb_layers
        self.nb_neurons = nb_neurons
        
    def forward(self, inputs):
        h0 = torch.zeros(self.nb_layers, inputs.size(0), self.nb_neurons)
        if use_cuda:
            h0 = h0.to("cuda")
        x, hn = self.rnn(inputs, h0)
        
        x = self.inter(hn[0])
        #print(x)
        #x = tensor([list(i[-1]) for i in x])
        #print(x)
        x = self.sm(x)
        return x

# End of the class RNN

#TODO
#Entropy mean might be near to zero
def getEntropies(rnn, batch_list):
    entropy_list = []
    #value, counts = np.unique(out, return_counts=True)
    #entropy_list.append(entropy(out, base=None))
    return [-1]


# return correct_percent
def getEfficience(rnn, batch_list) :
    total_correct = 0
    total = 0
    device = torch.device("cuda" if use_cuda else "cpu")
    for (data, target) in batch_list :
        data, target = data.to(device), target.to(device)
        out = rnn(data).data
        
        _, predicted = torch.max(out.data, dim=1)
        total_correct += (predicted == target).sum().item()
        total += target.size(0)

    return total_correct / total

# Now let's define learn(), which learn a RNN some data
def learn(rnn, data_loader, num_epochs=1):
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        rnn.cuda(device)
    
    # Preparing
    rnn.train()
    losses_train = []
    losses_dev = []
    criterion = nn.CrossEntropyLoss()
    
    best_rnn = rnn
    max_acc_dev = -1
    pos_best_rnn = 0;
    

    for epoch in range(num_epochs):
        total_correct = 0
        total_target = 0
        
        for batch_idx, (data, target) in enumerate(data_loader):
            #rnn.train()
            seeding_random()
            data, target = data.to(device), target.to(device)
            
            output = rnn(data)
            
            loss = criterion(output, target)
            rnn.optimizer.zero_grad()
            loss.backward()
            rnn.optimizer.step()
            
            # Get the Accuracy
            
            _, predicted = torch.max(output.data, dim=1)
            correct = (predicted == target).sum().item()
            total_correct += correct
            total_target += target.size(0)
            
            # Print the progress
            if batch_idx % 500 == 0 or batch_idx % 500 == 1 or batch_idx == len(data_loader)-1:
                print('\r Train Epoch: {}/{} [{}/{} ({:.0f}%)]\t Loss: {:.6f}\t Accuracy: {}'.format(
                    epoch+1,
                    num_epochs,
                    batch_idx * len(data), 
                    len(data_loader.dataset),
                    100. * batch_idx / len(data_loader), 
                    loss.data.item(),
                    (total_correct / total_target) * 100),
                    end='')
                losses_train.append(loss.data.item())
                if great_analysis:
                    dev_data, dev_target = next(iter(dev_loader))
                    dev_data, dev_target = dev_data.to(device), dev_target.to(device)
                    output = rnn(dev_data)
                    loss = criterion(output, dev_target)
                    losses_dev.append(loss.data.item())
                    
                    
        print()
        acc_dev = getEfficience(rnn, dev_loader)*100
        if acc_dev > max_acc_dev:
            max_acc_dev = acc_dev
            best_rnn = rnn
            pos_best_rnn = epoch
        
        print("Dev set: accuracy: " + str(acc_dev) + "% | max acc: " + str(max_acc_dev)+"%")
        print()
    rnn = best_rnn
    # Return losses list, you can print them later if you want
    return {"losses_train":losses_train, "losses_dev":losses_dev, "pos_best":pos_best_rnn+1, "best_rnn":best_rnn}


# Using the RNN

In [9]:
import datetime

seeding_random()

rnn = RNN(nb_inputs = len(training_set.word_list), nb_layers=nb_hidd_lay,
          nb_neurons=hidden_size, nb_outputs=nb_output, learning_rate=lr)
if use_cuda:
    rnn = rnn.to("cuda")

seeding_random()

begin_time = datetime.datetime.now()

with torch.enable_grad():
    job = learn(rnn, train_loader, nb_epochs)
    losses_train = job["losses_train"]
    losses_dev = job["losses_dev"]
    pos_best_rnn = job["pos_best"]
    rnn = job["best_rnn"]
    print("Done :)")
    
end_time = datetime.datetime.now()
print("Learned in " + str(end_time - begin_time))

Dev set: accuracy: 30.97497842968076% | max acc: 30.97497842968076%

Dev set: accuracy: 49.13718723037101% | max acc: 49.13718723037101%

Dev set: accuracy: 59.98705780845557% | max acc: 59.98705780845557%

Dev set: accuracy: 63.114754098360656% | max acc: 63.114754098360656%

Dev set: accuracy: 70.66436583261432% | max acc: 70.66436583261432%

Dev set: accuracy: 74.5685936151855% | max acc: 74.5685936151855%

Dev set: accuracy: 77.24331320103538% | max acc: 77.24331320103538%

Dev set: accuracy: 76.03537532355479% | max acc: 77.24331320103538%

Dev set: accuracy: 76.33735979292493% | max acc: 77.24331320103538%

Dev set: accuracy: 79.05522001725626% | max acc: 79.05522001725626%

Dev set: accuracy: 75.69025021570319% | max acc: 79.05522001725626%

Dev set: accuracy: 76.79033649698016% | max acc: 79.05522001725626%

Dev set: accuracy: 80.88869715271785% | max acc: 80.88869715271785%

Dev set: accuracy: 81.4495254529767% | max acc: 81.4495254529767%

Dev set: accuracy: 80.52200172562553

Dev set: accuracy: 76.74719585849871% | max acc: 85.35375323554788%

Dev set: accuracy: 84.01639344262296% | max acc: 85.35375323554788%

Dev set: accuracy: 84.55565142364107% | max acc: 85.35375323554788%

Dev set: accuracy: 84.59879206212251% | max acc: 85.35375323554788%

Dev set: accuracy: 85.22433132010354% | max acc: 85.35375323554788%

Dev set: accuracy: 84.4909404659189% | max acc: 85.35375323554788%

Dev set: accuracy: 85.46160483175152% | max acc: 85.46160483175152%

Dev set: accuracy: 82.87316652286454% | max acc: 85.46160483175152%

Dev set: accuracy: 54.68075927523728% | max acc: 85.46160483175152%

Dev set: accuracy: 85.67730802415876% | max acc: 85.67730802415876%

Dev set: accuracy: 84.68507333908542% | max acc: 85.67730802415876%

Dev set: accuracy: 84.8360655737705% | max acc: 85.67730802415876%

Dev set: accuracy: 84.94391716997411% | max acc: 85.67730802415876%

Dev set: accuracy: 85.15962036238136% | max acc: 85.67730802415876%

Dev set: accuracy: 85.31061259706644

## Error curve

In [10]:
def update_losses(smooth=1):
    x_train = np.linspace(0, len(losses_train), len(losses_train))
    fig = plt.figure(figsize=(13, 8)) 
    ax_train = fig.add_subplot(1,1,1)
    cnn_line_train, = ax_train.plot(x_train, losses_train)
    cnn_line_train.set_ydata(savgol_filter(losses_train, smooth, 3))
    
    if great_analysis:
        x_dev = np.linspace(0, len(losses_dev), len(losses_dev))
        ax_dev = fig.add_subplot(1,1,1)
        cnn_line_dev, = ax_dev.plot(x_dev, losses_dev)
        cnn_line_dev.set_ydata(savgol_filter(losses_dev, smooth, 3))
    
interact(update_losses, smooth=(5, 500, 2));


interactive(children=(IntSlider(value=5, description='smooth', max=500, min=5, step=2), Output()), _dom_classe…

# Analysis on test set

In [11]:

print("Congratulations!")

rnn.eval()

seeding_random()

correct_train = getEfficience(rnn, train_loader)*100

print("On the training set:")
print("Corrects: " + str(correct_train) + "%")
print()

seeding_random()

correct_dev = getEfficience(rnn, dev_loader)*100

print("On the dev set:")
print("Corrects: " + str(correct_dev) + "%")
print()

seeding_random()

correct_test = getEfficience(rnn, test_loader)*100

mean_entropies = -1

print("On the test set:")
print("Moyenne des entropies: " + str(mean_entropies))
print("Corrects: " + str(correct_test) + "%")

print()

inputs = nb_input
if inputs == -1:
    inputs = len(training_set.word_list)-3

print("A présent, tu peux copier-coller ça dans le doc sur le drive :)")
print(str(inputs)+"\t"+str(lr)+"\t"+str(nb_epochs)+"\t"+str(nb_hidd_lay)
      +"\t"+str(hidden_size)+"\t"+str(nb_batchs)+"\t\t"+str(mean_entropies)+"\t"+str(pos_best_rnn)
      +"\t"+str(correct_train)+"%\t"+str(correct_dev)+"%\t"+str(correct_test)+"%")
print()



Congratulations!
On the training set:
Corrects: 94.50813609467455%

On the dev set:
Corrects: 85.61259706643658%

On the test set:
Moyenne des entropies: -1
Corrects: 79.4%

A présent, tu peux copier-coller ça dans le doc sur le drive :)
1700	0.001	80	8	10	16		-1	70	94.50813609467455%	85.61259706643658%	79.4%

