In [30]:
import numpy as np
import pickle as pkl

encoding = 'utf-8'
path_to_data = './data/'
#Ouverture du fichier de sms
path = path_to_data+"dataEmbeded.pkl"

with open(path, 'rb') as pickler:
    data = pkl.load(pickler)

# Tokenize

In [31]:
from utils.tokenizer import tokenize_corpus

processed_data = tokenize_corpus(data[:,0], stop_words = False, BoW = True)

In [39]:
import pickle

chunk_size = 5000
count = 0

nb_chunks = int(processed_data.shape[0]*0.8)//chunk_size

for chunk_nb in range(nb_chunks):
    path = './data/train-'+ str(chunk_nb) +'.npy'
    start = nb_chunks * chunk_nb
    end =  nb_chunks * (chunk_nb + 1) if (chunk_nb + 1) < nb_chunks else -1
    with open(path, 'wb') as file:
        pickle.dump(processed_data[start:end], file)
        
nb_chunks = int(processed_data.shape[0]*0.2)//chunk_size

for chunk_nb in range(nb_chunks):
    path = './data/test-'+ str(chunk_nb) +'.npy'
    start = nb_chunks * chunk_nb
    end =  nb_chunks * (chunk_nb + 1) if (chunk_nb + 1) < nb_chunks else -1
    with open(path, 'wb') as file:
        pickle.dump(processed_data[start:end], file)

# Model

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
use_cuda = torch.cuda.is_available()

print('Using cuda device '+torch.cuda.get_device_name(0) if use_cuda else 'Not using cuda')

Using cuda device GeForce GTX 1050 Ti with Max-Q Design


In [2]:
import pickle

path = './data/processed_data.npy'
with open(path, 'rb') as file:
    processed_data = pickle.load(file).toarray()

In [3]:
class batchify:
    def __init__(self, data, bsz, training=False, split=0.8):
        data = data[int(len(data)*split):] if training else data[:int(len(data)*split)]
        self.training = training
        self.batches = []
        batch = []
        for line in data:
            if len(batch) != bsz:
                batch.append(line)
            else:
                self.batches.append(batch)
                batch = []
                
    def __len__(self):
        return len(self.batches)

    def __getitem__(self, index):
        input_variable = Variable(torch.FloatTensor(self.batches[index]), volatile=self.training)
        target_variable = Variable(torch.FloatTensor(self.batches[index]), volatile=self.training)
        return (input_variable, target_variable)

In [4]:
class GaussianNoise(nn.Module):
    def __init__(self, sigma=0.1, is_relative_detach=True):
        super().__init__()
        self.sigma = sigma
        self.is_relative_detach = is_relative_detach
        self.noise = torch.tensor(0).cuda()

    def forward(self, x):
        if self.training and self.sigma != 0:
            scale = self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
            sampled_noise = self.noise.repeat(*x.size()).normal_() * scale
            x = x + sampled_noise
        return x 

In [5]:
class encoder(nn.Module):
    def __init__(self, input_size, noise_sigma=0.3, 
                 layers=(100,50), dropout=0.5, batch_norm=True):
        super().__init__()
        self.noiseLayer = GaussianNoise(sigma=noise_sigma)
        
        self.layers = []
        for i, size in enumerate(layers[:-1]):
            if i == 0:
                self.layers.append(nn.Linear(input_size, layers[i+1]))
            else:
                self.layers.append(nn.Linear(size, layers[i+1]))
            self.layers[-1].bias.data.fill_(0)
            self.layers[-1].weight.data.uniform_(-0.1, 0.1)
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(p=dropout))
            if batch_norm:
                self.layers.append(nn.BatchNorm1d(layers[i+1]))
        self.layers = torch.nn.ModuleList(self.layers)
            
        
    def forward(self, input):
        output = input
        for layer in self.layers:
            output = layer(output)
        return output
        

In [6]:
class decoder(nn.Module):
    def __init__(self, output_size, layers=(50,100,100), dropout=0.5, batch_norm=True):
        super().__init__()
        self.layers = []
        for i, size in enumerate(layers[:-1]):
            self.layers.append(nn.Linear(size, layers[i+1]))
            self.layers[-1].bias.data.fill_(0)
            self.layers[-1].weight.data.uniform_(-0.1, 0.1)
            self.layers.append(nn.ReLU())
            if batch_norm:
                self.layers.append(nn.BatchNorm1d(layers[i+1]))
            self.layers.append(nn.Dropout(p=dropout))
        self.layers.append(nn.Linear(layers[-1], output_size))
        self.layers[-1].bias.data.fill_(0)
        self.layers[-1].weight.data.uniform_(-0.1, 0.1)
        self.layers = torch.nn.ModuleList(self.layers)
        
    def forward(self, input):
        output = input
        for layer in self.layers:
            output = layer(output)
        return output

In [7]:
def trainIters(X, encoder, decoder, n_epochs, iter=1, start_epoch=1, local_save_every=1000, print_every=1000, plot_every=1000, save_every=5000, batch_size=64, lr=0.5, lambda_u=0.1):
    start = time.time()
    total_iter = 0
    best_loss = 99999

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=lr, weight_decay=lambda_u)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=lr, weight_decay=lambda_u)
    
    training_generator = batchify(X, batch_size)
    test_generator = batchify(X, batch_size, training=False)
    
    no_improv = 0
    
    for epoch in range(n_epochs):
        try:
            plot_losses = []
            print_loss_total = []  # Reset every print_every
            plot_loss_total = []  # Reset every plot_every
            save_loss_total = []

            encoder.train()
            decoder.train()

            for input_variable, target_variable in training_generator:
                if use_cuda:
                    input_variable = input_variable.cuda()
                    target_variable = target_variable.cuda()
                
                loss = model_wraper(input_variable, target_variable, 
                                                  encoder, encoder_optimizer, decoder, decoder_optimizer, 
                                                  train=True)
                print(loss)
                print_loss_total.append(loss)
                plot_loss_total.append(loss)
                save_loss_total.append(loss)

                if iter % print_every == 0 or iter == len(training_generator):
                    print_loss_avg = np.mean(print_loss_total)
                    print_loss_total = []
                    print('%s (%6.f %3.f%%) | Training loss: %.4f' % (timeSince(start, total_iter / len(batches) / n_epochs),
                                                 iter, iter / len(ttraining_generator) * 100, print_loss_avg))

#                 if iter % plot_every == 0:
#                     plot_loss_avg = np.mean(plot_loss_total)
#                     train_losses = load_loss(hp.name, 'train')
#                     train_losses.append(plot_loss_avg)
#                     save_loss(hp.name, 'train', train_losses)
#                     plot_loss_total = []
                iter += 1
                total_iter += 1

                if iter >= len(training_generator):
                    break
            encoder.eval()
            decoder.eval()

            iter=1
            print_loss_total = []
            t0 = time.time()
            for input_variable, target_variable in test_generator:
                batch_size = input_variable.size(0)
                
                if use_cuda:
                    input_variable = input_variable.cuda()
                    target_variable = target_variable.cuda()
                    
                loss = model_wraper(input_variable, target_variable, 
                                                  encoder, encoder_optimizer, decoder, decoder_optimizer, 
                                                  train=False)
                
                print_loss_total.append(loss)
                if iter % len(test_generator) == 0:
                    tf = time.time()
                    print_loss_avg = np.mean(print_loss_total)
                    print_loss_total = []
                    print('Validation loss: %2.4f | Validation ppl: %2.4f | Time/sample: %dms' % (print_loss_avg, np.exp(print_loss_avg), int((tf-t0)/len(test_generator)/batch_size*1000)))
                    
#                     valid_losses = load_loss(hp.name, 'valid')
#                     if len(valid_losses) < 2:
#                         save_model(hp.name, encoder, decoder, iter)
#                     else:
#                         if min(valid_losses[-2:]) < print_loss_avg :
#                             no_improv += 1
#                         else:
#                             no_improv = 0
#                             save_model(hp.name, encoder, decoder, iter)
#                             print('Model Saved')
#                     if no_improv > 1:
#                         lr = encoder_optimizer.param_groups[0]['lr']
#                         encoder_optimizer.param_groups[0]['lr'] = lr / 2
#                         decoder_optimizer.param_groups[0]['lr'] = lr / 2
#                         print('No Improvement for 2 epoch, dividing the learning rate by 2')
                    valid_losses.append(print_loss_avg)
                    save_loss(hp.name, 'valid', valid_losses)
                iter += 1
                total_iter += 1
            iter=1
            
            if epoch+start_epoch >= n_epochs:
                break
        except KeyboardInterrupt:
            print('User stopped training')
            break

In [8]:
def model_wraper(input_variable, target_variable, encoder, encoder_optimizer, decoder, decoder_optimizer, train=False):
    
    batch_size = input_variable.size(1)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    loss = 0
    
    encoder_outputs = encoder(input_variable)
    decoder_outputs = decoder(encoder_outputs)

    raw_loss = F.mse_loss(decoder_outputs, target_variable)

    loss = raw_loss
    
    loss /= batch_size
  
    if train:
        loss.backward()
        
#         torch.nn.utils.clip_grad_norm(encoder.parameters(), grad_clip)
        encoder_optimizer.step()
        
#         torch.nn.utils.clip_grad_norm(decoder.parameters(), grad_clip)
        decoder_optimizer.step()

    return raw_loss.data[0] 

In [9]:
encoder1 = encoder(input_size = processed_data.shape[1])
decoder1 = decoder(output_size = processed_data.shape[1])

if use_cuda:
    encoder1 = encoder1.cuda()
    decoder1 = decoder1.cuda()

In [None]:
import time
from torch.utils.data import Dataset, DataLoader

start_iter = 1
print('starting from',start_iter)
trainIters(processed_data, encoder1, decoder1, 10, iter=start_iter)

starting from 1




tensor(2.7567e-06, device='cuda:0')
tensor(2.3361e-06, device='cuda:0')
tensor(2.0445e-06, device='cuda:0')
tensor(1.5858e-06, device='cuda:0')
tensor(1.2359e-06, device='cuda:0')
tensor(1.0790e-06, device='cuda:0')
tensor(8.5497e-07, device='cuda:0')
tensor(7.0190e-07, device='cuda:0')
tensor(6.0803e-07, device='cuda:0')
tensor(4.8325e-07, device='cuda:0')
tensor(3.7776e-07, device='cuda:0')
tensor(3.2896e-07, device='cuda:0')
tensor(2.6589e-07, device='cuda:0')
tensor(2.0873e-07, device='cuda:0')
tensor(1.8120e-07, device='cuda:0')
tensor(1.3184e-07, device='cuda:0')
tensor(1.1424e-07, device='cuda:0')
tensor(1.0119e-07, device='cuda:0')
tensor(7.6113e-08, device='cuda:0')
tensor(6.5406e-08, device='cuda:0')
tensor(5.6753e-08, device='cuda:0')
tensor(4.7817e-08, device='cuda:0')
tensor(4.2576e-08, device='cuda:0')
tensor(3.6544e-08, device='cuda:0')
tensor(2.8435e-08, device='cuda:0')
tensor(2.5011e-08, device='cuda:0')
tensor(2.1137e-08, device='cuda:0')
tensor(1.9599e-08, device='c

tensor(7.1141e-09, device='cuda:0')
tensor(7.7283e-09, device='cuda:0')
tensor(9.9402e-09, device='cuda:0')
tensor(9.2609e-09, device='cuda:0')
tensor(8.3712e-09, device='cuda:0')
tensor(8.7692e-09, device='cuda:0')
tensor(8.8534e-09, device='cuda:0')
tensor(7.4413e-09, device='cuda:0')
tensor(7.4126e-09, device='cuda:0')
tensor(8.5319e-09, device='cuda:0')
tensor(9.3183e-09, device='cuda:0')
tensor(1.0344e-08, device='cuda:0')
tensor(9.2801e-09, device='cuda:0')
tensor(8.8056e-09, device='cuda:0')
tensor(8.4516e-09, device='cuda:0')
tensor(7.4394e-09, device='cuda:0')
tensor(1.0279e-08, device='cuda:0')
tensor(8.8266e-09, device='cuda:0')
tensor(7.1581e-09, device='cuda:0')
tensor(6.9017e-09, device='cuda:0')
tensor(1.0015e-08, device='cuda:0')
tensor(1.0308e-08, device='cuda:0')
tensor(9.9976e-09, device='cuda:0')
tensor(7.9847e-09, device='cuda:0')
tensor(9.6934e-09, device='cuda:0')
tensor(8.9127e-09, device='cuda:0')
tensor(1.0082e-08, device='cuda:0')
tensor(8.3578e-09, device='c

tensor(9.2724e-09, device='cuda:0')
tensor(8.3444e-09, device='cuda:0')
tensor(8.2258e-09, device='cuda:0')
tensor(8.3463e-09, device='cuda:0')
tensor(1.1180e-08, device='cuda:0')
tensor(8.5166e-09, device='cuda:0')
tensor(9.0830e-09, device='cuda:0')
tensor(6.4693e-09, device='cuda:0')
tensor(7.9330e-09, device='cuda:0')
tensor(8.7194e-09, device='cuda:0')
tensor(1.1075e-08, device='cuda:0')
tensor(7.1198e-09, device='cuda:0')
tensor(1.0141e-08, device='cuda:0')
tensor(9.5537e-09, device='cuda:0')
tensor(1.0137e-08, device='cuda:0')
tensor(1.4601e-08, device='cuda:0')
tensor(9.3891e-09, device='cuda:0')
tensor(8.7061e-09, device='cuda:0')
tensor(9.3279e-09, device='cuda:0')
tensor(8.6716e-09, device='cuda:0')
tensor(9.0581e-09, device='cuda:0')
tensor(6.7333e-09, device='cuda:0')
tensor(1.0508e-08, device='cuda:0')
tensor(8.0689e-09, device='cuda:0')
tensor(9.8503e-09, device='cuda:0')
tensor(8.9452e-09, device='cuda:0')
tensor(9.2437e-09, device='cuda:0')
tensor(1.0870e-08, device='c