In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
import numpy as np
from gcnn import GCNNmodel
from config import params
from GCNN_textfuncs import LMDataset_GCNN, SortishSampler_GCNN, SortSampler_GCNN, pad_collate_GCNN
from fastai.text import TextLMDataBunch
import time
import torch.nn as nn

from tools import make_paragraphs, make_tokens

In [3]:
params

{'epochs': 50,
 'emb_sz': 300,
 'k': 4,
 'nh': 600,
 'nl': 4,
 'downbot': 20,
 'batch_size': 20,
 'lr': 1,
 'mom': 0.95,
 'wd': 5e-05,
 'nesterov': True,
 'grad_clip': 0.07,
 'opttype': 'sgd',
 'use_gpu': True}

# Make data

In [4]:
DATAPATH = 'data/GCNN/'

#Preprocesamiento

#Leer data, notar que so ntextos largos
df_train = pd.read_csv(DATAPATH+'train.csv',header=None,names=['text'])
df_test = pd.read_csv(DATAPATH+'test.csv',header=None,names=['text'])

# textos

text_train = df_train.text.values
test_train = df_test.text.values


#Eliminar dataframes de memoria
try:
    del df_train, df_test
except:
    pass

## Se realizan párrafos para el entrenamiento

- Se corta el texto por saltos \n
- Se eliminan párrafos con largo menor a min_len y mayor a max_len
- Se agrega tag 'EOS' al final

In [5]:
train_paragraphs = make_paragraphs(text_train)
test_paragraphs = make_paragraphs(test_train)

In [6]:
train_paragraphs[:5]

['= = = = 2011 = = = =EOS',
 '= = = = 2012 = = = =EOS',
 '= = = = 2013 = = = =EOS',
 '= = = = 2015 = = = =EOS',
 '= = = = 2006 = = = =EOS']

In [7]:
df_train_par = pd.DataFrame({'text':train_paragraphs})
df_test_par = pd.DataFrame({'text':test_paragraphs})

#Etiquetas se agregan para cargar datos despues
df_train_par['labels'] = 0
df_test_par['labels'] = 0

df_train_par = df_train_par[['labels','text']]
df_test_par = df_test_par[['labels','text']]

In [8]:
df_train_par.to_csv('train_proc_par2.csv', header=False, index=False)
df_test_par.to_csv('test_proc_par2.csv', header=False, index=False)

In [9]:
df_train_par.shape

(17411, 2)

In [10]:
df_test_par.shape

(1864, 2)

# Cargar datos con fastai.text

In [11]:
data_lm = TextLMDataBunch.from_csv(path = '', csv_name='train_proc_par2.csv', test='test_proc_par2.csv')
itos=data_lm.train_ds.vocab.itos# the vocab
vs=len(itos)# vs is the length of the vocab

In [12]:
train_tokens=[data_lm.train_ds[i][0].data for i in range(len(data_lm.train_ds))]
test_tokens=[data_lm.valid_ds[i][0].data for i in range(len(data_lm.valid_ds))]

In [13]:
train_dataset = LMDataset_GCNN(train_tokens)
test_dataset = LMDataset_GCNN(test_tokens)

## Crear DataLoader y Samplers

Los siguientes pasos son muy importantes para poder entrenar el modelo.

- El Sampler se encarga de generar las muestras de manera correcta en el DataLoader
- El DataLoader se encarga de cargar bien los datos para poder manejarlos en memoria


In [14]:
#Samplers
train_sampler = SortishSampler_GCNN(data_length = len(train_dataset),key=lambda x:len(train_dataset[x][0]), bs = params['batch_size'])
test_sampler = SortSampler_GCNN(test_dataset,key = lambda x:len(test_dataset[x][0]))

#DataLoaders
train_loader = DataLoader(train_dataset, batch_size = params['batch_size'], collate_fn=pad_collate_GCNN, sampler = train_sampler, pin_memory= False)
test_loader = DataLoader(test_dataset, batch_size= params['batch_size'], collate_fn=pad_collate_GCNN, sampler = test_sampler, pin_memory= False)

# Embeddings

In [16]:
from fastai.text import collections

def make_embeddings(path, itos, embedding_size):
    """ A partir del path de un .txt con embeddings genera un vector numpy de embeddings
    """
    words = []
    word2idx = {}
    vectors = []
    idx = 0
    with open(path, 'rb') as file:
        for line in file:
            line = line.decode().split()
            word = line[0]
            words.append(word)
            word2idx[word] = idx
            idx += 1
            vectors.append(line[1:])
    
    #Verificamos que tengamos misma cantidad de palabras y vectores
    assert len(words) == len(vectors)
    
    #Generamos diccionario
    temporal_dictionary = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(words)})
    unk_row = vectors[-1] # vector default para <unk>
    #Creamos vector de tamaño número de palabras, por tamaño de embedding
    embeddings = np.zeros((len(itos), embedding_size), dtype=np.float32)
    
    #Vamos iterando por palabras del diccionario que creamos antes para generar vector de embeddings con la misma
    #correspondencia de indices
    for i, word in enumerate(itos):
        index = temporal_dictionary[word] #indice de palabra en diccionario temporal de embeddings cargados
        #Generamos el vector de embeddings 
        embeddings[i] = vectors[index] if index>=0 else unk_row #Si no lo encuentra asigna vector por defecto 
        
    return embeddings
        

In [17]:
path = 'data/GCNN/glove.6B.300d.txt'
embeddings = make_embeddings(path, itos, 300)

### Revisar embeddings

- Notemos que el largo del diccionario que creamos es igual al número de filas del vector de embeddings
- Se pueden hacer pruebas filtrando más o menos palabras con gensim dictionary

In [18]:
len(itos), embeddings.shape

(28160, (28160, 300))

In [21]:
vocab_size = len(itos)

In [19]:
#np.save('embeddings.npy', embeddings) #Guardar embeddings

# Model
Primero generamos el modelo, los párametros están en config.py

In [22]:
GCNN = GCNNmodel(vocab_size, params['emb_sz'], params['k'], params['nh'], params['nl'], params['downbot'])
#Si se quiere usar GPU hay que poner el modelo en GPU con .cuda()
if params['use_gpu']:
    GCNN.cuda()
    #Setear los embeddings de glove en la capa de embeddings
    GCNN.embed.weight.data = torch.FloatTensor(embeddings).cuda()
else:
    GCNN.model.embed.weight.data = torch.FloatTensor(embeddings)

In [23]:
GCNN

GCNNmodel(
  (embed): Embedding(28160, 300)
  (inlayer): GLUblock(
    (convresid): Conv2d(300, 600, kernel_size=(1, 1), stride=(1, 1))
    (leftpad): ConstantPad2d(padding=(0, 0, 3, 0), value=0)
    (convx1a): Conv2d(300, 15, kernel_size=(1, 1), stride=(1, 1))
    (convx2a): Conv2d(300, 15, kernel_size=(1, 1), stride=(1, 1))
    (convx1b): Conv2d(15, 15, kernel_size=(4, 1), stride=(1, 1))
    (convx2b): Conv2d(15, 15, kernel_size=(4, 1), stride=(1, 1))
    (convx1c): Conv2d(15, 600, kernel_size=(1, 1), stride=(1, 1))
    (convx2c): Conv2d(15, 600, kernel_size=(1, 1), stride=(1, 1))
  )
  (GLUlayers): Sequential(
    (0): GLUblock(
      (convresid): Conv2d(600, 600, kernel_size=(1, 1), stride=(1, 1))
      (leftpad): ConstantPad2d(padding=(0, 0, 3, 0), value=0)
      (convx1a): Conv2d(600, 30, kernel_size=(1, 1), stride=(1, 1))
      (convx2a): Conv2d(600, 30, kernel_size=(1, 1), stride=(1, 1))
      (convx1b): Conv2d(30, 30, kernel_size=(4, 1), stride=(1, 1))
      (convx2b): Conv2d(

## Entrenar

In [None]:
start_time = time.time()

train_loss_list = []; test_loss_list = []
for epoch in range(params['epochs']):
    for batch_idx, (data, target) in enumerate(train_loader):
        #Si queremos usar GPU hay que pasar los datos a GPU
        if params['use_gpu']:
            data.cuda()
            
        #Optimizer
        if params['opttype'] == 'sgd':
            optimizer = torch.optim.SGD(GCNN.parameters(), lr = params['lr'], momentum=params['mom'], weight_decay=params['wd'],
                                       nesterov= params['nesterov'])
        elif params['opttype'] == 'adam':
            optimizer = torch.optim.Adam(GCNN.parameters(), lr = params['lr'], betas = (params['mom'], 0.999))
        
        #Dejar los gradientes del optimizador en 0
        optimizer.zero_grad()
        
        
        #Forward pass
        output = GCNN(data)
        
        #Actualizar párametros
        loss = output.loss
        loss.backward()
        
        train_loss_list.append(loss.item())
        
        #Gradient clipping
        if params['grad_clip'] != 0:
            nn.utils.clip_grad_value_(GCNN.parameters(), params['grad_clip'])
            
        #Actualizar pesos
        optimizer.step()
        
        if batch_idx % 100 == 0:
            elapsed_time=time.time()-start_time
            print('Epoch: {}  Batches: {}  Loss: {}'.format(epoch, batch_idx, loss.item()))
    
    #Verificamos ahora en el test set
    val_loss=[]
    
    #Gradientes no se actualizan y modelo en modo evaluacion
    with torch.no_grad():
        GCNN.eval()
        
        for batch_idx, (data, target) in enumerate(test_loader):
            optimizer.zero_grad()
            
            #Forward
            output = GCNN(data)
            loss = output.loss
            output.output
            val_loss.append(loss.data.item())
        
    
    #Modelo en modo entrenamiento
    GCNN.train()
    ave_val_loss=sum(val_loss) / len(val_loss)
    val_update_string='Validation Loss: {:.4f}\tPerp: {:.4f}'.format(ave_val_loss,np.exp(ave_val_loss))
    print(val_update_string)


Epoch: 0  Batches: 0  Loss: 6.711717128753662
Epoch: 0  Batches: 100  Loss: 7.617319583892822
Epoch: 0  Batches: 200  Loss: 6.950686931610107
Epoch: 0  Batches: 300  Loss: 7.048349857330322
Epoch: 0  Batches: 400  Loss: 3.3203938007354736
Epoch: 0  Batches: 500  Loss: 5.501758575439453
Epoch: 0  Batches: 600  Loss: 4.091529846191406
Validation Loss: 6.6916	Perp: 805.6401
Epoch: 1  Batches: 0  Loss: 5.927438259124756
Epoch: 1  Batches: 100  Loss: 2.3362200260162354
Epoch: 1  Batches: 200  Loss: 5.40895938873291
Epoch: 1  Batches: 300  Loss: 4.105952262878418
Epoch: 1  Batches: 400  Loss: 5.823309898376465
