# 1. Librerías

[RUNME] Install Colab TPU compatible PyTorch/TPU wheels and dependencies

In [None]:
import collections
from datetime import datetime, timedelta
import os
import requests
import threading

# Pair of version tags for one release: the tag used in the wheel file names
# and the tag requested from the server-side XRT runtime.
_VersionConfig = collections.namedtuple('_VersionConfig', ['wheels', 'server'])
VERSION = "xrt==1.15.0"  #@param ["xrt==1.15.0", "torch_xla==nightly"]
CONFIG = {
    'xrt==1.15.0': _VersionConfig(wheels='1.15', server='1.15.0'),
    # The nightly server tag carries yesterday's date formatted as YYYYMMDD.
    'torch_xla==nightly': _VersionConfig(
        wheels='nightly',
        server='XRT-dev' + (datetime.today() - timedelta(days=1)).strftime('%Y%m%d'),
    ),
}[VERSION]
# Bucket the wheels are copied from, and the wheel file names for this release
DIST_BUCKET = 'gs://tpu-pytorch/wheels'
TORCH_WHEEL = f'torch-{CONFIG.wheels}-cp36-cp36m-linux_x86_64.whl'
TORCH_XLA_WHEEL = f'torch_xla-{CONFIG.wheels}-cp36-cp36m-linux_x86_64.whl'
TORCHVISION_WHEEL = f'torchvision-{CONFIG.wheels}-cp36-cp36m-linux_x86_64.whl'

# Update TPU XRT version
def update_server_xrt():
  """Request that the TPU host switch its XRT version to ``CONFIG.server``.

  The host is taken from the ``COLAB_TPU_ADDR`` environment variable (the part
  before the first ``:``). A POST is sent to its ``requestversion`` endpoint on
  port 8475, and the response object is printed.

  Raises:
    KeyError: if ``COLAB_TPU_ADDR`` is not set in the environment.
  """
  print('Updating server-side XRT to {} ...'.format(CONFIG.server))
  tpu_host = os.environ['COLAB_TPU_ADDR'].split(':')[0]
  url = 'http://{TPU_ADDRESS}:8475/requestversion/{XRT_VERSION}'.format(
      TPU_ADDRESS=tpu_host, XRT_VERSION=CONFIG.server)
  response = requests.post(url)
  print('Done updating server-side XRT: {}'.format(response))

# Run the server-side XRT update in a background thread so it overlaps with
# the wheel downloads and installs below.
update = threading.Thread(target=update_server_xrt)
update.start()

# Install Colab TPU compat PyTorch/TPU wheels and dependencies
# Remove the preinstalled torch / torchvision first.
!pip uninstall -y torch torchvision
# Copy the three wheels from the distribution bucket into the current directory.
!gsutil cp "$DIST_BUCKET/$TORCH_WHEEL" .
!gsutil cp "$DIST_BUCKET/$TORCH_XLA_WHEEL" .
!gsutil cp "$DIST_BUCKET/$TORCHVISION_WHEEL" .
# Install the downloaded wheels.
!pip install "$TORCH_WHEEL"
!pip install "$TORCH_XLA_WHEEL"
!pip install "$TORCHVISION_WHEEL"
!sudo apt-get install libomp5
# Wait for the background XRT update to finish before leaving the cell.
update.join()

In [1]:
# Clone the project repository from GitHub
!git clone https://github.com/Camiloez/nlp_project.git

In [None]:
# Set the working directory to the project's code folder
import os

project_dir = 'nlp_project/codigo_proyecto'
# The path is relative, so running this cell a second time (when the kernel is
# already inside the folder) would raise FileNotFoundError. Only change
# directory when we are not there yet, which makes the cell safe to re-run.
if not os.path.normpath(os.getcwd()).endswith(os.path.normpath(project_dir)):
    os.chdir(project_dir)

In [None]:
import os
import shutil
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch_xla
import torch_xla.core.xla_model as xm
from torch.utils.data import DataLoader
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from fastai.text import TextLMDataBunch, collections

from gcnn import GCNNmodel
from config import params
from GCNN_textfuncs import LMDataset_GCNN, SortishSampler_GCNN, SortSampler_GCNN, pad_collate_GCNN
from tools import make_paragraphs, make_tokens

In [None]:
# XLA device handle obtained from torch_xla
device = xm.xla_device()

In [3]:
# Display the hyperparameter dictionary imported from config.py
params

{'epochs': 50,
 'emb_sz': 300,
 'k': 4,
 'nh': 600,
 'nl': 4,
 'downbot': 20,
 'batch_size': 20,
 'lr': 1,
 'mom': 0.95,
 'wd': 5e-05,
 'nesterov': True,
 'grad_clip': 0.07,
 'opttype': 'sgd',
 'use_gpu': True}

# 2. WikiText 103 dataset processing <a class="anchor" id="3"></a>

Leemos los archivos a partir de un .zip que contiene tres archivos planos: train, valid y test. De cada archivo se extraen los documentos, que se guardan en una lista dentro de un diccionario.

In [None]:
# Mount the Google Drive that holds the dataset and the embeddings
from google.colab import drive
drive.mount('/content/drive/')

In [4]:
# Unpack the archive and load every dataset file into a dictionary
shutil.unpack_archive('../wikitext-103-v1.zip', extract_dir='dataset')
working_dir = os.path.join(os.getcwd(), 'dataset', 'wikitext-103')
wikitext_files = os.listdir(working_dir)

wiki = {}
for wikitext_file in wikitext_files:
    # The dictionary key is the second dot-separated component of the file name
    name = wikitext_file.split('.')[1]
    with open(os.path.join(working_dir, wikitext_file), encoding='utf-8') as data_file:
        # list of str (untokenized dataset): keep only lines that are at least
        # 3 characters long and whose second character is not '=' (this drops
        # empty lines and headers), and append the end symbol to each one
        wiki[name] = [
            line.strip() + ' </s>'
            for line in data_file
            if len(line) >= 3 and line[1] != '='
        ]
# The extracted files are no longer needed once they are in memory
shutil.rmtree('dataset')

In [5]:
# Export every split as a headerless, '|'-separated CSV with columns (labels, text)
split_texts = {split_name: wiki[split_name] for split_name in ('train', 'valid', 'test')}

for split_name, texts in split_texts.items():
    # A constant 0 label column is added so the data can be loaded later
    split_frame = pd.DataFrame({'text': texts}).assign(labels=0)
    split_frame[['labels', 'text']].to_csv(
        f'{split_name}.csv', header=False, index=False, sep='|')

# Free RAM
del wiki, split_texts, texts, split_frame

## Cargar datos con fastai.text

In [6]:
# Build the fastai language-model data bunch from the exported CSV files.
# NOTE(review): valid.csv is not passed here, so `valid_ds` presumably comes
# from fastai's own split of train.csv — confirm this is intended.
data_lm = TextLMDataBunch.from_csv(
    path='',
    csv_name='train.csv',
    test='test.csv',
    delimiter='|',
    max_vocab=300000,
    min_freq=0,
)
# Index-to-string vocabulary of the training set, and its size
itos = data_lm.train_ds.vocab.itos
vocab_size = len(itos)

In [7]:
def _extract_tokens(dataset):
    """Return ``dataset[i][0].data`` for every index ``i`` of ``dataset``, as a list."""
    return [dataset[position][0].data for position in range(len(dataset))]


# The "test" tokens are read from the data bunch's `valid_ds`
train_tokens = _extract_tokens(data_lm.train_ds)
test_tokens = _extract_tokens(data_lm.valid_ds)

# Wrap the token lists in the project's dataset class
train_dataset = LMDataset_GCNN(train_tokens)
test_dataset = LMDataset_GCNN(test_tokens)

## Crear DataLoader y Samplers

Los siguientes pasos son muy importantes para poder entrenar el modelo.

- El Sampler se encarga de generar las muestras de manera correcta en el DataLoader
- El DataLoader se encarga de cargar bien los datos para poder manejarlos en memoria


In [8]:
# Samplers: the key handed to each sampler is the length of an example's first element
train_sampler = SortishSampler_GCNN(
    data_length=len(train_dataset),
    key=lambda x: len(train_dataset[x][0]),
    bs=params['batch_size'],
)
test_sampler = SortSampler_GCNN(
    test_dataset,
    key=lambda x: len(test_dataset[x][0]),
)

# DataLoaders: same batch size and collate function for both splits
train_loader = DataLoader(
    train_dataset,
    batch_size=params['batch_size'],
    sampler=train_sampler,
    collate_fn=pad_collate_GCNN,
    pin_memory=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=params['batch_size'],
    sampler=test_sampler,
    collate_fn=pad_collate_GCNN,
    pin_memory=False,
)

# Embeddings

Cargamos los embeddings

In [9]:
def make_embeddings(path, itos, embedding_size):
    """Build the embedding matrix for a vocabulary from a text embeddings file.

    Every non-blank line of the file is split on whitespace; the first field is
    the word and the remaining fields are the components of its vector. Blank
    lines are skipped. If a word appears more than once, its last occurrence is
    used.

    Args:
        path: Path of the embeddings text file (decoded as UTF-8).
        itos: Sequence of vocabulary words; row ``i`` of the result belongs to
            ``itos[i]``.
        embedding_size: Number of components every vector must have.

    Returns:
        ``np.float32`` array of shape ``(len(itos), embedding_size)``. Words
        that are not in the file receive the vector found on the last
        non-blank line of the file.

    Raises:
        ValueError: if the file contains no vectors, or if a vector that is
            needed does not have ``embedding_size`` numeric components.
    """
    # Only vectors of words that are in the vocabulary are kept, instead of
    # holding the whole file in memory as lists of strings.
    wanted = set(itos)
    selected = {}
    last_values = None
    with open(path, 'rb') as file:
        for raw_line in file:
            fields = raw_line.decode().split()
            if not fields:
                # Blank line: nothing to read
                continue
            last_values = fields[1:]
            if fields[0] in wanted:
                selected[fields[0]] = last_values

    if last_values is None:
        raise ValueError('No embeddings found in {!r}'.format(path))

    def to_row(word, values):
        # Convert the raw string components into a float32 vector
        if len(values) != embedding_size:
            raise ValueError(
                'Embedding for {!r} has {} components, expected {}'.format(
                    word, len(values), embedding_size))
        return np.asarray(values, dtype=np.float32)

    # Matrix of size (number of words) x (embedding size)
    embeddings = np.zeros((len(itos), embedding_size), dtype=np.float32)

    # Fill the matrix row by row so that row i corresponds to itos[i]
    unk_row = None  # default vector, converted only if some word is missing
    for i, word in enumerate(itos):
        values = selected.get(word)
        if values is None:
            if unk_row is None:
                unk_row = to_row('<last line of file>', last_values)
            embeddings[i] = unk_row
        else:
            embeddings[i] = to_row(word, values)

    return embeddings
        

In [10]:
# Embeddings file; the embedding size is read from `params` (instead of a
# repeated literal 300) so it always agrees with the size given to the model.
path = 'glove.6B.300d.txt'
embeddings = make_embeddings(path, itos, params['emb_sz'])

# Model
Primero generamos el modelo; los parámetros están en config.py

In [11]:
# Build the model from the hyperparameters in `params`
GCNN = GCNNmodel(vocab_size, params['emb_sz'], params['k'], params['nh'], params['nl'], params['downbot'])
# Replace the embedding layer's weights with the `embeddings` matrix, placed on the XLA device
# NOTE(review): only this tensor is moved to the XLA device here — confirm the rest of the model is on the device before the first forward pass
GCNN.model.embed.weight.data = torch.FloatTensor(embeddings).to(xm.xla_device())

In [12]:
# Display the model's layers
GCNN

GCNNmodel(
  (embed): Embedding(280072, 300)
  (inlayer): GLUblock(
    (convresid): Conv2d(300, 600, kernel_size=(1, 1), stride=(1, 1))
    (leftpad): ConstantPad2d(padding=(0, 0, 3, 0), value=0)
    (convx1a): Conv2d(300, 15, kernel_size=(1, 1), stride=(1, 1))
    (convx2a): Conv2d(300, 15, kernel_size=(1, 1), stride=(1, 1))
    (convx1b): Conv2d(15, 15, kernel_size=(4, 1), stride=(1, 1))
    (convx2b): Conv2d(15, 15, kernel_size=(4, 1), stride=(1, 1))
    (convx1c): Conv2d(15, 600, kernel_size=(1, 1), stride=(1, 1))
    (convx2c): Conv2d(15, 600, kernel_size=(1, 1), stride=(1, 1))
  )
  (GLUlayers): Sequential(
    (0): GLUblock(
      (convresid): Conv2d(600, 600, kernel_size=(1, 1), stride=(1, 1))
      (leftpad): ConstantPad2d(padding=(0, 0, 3, 0), value=0)
      (convx1a): Conv2d(600, 30, kernel_size=(1, 1), stride=(1, 1))
      (convx2a): Conv2d(600, 30, kernel_size=(1, 1), stride=(1, 1))
      (convx1b): Conv2d(30, 30, kernel_size=(4, 1), stride=(1, 1))
      (convx2b): Conv2d

## Entrenar

In [None]:
# Move the model to the device first, then build the optimizer ONCE.
# Creating a new optimizer on every batch discards its internal state
# (momentum buffers for SGD, moment estimates for Adam), so momentum would
# never accumulate across steps. The model is moved before the optimizer is
# built so the optimizer holds the parameters that are actually trained.
GCNN.to(device)
if params['opttype'] == 'sgd':
    optimizer = torch.optim.SGD(GCNN.parameters(), lr=params['lr'], momentum=params['mom'],
                                weight_decay=params['wd'], nesterov=params['nesterov'])
elif params['opttype'] == 'adam':
    optimizer = torch.optim.Adam(GCNN.parameters(), lr=params['lr'], betas=(params['mom'], 0.999))
else:
    # Fail with a clear message instead of a NameError on the first batch
    raise ValueError("Unsupported params['opttype']: {!r} (expected 'sgd' or 'adam')".format(params['opttype']))

# Write the training log to a .txt file
with open('/content/drive/My Drive/data/log.txt', 'w') as f:
    start_time = time.time()
    train_loss_list = []  # one training loss per batch
    test_loss_list = []   # one average evaluation loss per epoch
    for epoch in range(params['epochs']):
        for batch_idx, (data, target) in enumerate(train_loader):
            # Reset the gradients held by the optimizer
            optimizer.zero_grad()

            # Forward pass
            output = GCNN(data.to(device))

            # Backward pass
            loss = output.loss
            loss.backward()

            loss_value = loss.item()
            train_loss_list.append(loss_value)

            # Gradient clipping (disabled when 'grad_clip' is 0)
            if params['grad_clip'] != 0:
                nn.utils.clip_grad_value_(GCNN.parameters(), params['grad_clip'])

            # Update the weights
            xm.optimizer_step(optimizer, barrier=True)

            # Report progress every 10000 batches
            if batch_idx % 10000 == 0:
                elapsed_time = time.time() - start_time
                batch_info = 'Epoch: {}  Batches: {}  Loss: {} Time: {}s'.format(epoch, batch_idx, loss_value, elapsed_time)
                print(batch_info)
                f.write(batch_info + '\n')

        # Evaluate on the held-out loader at the end of every epoch
        val_loss = []

        # No gradients are tracked and the model is in evaluation mode
        with torch.no_grad():
            GCNN.eval()

            for batch_idx, (data, target) in enumerate(test_loader):
                # Forward pass only
                output = GCNN(data.to(device))
                val_loss.append(output.loss.item())

        # Back to training mode for the next epoch
        GCNN.train()
        ave_val_loss = sum(val_loss) / len(val_loss)
        test_loss_list.append(ave_val_loss)
        val_update_string = 'Validation Loss: {:.4f}\tPerp: {:.4f}'.format(ave_val_loss, np.exp(ave_val_loss))
        print(val_update_string + '\n')
        # Terminate the line so log entries do not run together in the file
        f.write(val_update_string + '\n')

    end_time = time.time()
    total_time = f'Total time: {end_time-start_time}s'
    f.write(total_time + '\n')