# Ejercicio 4

In [None]:
# Mount Google Drive under /content/drive/ so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Root folder on the mounted Drive; get_politician_data() looks for '<root>/data/<legislatura>/data' below it.
path_gz_data = '/content/drive/My Drive/prueba ejercicio 4/exercise_4' 

## 1. Cargando los datos a usar

En este apartado prepararemos los datos que vamos a usar para entrenar nuestro modelo. El objetivo es conseguir:

* Obtener datos para entrenamiento -> txt con intervenciones
* Diccionario para transformar los caracteres usados en números -> json

In [None]:
import os
import pandas as pd
import json

from datetime import datetime

In [None]:
# Legislature identifiers; each one is used as a directory name when the data files are collected.
legislaturas = ['x', 'xi', 'xii']
# Text searched for in the 'name' column to select one politician; also used to name the output files.
name = 'Rajoy Brey'

In [None]:
# Output folders: the training corpus goes to path_train_data, while the
# vocabulary and model checkpoints go to path_download.
path_train_data = 'data/train'
path_download = 'download'

# exist_ok=True creates missing folders and leaves existing ones alone,
# without the race window of a separate "exists?" check.
for directory in [path_train_data, path_download]:
    os.makedirs(directory, exist_ok=True)

### 1.1 Obtener datos para entrenamiento -> txt con intervenciones

In [None]:
def get_politician_data(legistaturas_avaliables, politician_name, base_path=None):
    """Collect every intervention of one politician across several legislatures.

    Every ``*.gz`` file found in ``<base_path>/data/<legislatura>/data`` is
    loaded with ``pd.read_pickle``. The file name (``DD_MM_YYYY.gz``) provides
    the value of a ``date`` column added to each loaded frame. All frames are
    concatenated and filtered on the ``name`` column.

    Args:
        legistaturas_avaliables (list[str]): legislature directory names to scan.
        politician_name (str): pattern looked up in the ``name`` column (passed
            to ``Series.str.contains``); rows matching either the pattern as
            given or its upper-cased form are kept.
        base_path (str | None): root data folder. Defaults to the module-level
            ``path_gz_data`` when omitted.

    Returns:
        pd.DataFrame: the matching rows sorted by ``date`` with a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: if no ``.gz`` file exists for the given legislatures.
    """
    if base_path is None:
        base_path = path_gz_data

    gz_paths = []
    for legislatura in legistaturas_avaliables:
        gz_dir_path = f'{base_path}/data/{legislatura}/data'
        gz_paths += [
            os.path.join(gz_dir_path, f)
            for f in os.listdir(gz_dir_path)
            if f.endswith('.gz') and os.path.isfile(os.path.join(gz_dir_path, f))
        ]

    if not gz_paths:
        raise FileNotFoundError(
            f'No .gz data files found under {base_path}/data for legislatures '
            f'{list(legistaturas_avaliables)}')

    frames = []
    for file in gz_paths:
        # NOTE(security): unpickling executes arbitrary code; only load files from a trusted source.
        df_aux = pd.read_pickle(file)
        # The file name without its '.gz' suffix is the session date.
        date_token = os.path.basename(file)[:-len('.gz')]
        df_aux['date'] = datetime.strptime(date_token, '%d_%m_%Y').date()
        frames.append(df_aux)

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0 and quadratic).
    data_joined = pd.concat(frames, sort=False)

    names = data_joined['name']
    # na=False keeps rows with a missing name out of the mask instead of breaking it.
    mask = (names.str.contains(politician_name, na=False)
            | names.str.contains(politician_name.upper(), na=False))

    data_politician = data_joined[mask].sort_values(by='date').reset_index(drop=True)
    print(len(data_politician))

    return data_politician

Elegimos nuestro político favorito y obtenemos el String con todas sus intervenciones juntas:

In [None]:
data_politician = get_politician_data(legislaturas, name)

# These fixed phrases are removed from every intervention before it joins the corpus.
boilerplate = ('(Aplausos).', '(Rumores).', 'Muchas gracias.')

# Collect the cleaned interventions and join once; repeated `+=` on a growing
# string copies the whole corpus on each iteration.
cleaned_interventions = []
for intervention in data_politician['text']:
    for phrase in boilerplate:
        intervention = intervention.replace(phrase, '')
    cleaned_interventions.append(intervention.strip() + '\n')

text_politician = ''.join(cleaned_interventions)

Escribimos el resultado en un txt

In [None]:
# Write the corpus to '<path_train_data>/<name>.txt'. The context manager
# closes the file even when the write raises.
with open(f'{path_train_data}/{name}.txt', 'w') as text_file:
    text_file.write(text_politician)

### 1.2 Diccionario para transformar los caracteres usados en números -> json

Crear un diccionario con:
    chars: lista del conjunto (set) de las letras y caracteres encontrados en el texto anterior.
    int2char: diccionario cuya clave es un entero y cuyo valor es la letra correspondiente.
    char2int: int2char al revés: las claves son los valores y los valores son las claves.
    
Por ejemplo si tenemos el texto para el entrenamiento "hola ¿qué tal?" el resultado sería el siguiente:
    
{'chars': ('a', 'é', '¿', 'h', 't', 'l', 'q', ' ', 'u', 'o', '?'), 'int2char': {0: 'a', 1: 'é', 2: '¿', 3: 'h', 4: 't', 5: 'l', 6: 'q', 7: ' ', 8: 'u', 9: 'o', 10: '?'}, 'char2int': {'a': 0, 'é': 1, '¿': 2, 'h': 3, 't': 4, 'l': 5, 'q': 6, ' ': 7, 'u': 8, 'o': 9, '?': 10} }


In [None]:
# Build the character vocabulary from the training text.
# Sorting the character set makes the index assignment reproducible across runs.
chars = sorted(set(text_politician))
int2char = dict(enumerate(chars))
char2int = {char: index for index, char in int2char.items()}

vocabulary = {'chars': chars, 'int2char': int2char, 'char2int': char2int}

# main() reads this exact file name. json.dump stores the integer keys of
# int2char as strings, which is why predict() looks characters up with str(index).
with open(f'{path_download}/{name}_vocabulary.json', 'w') as js_output:
    json.dump(vocabulary, js_output)

## 2. Creación y entrenamiento del modelo

In [None]:
import logging
import pandas as pd
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
import torch.utils.data as utils
from torch.autograd import Variable

import json


# Configure root logging at INFO level with timestamp, logger name and level in each record;
# logger.debug(...) calls are therefore not shown with this configuration.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
# Logger used by the cells below.
logger = logging.getLogger('text_generator')

In [None]:
class GeneratorTextLSTM(nn.Module):
    """Character-level text generator: stacked LSTM, dropout and a linear output layer.

    The LSTM input width and the output layer width are both ``len(chars)``,
    i.e. the network consumes one-hot encoded characters and emits one score
    per vocabulary character.
    """

    def __init__(self, data_input, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        """Build the network from a vocabulary dictionary.

        Args:
            data_input (dict): vocabulary with the keys ``'chars'``,
                ``'int2char'`` and ``'char2int'``.
            n_hidden (int): size of the LSTM hidden state.
            n_layers (int): number of stacked LSTM layers.
            drop_prob (float): dropout probability used between LSTM layers
                and on the LSTM output.
            lr (float): learning rate, stored on the instance only.
        """
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Vocabulary lookups taken from the loaded vocabulary dictionary
        self.chars = data_input['chars']
        self.int2char = data_input['int2char']
        self.char2int = data_input['char2int']

        # define the LSTM (batch_first=True: inputs are batch x sequence x features)
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        # define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        # define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: input tensor fed to the LSTM.
            hidden: tuple with the hidden state and the cell state.

        Returns:
            tuple: ``(out, hidden)`` where ``out`` has one row per time step of
            every sequence (flattened to ``-1 x len(chars)``) and ``hidden`` is
            the updated LSTM state.
        """
        # Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)

        # pass through a dropout layer
        out = self.dropout(r_output)

        # Flatten batch and time into a single axis so the linear layer scores
        # each step; contiguous() is needed before view()
        out = out.contiguous().view(-1, self.n_hidden)

        # put the flattened LSTM output through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed hidden and cell states.

        Args:
            batch_size (int): number of sequences processed in parallel.

        Returns:
            tuple: two zero tensors of size ``n_layers x batch_size x n_hidden``.
        """
        # new_zeros inherits dtype and device from the model's own parameters,
        # so the state always lives next to the weights and no global device is needed.
        weight = next(self.parameters())

        hidden = (weight.new_zeros(self.n_layers, batch_size, self.n_hidden),
                  weight.new_zeros(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [None]:
def load_data(path_file):
    """Read a text file completely and return its content.

        Args:
            path_file (str): location of the txt file to read
        Returns:
            str: the full content of the file
    """
    with open(path_file, 'r') as handle:
        return handle.read()


def one_hot_encode(array_text, n_labels):
    """One-hot encode an integer array along a new trailing axis.

       Args:
           array_text (np.array): integer-encoded text, any number of dimensions
           n_labels (int): number of distinct labels (width of the new axis)
       Returns:
           one_hot (np.array): float32 array of shape ``(*array_text.shape, n_labels)``
               holding 1.0 at each element's label index and 0.0 elsewhere
    """
    # One row per element of the input, whatever its shape
    # (np.multiply(*shape) only worked for exactly two dimensions).
    n_items = array_text.size
    one_hot = np.zeros((n_items, n_labels), dtype=np.float32)

    # Fill the appropriate elements with ones
    one_hot[np.arange(n_items), array_text.ravel()] = 1.

    # Finally reshape it to get back to the original layout plus the label axis
    return one_hot.reshape((*array_text.shape, n_labels))


def get_batches(array_text, batch_size, seq_length):
    """Generate (input, target) windows of size batch_size x seq_length from array_text.

        Each target is its input shifted one position to the left; the last
        target column is the column that follows the window, wrapping around
        to the first column for the final window.

        Args:
            array_text: Array you want to make batches from
            batch_size: Batch size, the number of sequences per batch
            seq_length: Number of encoded chars in a sequence
    """
    chars_per_batch = batch_size * seq_length
    full_batches = len(array_text) // chars_per_batch

    # Drop the tail that cannot fill a whole batch, then lay the text out as batch_size rows
    grid = array_text[:full_batches * chars_per_batch].reshape((batch_size, -1))
    n_cols = grid.shape[1]

    for start in range(0, n_cols, seq_length):
        stop = start + seq_length

        # The features
        x = grid[:, start:stop]

        # The targets, shifted by one
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # Past the last column there is no "next" character: wrap around to column 0
        following = stop if stop < n_cols else 0
        y[:, -1] = grid[:, following]

        yield x, y



In [None]:
def validation(model, test_loader, criterion):
    """Accumulate the loss and a rounding-based accuracy over every batch of a loader.

    NOTE(review): ``model.forward`` is called here with a single argument, while
    ``GeneratorTextLSTM.forward`` in this file also takes ``hidden``; this helper
    looks written for a different model and no call to it is visible in this file.
    It also reads the module-level ``device``.

    Args:
        model: network whose ``forward`` accepts one batch of inputs.
        test_loader: iterable yielding ``(characteristics, labels)`` pairs.
        criterion: loss callable applied to ``(output, labels)``.

    Returns:
        tuple: ``(test_loss, accuracy)``, both summed over the batches
        (neither value is divided by the number of batches).
    """
    accuracy = 0
    test_loss = 0
    for characteristics, labels in test_loader:
        # Wrap the batch tensors and move them to the configured device
        characteristics = Variable(characteristics, requires_grad=True).to(device)
        labels = Variable(labels, requires_grad=False).to(device)

        output = model.forward(characteristics)
        test_loss += criterion(output, labels).item()

        # Calculating the accuracy: a prediction counts as correct when the
        # rounded output equals the label
        ps = output
        equality = (labels.data == torch.round(ps))
        accuracy += equality.type_as(torch.FloatTensor()).mean()

    return test_loss, accuracy


def train(model, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    """Train the network and save its weights whenever the validation loss improves.

        The last ``val_frac`` share of ``data`` is held out for validation.
        Every ``print_every`` optimisation steps the validation loss is
        computed and printed, and the model state is written to
        ``<path_download>/text_generator_<epochs>_epoch.pt`` if that loss is
        the lowest seen so far.

        Args:
            model: CharRNN network
            data: integer-encoded text (np.array) to train the network
            epochs: Number of epochs to train
            batch_size: Number of mini-sequences per mini-batch, aka batch size
            seq_length: Number of character steps per mini-batch
            lr: learning rate
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: Number of steps for printing training and validation loss
    """
    model.to(device)

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data (validation is the tail of the text)
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    n_chars = len(model.chars)

    # Lowest validation loss seen so far (np.inf: the np.Inf alias was removed in NumPy 2.0)
    valid_loss_min = np.inf

    for e in range(epochs):
        # Model in training mode, dropout is on
        model.train()

        # initialize hidden state
        h = model.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x).to(device), torch.from_numpy(y).to(device)

            # Detach the hidden state, otherwise we'd backprop through the
            # entire training history
            h = tuple(each.detach() for each in h)

            # zero accumulated gradients
            model.zero_grad()

            # get the output from the model
            output, h = model(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.reshape(-1).long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every != 0:
                continue

            # Get validation loss; no_grad avoids building autograd graphs
            # that are never back-propagated
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            with torch.no_grad():
                for val_x, val_y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    val_x = one_hot_encode(val_x, n_chars)
                    val_inputs = torch.from_numpy(val_x).to(device)
                    val_targets = torch.from_numpy(val_y).to(device)

                    val_h = tuple(each.detach() for each in val_h)

                    val_output, val_h = model(val_inputs, val_h)
                    val_loss = criterion(val_output, val_targets.reshape(-1).long())

                    val_losses.append(val_loss.item())

            # With too little held-out text no validation batch exists; report
            # nan explicitly instead of letting np.mean([]) emit a warning
            valid_loss = np.mean(val_losses) if val_losses else float('nan')

            if valid_loss <= valid_loss_min:
                print('  > Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,
                                                                                                    valid_loss))
                torch.save(model.state_dict(), f'{path_download}/text_generator_{epochs}_epoch.pt')
                valid_loss_min = valid_loss

            model.train()  # reset to train mode after iteration through validation data

            print("Epoch: {}/{}...".format(e + 1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}".format(valid_loss))


def predict(model, char, h=None, top_k=None):
    """ Given a character, predict the next character.

        Args:
            model: trained network exposing ``chars``, ``char2int`` and ``int2char``
            char (str): current character; must be a key of ``model.char2int``
            h: hidden state tuple; a zero state for one sequence is created when omitted
            top_k (int): if given, sample only among the ``top_k`` most likely characters

        Returns:
            tuple: the predicted character and the new hidden state.
    """
    # Keep the input on whatever device the model's weights live on
    model_device = next(model.parameters()).device

    # tensor inputs: a batch of one sequence with one one-hot encoded character
    x = np.array([[model.char2int[char]]])
    x = one_hot_encode(x, len(model.chars))
    inputs = torch.from_numpy(x).to(model_device)

    # The signature allows h=None, so start from a zero state in that case
    if h is None:
        h = model.init_hidden(1)

    # detach hidden state from history
    h = tuple(each.detach() for each in h)

    # get the output of the model (inference only, no autograd graph needed)
    with torch.no_grad():
        out, h = model(inputs, h)

    # get the character probabilities
    p = F.softmax(out, dim=1).cpu()

    # get top characters
    if top_k is None:
        top_ch = np.arange(len(model.chars))
    else:
        p, top_ch = p.topk(top_k)
        # reshape(-1) keeps a 1-D array even for top_k=1 (squeeze() would give
        # a 0-d array, which np.random.choice rejects)
        top_ch = top_ch.numpy().reshape(-1)

    # select the likely next character with some element of randomness
    p = p.numpy().reshape(-1)
    char = np.random.choice(top_ch, p=p / p.sum())

    # int2char is indexed with the string form of the index
    return model.int2char[str(char)], h


def sample(model, size, prime='The', top_k=None):
    """Generate text by feeding the model its own predictions.

        Args:
            model: trained network
            size (int): number of prediction steps run after the prime text
            prime (str): non-empty seed text that warms up the hidden state
            top_k (int): forwarded to ``predict`` to restrict sampling

        Returns:
            str: the prime text, the character predicted right after it, and
            ``size`` further predicted characters.

        Raises:
            ValueError: if ``prime`` is empty (there is nothing to predict from).
    """
    if not prime:
        raise ValueError('prime must contain at least one character')

    model.to(device)
    model.eval()  # eval mode

    chars = list(prime)
    h = model.init_hidden(1)

    with torch.no_grad():
        # First off, run through the prime characters; only the prediction
        # made after the last one is kept
        for ch in prime:
            char, h = predict(model, ch, h, top_k=top_k)

        chars.append(char)

        # Now pass in the previous character and get a new one
        for _ in range(size):
            char, h = predict(model, chars[-1], h, top_k=top_k)
            chars.append(char)

    return ''.join(chars)

In [None]:
def main(data_txt, n_epochs=10):
    """Run the whole pipeline: load text and vocabulary, train, print two samples.

        Args:
            data_txt (str): path of the txt file with the training text
            n_epochs (int): number of training epochs; the default matches
                the default of ``train``
    """
    logger.info('Cargando datos...')
    text_data = load_data(data_txt)
    logger.debug('Datos cargados de ' + str(data_txt) + ' ejemplo: ' + str(text_data[:300]) + '\n')

    logger.info('Cargando datos')
    with open(f'{path_download}/{name}_vocabulary.json', 'r') as js_input:
        data_input = json.load(js_input)
    logger.debug('Datos cargados -> ' + str(data_input))

    # Encode the text: map every character to its integer id with the loaded char2int
    char2int = data_input['char2int']
    encoded = np.array([char2int[ch] for ch in text_data])
    logger.info('Transformado texto en números -> ' + str(encoded[:200]) + '\n')

    logger.info('Inicializando modelo...')
    n_hidden = 512
    n_layers = 2

    model = GeneratorTextLSTM(data_input, n_hidden, n_layers)
    logger.info('Párametros del modelo:' + str(model) + '\n')

    logger.info('Empezando entrenamiento del modelo...')
    batch_size = 32
    seq_length = 100

    train(model, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)
    logger.info('Fin de entrenamiento del modelo ;)')

    logger.info('Ejemplo de sample: \n')
    print('\n------------------------------------------\n')
    print(sample(model, 1000, prime='crisis económica ', top_k=5))
    print('\n------------------------------------------\n')
    print(sample(model, 1000, prime='maltrato infantil ', top_k=5))
    print('\n------------------------------------------\n')
    logger.info('FIN :)')

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU. Functions defined earlier read this
# module-level `device`, so it has to exist before main() is called.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info('Ejecutando en -> ' + str(device) + '\n')

# Run the pipeline on the txt file located at '<path_train_data>/<name>.txt'.
main(f'{path_train_data}/{name}.txt')