<a href="https://colab.research.google.com/github/AleDella/NLUProject/blob/main/NLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dataset Functions**

In [None]:
# Download the PTB archive and unpack it into ./data.
# -nc skips the download when ptbdataset.zip is already present, and -o lets
# unzip overwrite existing files without prompting, so the cell can be re-run.
!wget -nc https://data.deepai.org/ptbdataset.zip
!unzip -o ptbdataset.zip -d data

--2021-05-29 17:11:53--  https://data.deepai.org/ptbdataset.zip
Resolving data.deepai.org (data.deepai.org)... 138.201.36.183
Connecting to data.deepai.org (data.deepai.org)|138.201.36.183|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4568253 (4.4M) [application/x-zip-compressed]
Saving to: ‘ptbdataset.zip’


2021-05-29 17:11:54 (6.88 MB/s) - ‘ptbdataset.zip’ saved [4568253/4568253]

Archive:  ptbdataset.zip
  inflating: data/README             
  inflating: data/ptb.char.test.txt  
  inflating: data/ptb.char.train.txt  
  inflating: data/ptb.char.valid.txt  
  inflating: data/ptb.test.txt       
  inflating: data/ptb.train.txt      
  inflating: data/ptb.valid.txt      


In [None]:
# The file imported here is an external file written by the TensorFlow authors;
# it is subject to the Apache License, Version 2.0.
# The whole file can be checked here: https://github.com/deeplearningathome/pytorch-language-model/blob/5a0f888560ec6adfb366080f8f874f18b06caf14/reader.py
# NOTE: the next cell contains the whole reader.py file, adapted for this notebook.
# -p keeps mkdir from failing when ./data already exists (it is created by the
# previous cell).
!mkdir -p data
!wget -q -O data/ptb.zip https://ibm.box.com/shared/static/z2yvmhbskc45xd2a9a4kkn6hg4g4kj5r.zip
!unzip -o data/ptb.zip -d data
!cp data/ptb/reader.py .

mkdir: cannot create directory ‘data’: File exists
Archive:  data/ptb.zip
   creating: data/ptb/
  inflating: data/ptb/reader.py      
   creating: data/__MACOSX/
   creating: data/__MACOSX/ptb/
  inflating: data/__MACOSX/ptb/._reader.py  
  inflating: data/__MACOSX/._ptb     


In [None]:
# Reader.py
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os

import numpy as np
import tensorflow as tf

# Read the whole file and replace every "\n" with the "<eos>" tag
def _read_words(filename):
  """Return the whitespace-separated tokens of `filename` as a list.

  Every newline is turned into a standalone "<eos>" token. The tag is padded
  with spaces so that it never fuses with the last word of a line or the first
  word of the next one when the file has no spaces around its line breaks;
  `split()` collapses the extra whitespace, so files that already have those
  spaces produce exactly the same tokens as before.
  """
  with tf.io.gfile.GFile(filename, "r") as f:
    return f.read().replace("\n", " <eos> ").split()

# Builds the dictionaries for the words in the dataset
# Associates an index with each word
def _build_vocab(filename):
  """Build the vocabulary mappings from the tokens of `filename`.

  Tokens are ranked by decreasing frequency, with ties broken alphabetically,
  and numbered from 0 in that order.

  Returns:
    tuple (word_to_id, id_to_word) of dictionaries. Both are empty when the
    file contains no tokens.
  """
  data = _read_words(filename)

  counter = collections.Counter(data)
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

  # Enumerating the ranked pairs (instead of unpacking zip(*count_pairs))
  # also works when there are no tokens at all.
  word_to_id = {word: index for index, (word, _) in enumerate(count_pairs)}
  id_to_word = {index: word for word, index in word_to_id.items()}

  return word_to_id, id_to_word

# Convert a whole file into indices
def _file_to_word_ids(filename, word_to_id):
  """Map the tokens of `filename` to their ids in `word_to_id`.

  Tokens that are not keys of `word_to_id` are skipped, so the result can be
  shorter than the token stream of the file.
  """
  ids = []
  for token in _read_words(filename):
    if token in word_to_id:
      ids.append(word_to_id[token])
  return ids

# Get the raw PTB dataset data
def ptb_raw_data(data_path=None, prefix="ptb"):
  """Load the PTB splits from the directory "data_path" as lists of word ids.

  Reads "<prefix>.train.txt", "<prefix>.valid.txt" and "<prefix>.test.txt",
  builds the vocabulary from the training file and converts the three files
  to integer ids with it.
  The PTB dataset comes from Tomas Mikolov's webpage:
  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
  Args:
    data_path: string path to the directory that contains the three files.
    prefix: common file-name prefix of the three files.
  Returns:
    tuple (train_data, valid_data, test_data, word_to_id, id_2_word)
    where each of the data objects can be passed to ptb_iterator.
  """
  split_paths = [
      os.path.join(data_path, prefix + suffix)
      for suffix in (".train.txt", ".valid.txt", ".test.txt")
  ]

  # The vocabulary comes from the training split only
  word_to_id, id_2_word = _build_vocab(split_paths[0])
  train_data, valid_data, test_data = (
      _file_to_word_ids(path, word_to_id) for path in split_paths
  )
  return train_data, valid_data, test_data, word_to_id, id_2_word

# Creates an iterator over PTB data
# This is like creating a DataLoader with Pytorch, but I used this because it is more
# convenient from a data-managing point of view (easier manipulation of data contained in the dataset)
def ptb_iterator(raw_data, batch_size, num_steps):
  """Iterate on the raw PTB data.
  This generates batch_size pointers into the raw PTB data, and allows
  minibatch iteration along these pointers.
  Args:
    raw_data: one of the raw data outputs from ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.
  Yields:
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
    The second element of the tuple is the same data time-shifted to the
    right by one.
  Raises:
    ValueError: if batch_size or num_steps are too high for the amount of
      data. Because this is a generator, the error is raised when iteration
      starts, not when the function is called.
  """
  raw_data = np.array(raw_data, dtype=np.int32)

  data_len = len(raw_data)
  batch_len = data_len // batch_size
  # Row i holds the i-th contiguous chunk of batch_len ids; the leftover ids at
  # the end of raw_data that do not fill a whole row are dropped.
  data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

  # One column is reserved because the targets are the inputs shifted by one.
  epoch_size = (batch_len - 1) // num_steps

  # epoch_size is negative (not zero) when there are fewer ids than rows, i.e.
  # batch_len == 0, so the check has to cover that case as well; otherwise the
  # generator would silently yield nothing.
  if epoch_size <= 0:
    raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

  for i in range(epoch_size):
    x = data[:, i*num_steps:(i+1)*num_steps]
    y = data[:, i*num_steps+1:(i+1)*num_steps+1]
    yield (x, y)

In [None]:
# Get raw data: (train ids, valid ids, test ids, word -> id dict, id -> word dict)
raw_data = ptb_raw_data("data/")
# Unpack the splits and both vocabulary mappings in the order ptb_raw_data returns them
train_data, valid_data, test_data, word_to_id, id_to_word = raw_data
# `vocab` is the word -> id dictionary; its length is the vocabulary size
vocab = word_to_id

# **Model**

In [None]:
import torch.nn as nn
# Class for the whole Model
class PTBLstm(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> dropout -> linear output layer.

    Args:
        vocab_size: number of distinct token ids; size of the embedding table
            and of the output layer.
        emb_size: size of the embedding vectors.
        hidden_size: number of hidden units of each LSTM layer.
        num_layers: number of stacked LSTM layers.
        drop_prob: dropout probability, used between the LSTM layers and
            before the output layer.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, num_layers, drop_prob):
        super(PTBLstm, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.emb_size = emb_size
        # Embedding
        self.embedding = nn.Embedding(vocab_size, emb_size)
        # LSTM layer (batch_first: inputs are [batch, steps, features])
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=num_layers, dropout=drop_prob, batch_first = True)
        # Dropout layer
        self.drp = nn.Dropout(drop_prob)
        # Output layer
        self.l_out = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.init_weights()

    def forward(self, inputs, hidden):
        """Score the next token for every position of `inputs`.

        Args:
            inputs: integer tensor of token ids, [batch, steps].
            hidden: tuple (h, c) with the LSTM state to start from.
        Returns:
            tuple (scores, state): `scores` has one row per input position
            ([batch * steps, vocab_size]) and `state` is the (h, c) tuple
            produced by the LSTM after the last step, to be fed to the next call.
        """
        # Embedding
        x = self.embedding(inputs)
        # RNN returns output and last hidden state
        outpts, (h, c) = self.lstm(x, hidden)
        # Flatten output for feed-forward layer
        outpts = outpts.reshape(-1, self.lstm.hidden_size)
        # Output layer
        outpts = self.l_out(self.drp(outpts))
        # Return the state produced by the LSTM, not the one received as input,
        # so the state is actually carried over from one batch to the next.
        return outpts, (h, c)

    # Initialize the hidden state in each epoch
    def init_state(self, batch_size, device):
        """Return an all-zero (h, c) state of shape [num_layers, batch_size, hidden_size] on `device`."""
        # Build the state from a parameter so it shares the parameters' dtype
        weight = next(self.parameters()).data
        shape = (self.num_layers, batch_size, self.hidden_size)
        return tuple(weight.new(*shape).zero_().to(device) for _ in range(2))

    # Initialize the weights of the network when creating it
    def init_weights(self):
        """Uniformly initialize embedding and output weights; zero the output bias."""
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        # Zero the bias: zeroing the weight here would be pointless, because
        # the weight is overwritten by the uniform initialization just below.
        nn.init.zeros_(self.l_out.bias)
        nn.init.uniform_(self.l_out.weight, -initrange, initrange)

In [None]:
# Detach a hidden state from the graph that produced it.
# Without this, back-propagating through one batch would also walk back
# through every batch processed before it.
def repackage_hidden(h):
    """Return `h` with every tensor detached from its gradient history.

    A tensor is returned detached; any other iterable is rebuilt as a tuple
    whose elements are repackaged recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

# **Routines**

In [None]:
# Training function
def train(net, train_data, batch_size, num_steps, criterion, device, optimizer=None):
  """Train `net` for one pass over `train_data` and print the average loss.

  Args:
    net: the model; it must provide init_state(batch_size, device) and return
      (scores, hidden) when called with (inputs, hidden).
    train_data: sequence of word ids, as accepted by ptb_iterator.
    batch_size: int, number of rows of each batch.
    num_steps: int, number of unrolled time steps per batch.
    criterion: loss called as criterion(scores, flattened_targets).
    device: torch device the batches are moved to.
    optimizer: optional optimizer to use. Pass the same instance on every call
      to keep its internal state (e.g. Adam moment estimates) across epochs.
      When omitted, a new AdamW optimizer is created from the module-level
      `lr`, as this function has always done.
  Returns:
    float, the training loss averaged over the batches of this pass.
  """
  # Total loss of the epoch
  epoch_training_loss = 0
  # Total steps of the training (corresponds to the number of batches)
  train_steps = 0
  # Network in train-mode
  net.train()
  # Optimizer (Adam with weight decay), unless the caller supplied one
  if optimizer is None:
    optimizer = torch.optim.AdamW(net.parameters(), lr=lr)
  # Initialize the hidden state
  hidden = net.init_state(batch_size, device)
  for x, y in ptb_iterator(train_data, batch_size, num_steps):
    # Build integer tensors directly (no round-trip through a float tensor)
    # and move them to the device
    x = torch.LongTensor(x).to(device)
    y = torch.LongTensor(y).to(device)
    # Zero grad
    net.zero_grad()
    # Detach the hidden state from the previous batches' gradient history
    hidden = repackage_hidden(hidden)
    # Forward pass
    outputs, hidden = net(x, hidden)
    # Compute loss
    loss = criterion(outputs, y.view(-1))
    # Backward pass
    loss.backward()
    # Clip the gradient norm to limit the size of the update
    torch.nn.utils.clip_grad_norm_(net.parameters(), 0.25)
    # Optimization step
    optimizer.step()
    # Update loss
    epoch_training_loss += loss.item()
    # Update steps counter
    train_steps += 1
  mean_loss = epoch_training_loss/train_steps
  print("Training Loss: {}\t Training Perplexity: {}".format(mean_loss, np.exp(mean_loss)))
  return mean_loss

In [None]:
# Function for evaluation
def evaluate(net, valid_data, batch_size, num_steps, criterion, device):
    """Return the average per-batch loss of `net` on `valid_data`.

    Args:
        net: the model; it must provide init_state(batch_size, device) and
            return (scores, hidden) when called with (inputs, hidden).
        valid_data: sequence of word ids, as accepted by ptb_iterator.
        batch_size: int, number of rows of each batch.
        num_steps: int, number of unrolled time steps per batch.
        criterion: loss called as criterion(scores, flattened_targets).
        device: torch device the batches are moved to.
    Returns:
        float, the loss averaged over the batches.
    """
    # Total loss of the epoch
    epoch_validation_loss = 0
    # Total steps of the validation (corresponds to the number of batches)
    val_steps = 0

    hidden = net.init_state(batch_size, device)
    # Network evaluation mode
    net.eval()

    # No gradients are needed for evaluation: disabling them avoids building
    # the autograd graph, so there is no history to detach and nothing to zero.
    with torch.no_grad():
        for x, y in ptb_iterator(valid_data, batch_size, num_steps):
            # Build integer tensors directly and move them to the device
            x = torch.LongTensor(x).to(device)
            y = torch.LongTensor(y).to(device)
            # Forward pass
            outputs, hidden = net(x, hidden)
            # Compute loss
            loss = criterion(outputs, y.view(-1))
            # Update loss
            epoch_validation_loss += loss.item()
            # Update steps counter
            val_steps += 1
    return epoch_validation_loss/val_steps

# **Training**

In [None]:
import torch
# Define Hyperparameters
# Seed of the torch random number generator, set before the network is created
# so that weight initialization and dropout start from the same state on every run
seed = 42
torch.manual_seed(seed)
batch_size = 64
# Number of unfoldings
num_steps = 20
epochs = 10
# Embedding vector size
emb_size = 650
hidden_size = 650
num_layers = 2
# Dropout probability
drop_prob = 0.5
# Learning rate
lr = 0.001
# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device used:", device)

Device used: cuda:0


In [None]:
# Build the network sized to the vocabulary and move it to the selected device
net = PTBLstm(
    vocab_size=len(vocab),
    emb_size=emb_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    drop_prob=drop_prob,
)
net = net.to(device)
# Bare expression so that the notebook displays the module summary
net

PTBLstm(
  (embedding): Embedding(10000, 650)
  (lstm): LSTM(650, 650, num_layers=2, batch_first=True, dropout=0.5)
  (drp): Dropout(p=0.5, inplace=False)
  (l_out): Linear(in_features=650, out_features=10000, bias=True)
)

In [None]:
# Criterion for the loss: cross-entropy between the per-position scores
# returned by the network and the target word ids
criterion = nn.CrossEntropyLoss()

In [None]:
# Training: every epoch is one pass over the training set followed by an
# evaluation on the validation set
for i in range(epochs):
  train(net, train_data, batch_size, num_steps, criterion, device)
  loss = evaluate(net, valid_data, batch_size, num_steps, criterion, device)
  print("Epoch: {}\tValidation Loss: {}\tValidation Perplexity: {}".format(i, loss, np.exp(loss)))
# Save the model parameters
torch.save(net.state_dict(), 'model.obj')
# Download the saved file when running on Colab; elsewhere google.colab cannot
# be imported, and the file is simply left in the working directory
try:
  from google.colab import files
except ImportError:
  print("google.colab is not available: model saved to 'model.obj'")
else:
  files.download('model.obj')
print("おわりです :)")

Training Loss: 5.947496254582051	 Training Perplexity: 382.7937202187547
Epoch: 0	Validation Loss: 5.419042729494865	Validation Perplexity: 225.6629985413868
Training Loss: 5.386480979682985	 Training Perplexity: 218.4333600489644
Epoch: 1	Validation Loss: 5.1982702706989485	Validation Perplexity: 180.9589609905856
Training Loss: 5.182423074054981	 Training Perplexity: 178.11387163772352
Epoch: 2	Validation Loss: 5.082651849378619	Validation Perplexity: 161.20097031506222
Training Loss: 5.046653003403635	 Training Perplexity: 155.50113076657712
Epoch: 3	Validation Loss: 5.007829657772131	Validation Perplexity: 149.57974436928626
Training Loss: 4.94117413013763	 Training Perplexity: 139.93445440241834
Epoch: 4	Validation Loss: 4.954683370757521	Validation Perplexity: 141.8376893021859
Training Loss: 4.854567114643486	 Training Perplexity: 128.3251291203103
Epoch: 5	Validation Loss: 4.916038931461802	Validation Perplexity: 136.461009817906
Training Loss: 4.778748434108808	 Training Perpl

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

おわりです :)


# **Testing**

In [None]:
# Test the network: average per-batch loss on the test split, and the
# perplexity obtained as the exponential of that loss
loss = evaluate(net, test_data, batch_size, num_steps, criterion, device)
print(f'Testing loss: {loss}, Testing perplex: {np.exp(loss)}')

Testing loss: 4.795990131795406, Testing perplex: 121.0241523401952
