In [0]:
# Author: Arman Kabiri
# Date: Feb. 18, 2020
# Email: Arman.Kabiri94@gmail.com

In [2]:
# Colab-only setup: mount Google Drive at /gdrive. The mount call prompts
# interactively for an authorization code, so this cell cannot run unattended.
from google.colab import drive
drive.mount('/gdrive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [3]:
import os
# Switch to the project folder on the mounted Drive: the local modules imported
# below and the relative 'Data/...' paths in the configuration resolve from here.
os.chdir('/gdrive/My Drive/NLP_Stuff/My_Language_Model')
!pwd

/gdrive/My Drive/NLP_Stuff/My_Language_Model


In [0]:
import argparse
import math
import os.path as path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from CorpusReader import CorpusReader
from Dictionary import Dictionary
from EmbeddingsLoader import EmbeddingsLoader
from Lang_Model import LanguageModel

In [0]:
class Args:
    """Static run configuration, read by the training/generation code as ``args``.

    The values are plain class attributes; the attribute names (including
    their spelling) are part of the interface used by ``main`` and must not
    be renamed.
    """

    # --- Input / output files (relative to the working directory) ---
    corpus_train_file = 'Data/corpus-test.txt'
    corpus_valid_file = ''  # not read by any code visible in this notebook
    embeddings_file = 'Data/English_Wiki_1Billion_embeddings.bin'
    output_model_path = 'Data/model.bin'

    # --- Model architecture ---
    n_layers = 2
    hidden_size = 300
    embeddings_dim = 300
    dropout_probablity = 0.25
    bidirectional_model = False
    tie_weights = False
    freez_embeddings = False

    # --- Optimisation ---
    batch_size = 50
    seq_len = 20
    epochs = 2
    lr = 0.001
    seed = 120

    # --- Hardware ---
    gpu = True


# Single shared configuration instance used as a module-level global.
args = Args()

In [0]:
# Sanity check: show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [0]:
def main():
    """Build the language model, train it (or load a saved one), then sample text.

    All settings come from the module-level ``args`` object. The steps are:

    1. Check that the ``args.gpu`` setting matches the available hardware.
    2. Seed the random number generators.
    3. Build the vocabulary from the training corpus and, when an embeddings
       file is configured, load the pretrained embeddings matrix.
    4. Construct the model and print its per-tensor and total parameter counts.
    5. If ``args.output_model_path`` already exists, load the weights from it;
       otherwise train for ``args.epochs`` epochs and save the weights there.
    6. Generate a short text sample seeded with the word "cat".

    Returns:
        None. Returns early (after printing a message) when ``args.gpu`` is
        set but no CUDA device is available.
    """
    torch.set_num_threads(8)

    if torch.cuda.is_available():
        if not args.gpu:
            print("WARNING: You have a CUDA device, so you should probably run with --gpu")
    elif args.gpu:
        print("You do not have a GPU device, so you should run CPU without --gpu option.")
        # Return instead of calling exit(): inside a notebook exit() targets the
        # kernel rather than just stopping this function.
        return

    torch.manual_seed(args.seed)
    # Text generation samples with np.random.choice, so NumPy has to be seeded
    # as well for runs to be reproducible.
    np.random.seed(args.seed)

    # NOTE(review): the original comment on this line said "100MB", but
    # 10000000 bytes would be 10MB -- confirm the unit CorpusReader expects.
    corpus_train_reader = CorpusReader(args.corpus_train_file, 10000000)

    print("Generating Dictionaries")
    dictionary = Dictionary(corpus_train_reader)
    dictionary.build_dictionary()

    print("Loading Embeddings")

    embeddings_matrix = None
    # Treat both None and an empty string as "no pretrained embeddings".
    if args.embeddings_file:
        emb_loader = EmbeddingsLoader()
        embeddings_matrix = emb_loader.get_embeddings_matrix(args.embeddings_file, dictionary, args.embeddings_dim)

    model = LanguageModel(n_layers=args.n_layers, hidden_size=args.hidden_size, n_vocab=dictionary.get_dic_size(),
                          input_size=args.embeddings_dim, dropout=args.dropout_probablity,
                          bidirectional=args.bidirectional_model, pret_emb_matrix=embeddings_matrix,
                          freez_emb=args.freez_embeddings, tie_weights=args.tie_weights, use_gpu=args.gpu)

    # Report the size of every parameter tensor and the overall total.
    total_param = [int(p.numel()) for p in model.parameters()]
    print(total_param)
    print(sum(total_param))

    if path.exists(args.output_model_path):
        # map_location='cpu' lets a checkpoint saved from GPU tensors be read on
        # a CPU-only host; load_state_dict then copies the values into the
        # model's own parameters wherever they live.
        model.load_state_dict(torch.load(args.output_model_path, map_location='cpu'))

    else:
        # Put the model into train mode (enables dropout).
        model.train()
        if args.gpu:
            model.cuda()

        # Optimizer and loss
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.CrossEntropyLoss()

        print("Training starts ...")
        for i in range(args.epochs):
            print(f"Epoch {i + 1}:")
            train(corpus_train_reader, dictionary, model, optimizer, criterion, args)

        print("Saving Model...")
        torch.save(model.state_dict(), args.output_model_path)

    # Text generation: sample from the 7 most probable words at each step.
    generate_text(model, dictionary, 'Cat'.lower(), 7)


def train(corpus_train_reader, dictionary, model, optimizer, criterion, args):
    """Run one training pass over the batches produced by the corpus reader.

    The hidden state is carried from one batch to the next but detached before
    each forward pass, so gradients do not flow back through earlier batches.
    Gradients are clipped to a maximum norm of 5 before every optimizer step,
    and the loss and perplexity are printed every 100 steps.

    Args:
        corpus_train_reader: object whose ``batchify(dictionary, batch_size,
            seq_len)`` yields ``(x, y)`` pairs of NumPy arrays.
        dictionary: vocabulary object; ``get_dic_size()`` gives the number of
            output classes.
        model: the language model; called as ``model(x, hidden)`` and expected
            to return ``(y_hat, hidden)``.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ``(N, vocab)`` scores and ``(N,)`` targets.
        args: configuration providing ``batch_size``, ``seq_len`` and ``gpu``.
    """
    batch_generator = corpus_train_reader.batchify(dictionary, args.batch_size, args.seq_len)
    hidden = model.init_hidden(args.batch_size)

    for step, (x, y) in enumerate(tqdm(batch_generator), start=1):
        x = torch.from_numpy(x)
        y = torch.from_numpy(y)

        if args.gpu:
            x = x.cuda()
            y = y.cuda()

        # Cut the graph at the batch boundary (truncated back-propagation).
        hidden = detach_hidden(hidden)
        model.zero_grad()

        # Call the modules rather than .forward() so registered hooks still run.
        y_hat, hidden = model(x, hidden)

        # Flatten the targets with reshape(-1) instead of a hard-coded
        # batch_size * seq_len so a shorter final batch does not raise.
        loss = criterion(y_hat.view(-1, dictionary.get_dic_size()),
                         y.reshape(-1).long())
        loss.backward()

        # Guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

        optimizer.step()

        if step % 100 == 0:
            loss_value = loss.item()
            print(f"Step {step},     Loss = {loss_value},    PPL = {math.exp(loss_value)}")


def detach_hidden(hidden: tuple):
    """Return a new tuple holding ``.detach()`` of every element of ``hidden``.

    Used between training batches so the carried-over hidden state no longer
    references the previous batch's computation graph.
    """
    detached_states = []
    for state in hidden:
        detached_states.append(state.detach())
    return tuple(detached_states)


def generate_text(model: LanguageModel, dictionary: Dictionary, seed: str, k=1, n_words=10):
    """Generate and print a word sequence that starts with ``seed``.

    The model is moved to the device selected by the module-level ``args.gpu``
    and switched to eval mode. Starting from ``seed``, each predicted word is
    fed back in as the next input while the hidden state is carried along.
    The seed followed by the generated words is printed, space-separated.

    Args:
        model: the trained language model.
        dictionary: vocabulary used to encode the input and decode predictions.
        seed: the initial input text.
        k: each word is sampled from the ``k`` most probable candidates.
        n_words: how many words to generate after the seed (default 10, the
            value that was previously hard-coded).
    """
    if args.gpu:
        model.cuda()
    else:
        model.cpu()

    # Eval mode disables dropout during generation.
    model.eval()

    with torch.no_grad():
        # Generation runs with a batch of one sequence.
        hidden = model.init_hidden(1)
        input_text = seed
        output = [seed]

        for _ in range(n_words):
            word, hidden = predict_next_word(model, dictionary, hidden, input_text, k)
            output.append(word)
            # The freshly generated word becomes the next step's input.
            input_text = word

    print(' '.join(output))


def predict_next_word(model: LanguageModel, dictionary, hidden, input_text: str, k=1) -> tuple:
    """Sample the next word for ``input_text`` given the current hidden state.

    The text is encoded with ``dictionary.encode_text``, passed through the
    model, and the next word is drawn at random from the ``k`` highest-scoring
    vocabulary entries, weighted by their renormalised softmax probabilities.
    Device placement follows the module-level ``args.gpu`` flag.

    Args:
        model: the language model, called as ``model(input, hidden)``.
        dictionary: provides ``encode_text`` and ``decode_text``.
        hidden: hidden state from the previous step (or ``init_hidden``).
        input_text: the text to condition on.
        k: number of top candidates to sample from.

    Returns:
        ``(word, hidden)``: the result of ``dictionary.decode_text`` for the
        sampled index, and the updated hidden state.
    """
    input_tensor = dictionary.encode_text(input_text)
    input_tensor = np.array(input_tensor)
    input_tensor = torch.from_numpy(input_tensor)
    if args.gpu:
        input_tensor = input_tensor.cuda()

    # One token id per row with a trailing dimension of size 1.
    # NOTE(review): presumably (seq_len, batch=1) -- confirm against LanguageModel.
    input_tensor = input_tensor.view(-1, 1)
    # Call the module rather than .forward() so registered hooks still run.
    output, hidden = model(input_tensor, hidden)
    # Softmax over dim 2 (assumed to be the vocabulary axis -- TODO confirm);
    # detach so .numpy() also works when called outside torch.no_grad().
    probs = F.softmax(output, 2).detach()

    # Move back to CPU to use with numpy.
    if args.gpu:
        probs = probs.cpu()

    probs, picked_indexes = probs.topk(k)
    # With k == 1, squeeze() collapses the indices to a 0-d array, which
    # np.random.choice interprets as a population size rather than a list of
    # candidates (raising a size mismatch or returning the wrong index).
    # atleast_1d keeps the single candidate as a one-element array.
    picked_indexes = np.atleast_1d(picked_indexes.numpy().squeeze())
    probs = probs.numpy().flatten()
    # Renormalise so the top-k probabilities sum to 1 for sampling.
    probs = probs / probs.sum()
    word = np.random.choice(picked_indexes, p=probs)

    word = dictionary.decode_text([word.item()])

    return word, hidden





# Entry point: run the whole pipeline (build/load the model, then generate text)
# when this code is executed directly, which includes running it as a notebook cell.
if __name__ == '__main__':
    main()

0it [00:00, ?it/s]

Generating Dictionaries
Building dictionaries...


11it [00:04,  2.23it/s]
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Dictionaries are built - Vocab size is 254732
Loading Embeddings
Loading pretrained embeddings...
Pretrained embeddings are loaded.
[76419600, 360000, 360000, 1200, 1200, 360000, 360000, 1200, 1200, 76419600, 254732]
154538732
cat in the book is the most widely used example of
