In [0]:
#### Author: Arman Kabiri
#### Date: Feb. 18, 2020
#### Email: Arman.Kabiri94@gmail.com

In [2]:
!pip install tensorboardcolab
# Mount Google Drive at /gdrive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
import os
# Switch to the project folder on the mounted Drive: the relative 'Data/...' paths used in
# Args are resolved against this directory (presumably the local modules imported later,
# e.g. CorpusReader and Lang_Model, live here as well - confirm).
os.chdir('/gdrive/My Drive/NLP_Stuff/My_Language_Model')

In [9]:
!pip install -U tensorboardcolab

Requirement already up-to-date: tensorboardcolab in /usr/local/lib/python3.6/dist-packages (0.0.22)


In [2]:
import argparse
import math
import os.path as path
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

# from tensorboardcolab import TensorBoardColab
# tb = TensorBoardColab()

from CorpusReader import CorpusReader
from Dictionary import Dictionary
from EmbeddingsLoader import EmbeddingsLoader
from Lang_Model import LanguageModel

from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/fashion_mnist_experiment_1')

In [0]:
class Args:
    """Notebook stand-in for command-line arguments.

    Every setting is a plain class attribute, so the rest of the notebook can
    read it as ``args.<name>`` exactly as it would from a parsed namespace.
    """

    # Input / output locations (relative to the current working directory).
    corpus_train_file = 'Data/WestburyLab.Wikipedia.Corpus_AdagramTokenized.txt'
    corpus_valid_file = ''
    embeddings_file = 'Data/English_Wiki_1Billion_embeddings.bin'
    output_model_path = 'Data/model.bin'
    output_id2word_path = 'Data/id2word.txt'
    output_word2id_path = 'Data/word2id.txt'

    # Model architecture.
    n_layers = 2
    hidden_size = 300
    dropout_probablity = .25
    embeddings_dim = 300

    # Batching and optimisation.
    batch_size = 64
    seq_len = 10
    epochs = 2
    lr = 0.001
    seed = 120
    clip_grad = 5     # max gradient norm used when clipping
    print_steps = 20  # report the loss every this many steps

    # Feature switches.
    bidirectional_model = False
    tie_weights = False
    freez_embeddings = False
    gpu = True


# Single shared configuration instance used by the cells below.
args = Args()

In [26]:
torch.cuda.is_available()

True

In [0]:
def train(corpus_train_reader, dictionary, model, optimizer, criterion, args, globaliter=0):
    """Run one pass over the training corpus, updating ``model`` in place.

    Args:
        corpus_train_reader: reader exposing ``batchify(dictionary, batch_size, seq_len)``,
            which yields ``(x, y)`` NumPy batches, and ``get_progress()``.
        dictionary: vocabulary object; ``get_dic_size()`` gives the number of output classes.
        model: language model providing ``init_hidden``, ``zero_grad``, ``parameters`` and
            ``forward(x, hidden) -> (y_hat, hidden)``.
        optimizer: optimizer that steps the model parameters.
        criterion: loss applied to the flattened predictions and targets.
        args: configuration object (``batch_size``, ``seq_len``, ``gpu``, ``clip_grad``,
            ``print_steps``). ``args.globaliter`` is read as the running step counter when
            present and is updated after every step.
        globaliter: global step to resume counting from; the larger of this value and
            ``args.globaliter`` is used.

    Returns:
        int: the global step count after this pass, suitable for passing back in as
        ``globaliter`` for the next epoch.
    """
    batch_generator = corpus_train_reader.batchify(dictionary, args.batch_size, args.seq_len)
    hidden = model.init_hidden(args.batch_size)

    # One counter for the TensorBoard x-axis. It continues across epochs even when the
    # caller only keeps `args.globaliter` around and never passes `globaliter` explicitly.
    global_step = max(globaliter, getattr(args, 'globaliter', 0))

    for step, (x, y) in enumerate(tqdm(batch_generator), start=1):
        global_step += 1
        args.globaliter = global_step

        x = torch.from_numpy(x)
        y = torch.from_numpy(y)

        if args.gpu:
            x = x.cuda()
            y = y.cuda()

        # Cut the graph at the batch boundary so backprop does not reach into earlier batches.
        hidden = detach_hidden(hidden)
        model.zero_grad()

        y_hat, hidden = model.forward(x, hidden)

        # Flatten targets with -1 rather than a hard-coded batch_size * seq_len so the
        # reshape itself never depends on the configured sizes.
        loss = criterion.forward(y_hat.view(-1, dictionary.get_dic_size()),
                                 y.reshape(-1).long())
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad)

        optimizer.step()

        if step % args.print_steps == 0:
            loss_value = loss.item()
            print(f"Step {step} ,     epoch progress = {corpus_train_reader.get_progress()}% ,     Loss = {loss_value} ,    PPL = {np.exp(loss_value)}")
            # Log through the module-level SummaryWriter; the TensorBoardColab `tb`
            # object is not created anywhere in this notebook.
            writer.add_scalar('Train Loss', loss_value, global_step)

    return global_step


In [0]:
def detach_hidden(hidden: tuple):
    """Return a new tuple holding ``state.detach()`` for each state in ``hidden``.

    Used between batches so the next backward pass stops at the batch boundary.
    """
    detached_states = []
    for state in hidden:
        detached_states.append(state.detach())
    return tuple(detached_states)

In [0]:
def save_dictionary(dictionary: "Dictionary", output_id2word_path, output_word2id_path):
    """Write the vocabulary to two UTF-8 text files.

    Args:
        dictionary: object exposing ``word2id`` (mapping word -> id) and ``id2word``
            (iterable of words).
        output_id2word_path: destination file, one word per line in ``id2word`` order.
        output_word2id_path: destination file, one ``word<TAB>id`` pair per line.

    Raises:
        ValueError: if a word in ``word2id`` contains a tab, which would make the
            tab-separated word2id file ambiguous. Nothing is written in that case.
    """
    # Validate up front so a bad entry cannot leave a half-written file behind
    # (and so the failure is an explicit error rather than a silent interpreter exit).
    for word in dictionary.word2id:
        if '\t' in word:
            raise ValueError(
                f"Cannot save dictionary: word {word!r} contains a tab, "
                "which is used as the field separator."
            )

    # Explicit UTF-8: the vocabulary may contain non-ASCII words and must not depend on
    # the platform's default encoding. NOTE(review): the matching loader should read
    # these files as UTF-8 too - confirm.
    with open(output_word2id_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{word}\t{word_id}\n" for word, word_id in dictionary.word2id.items())

    with open(output_id2word_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{word}\n" for word in dictionary.id2word)

In [0]:
def main():
    """Build or reload the language model and train it for ``args.epochs`` epochs.

    All settings are read from the module-level ``args``. If a model file already
    exists at ``args.output_model_path`` it is reloaded together with the saved
    dictionaries; otherwise the dictionaries are built from the training corpus and
    saved, the embeddings matrix is loaded when ``args.embeddings_file`` is set, and a
    fresh model is created. The model is saved after every epoch.

    Raises:
        RuntimeError: if ``args.gpu`` is set but no CUDA device is available.
    """
    torch.set_num_threads(8)

    cuda_available = torch.cuda.is_available()
    if cuda_available and not args.gpu:
        print("WARNING: You have a CUDA device, so you should probably run with --gpu")
    elif args.gpu and not cuda_available:
        # Fail with a real error: `exit()` is the interactive-shell helper and would end
        # the session with a success status (and no traceback) on a configuration error.
        raise RuntimeError("You do not have a GPU device, so you should run CPU without --gpu option.")

    torch.manual_seed(args.seed)
    corpus_train_reader = CorpusReader(args.corpus_train_file, 100000000)  # 100MB

    dictionary = Dictionary()

    # Load the pre-trained Model for fine-tuning
    if path.exists(args.output_model_path):
        print("Loading Dictionaries...")
        dictionary.load_dictionary(id2word_filepath=args.output_id2word_path, word2id_filepath=args.output_word2id_path)
        print("Loading pre-trained Model...")
        model = LanguageModel(path_to_pretrained_model=args.output_model_path, use_gpu=args.gpu)

    # Initialize the model
    else:
        print("Generating Dictionaries...")
        dictionary.build_dictionary(corpus_train_reader)

        print("Saving Dictionary...")
        save_dictionary(dictionary, args.output_id2word_path, args.output_word2id_path)

        print("Loading Embeddings...")
        embeddings_matrix = None
        if args.embeddings_file is not None:
            emb_loader = EmbeddingsLoader()
            embeddings_matrix = emb_loader.get_embeddings_matrix(args.embeddings_file, dictionary, args.embeddings_dim)

        model = LanguageModel(n_layers=args.n_layers, hidden_size=args.hidden_size, n_vocab=dictionary.get_dic_size(),
                              input_size=args.embeddings_dim, dropout_prob=args.dropout_probablity,
                              bidirectional=args.bidirectional_model, pret_emb_matrix=embeddings_matrix,
                              freez_emb=args.freez_embeddings, tie_weights=args.tie_weights, use_gpu=args.gpu)

    # Report the model size (total number of elements over all parameters).
    total_param = sum(int(p.numel()) for p in model.parameters())
    print(f"Number of Parametes: {total_param}\n")

    # put it into train mode.
    model.train()
    if args.gpu:
        model.cuda()

    # Optimizer and Loss
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    # Training Model
    print("Training Model...")
    # Running step counter read by train() when it logs the loss.
    args.globaliter = 0
    for i in range(args.epochs):
        print(f"Epoch {i + 1}:")
        train(corpus_train_reader, dictionary, model, optimizer, criterion, args)
        print(f"Saving Model at epoch {i + 1}...")
        model.save_model(args.output_model_path)

In [0]:
main()

0it [00:00, ?it/s]

Generating Dictionaries...
Building dictionaries...


61it [03:56,  3.88s/it]


Dictionaries are built - Vocab size is 3825945
Saving Dictionary...
Loading Embeddings...
Loading pretrained embeddings...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Pretrained embeddings are loaded.
