In [9]:
# Author: Arman Kabiri
# Date: Feb. 18, 2020
# Email: Arman.Kabiri94@fmail.com

In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import gensim
import logging
from tqdm import tqdm
import typing

%matplotlib inline

In [37]:
class Dictionary:
    """Two-way mapping between words and integer ids built from a text corpus.

    Ids are assigned in order of first appearance, starting at 0, so
    ``id2word[word2id[w]] == w`` for every known word.
    """

    def __init__(self):
        # word -> integer id
        self.word2id = dict()
        # list index is the id, value is the word
        self.id2word = list()

    def build_dictionary(self, input_file:str) -> (dict,list):
        """Rebuild the vocabulary from scratch by scanning ``input_file``.

        The file is read through ``load_corpus_inchunk`` and every chunk is
        split on the single space character only, so other whitespace (such as
        newlines) stays part of the tokens and consecutive spaces produce the
        empty-string token.

        Args:
            input_file: Path of the corpus file to scan.

        Returns:
            The ``(word2id, id2word)`` pair that is also stored on the instance.
        """
        logging.info("Building dictionaries...")
        # Start clean so that calling this twice does not mix vocabularies.
        self.word2id = dict()
        self.id2word = list()
        reader = load_corpus_inchunk(input_file)

        for chunk in tqdm(reader):
            for word in chunk.split(' '):
                if word not in self.word2id:
                    # The next free id is the current vocabulary size.
                    self.word2id[word] = len(self.id2word)
                    self.id2word.append(word)

        logging.info(f"Dictionaries are built - Vocab size is {len(self.id2word)}")
        return self.word2id, self.id2word

    def encode_text(self, text:str) -> list:
        """Convert space-separated ``text`` into a list of word ids.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        return [self.word2id[word] for word in text.split(' ')]

    def decode_text(self, sequence:list) -> str:
        """Convert a sequence of word ids back into a space-joined string.

        Raises:
            IndexError: If an id is outside the range of the vocabulary.
        """
        return ' '.join(self.id2word[idx] for idx in sequence)

In [38]:
def load_corpus_inchunk(input_file:str, chunk:int=100000000) -> typing.Iterator[str]:
    """Lazily read a text file in pieces that end on a whitespace character.

    Each piece starts as ``chunk`` characters and is then extended one
    character at a time until it ends with whitespace or the file is
    exhausted, so a word is never cut in half between two pieces.

    Args:
        input_file: Path of the text file to read.
        chunk: Number of characters requested per read before the piece is
            extended to the next whitespace character.

    Yields:
        Consecutive pieces of the file. An empty file yields a single ``''``
        so that consumers always receive at least one item.
    """
    yielded_any = False

    with open(input_file, 'r') as f:
        while True:
            buf = f.read(chunk)
            if not buf:
                break

            # Make sure the piece ends on whitespace (a word boundary). The
            # extra characters are collected in a list and joined once, which
            # avoids repeatedly re-building a very large string.
            if not buf[-1].isspace():
                tail = []
                while True:
                    ch = f.read(1)
                    if not ch:
                        break
                    tail.append(ch)
                    if ch.isspace():
                        break
                buf += ''.join(tail)

            yielded_any = True
            yield buf

    # Only an empty file gets the placeholder; a non-empty file must not be
    # followed by a spurious empty piece.
    if not yielded_any:
        yield ''

In [39]:
def load_pretrained_embeddings(input_file:str, binary:bool=True) -> "gensim.models.KeyedVectors":
    """Load pretrained word vectors stored in word2vec format.

    Args:
        input_file: Path of the word2vec-format file.
        binary: Forwarded to gensim as ``binary``. The default ``True`` keeps
            the previously hard-coded behaviour; pass ``False`` for files in
            the text variant of the format.

    Returns:
        The object returned by
        ``gensim.models.KeyedVectors.load_word2vec_format``.
    """
    logging.info("Loading pretrained embeddings...")
    vectors = gensim.models.KeyedVectors.load_word2vec_format(fname=input_file, binary=binary)
    logging.info("Pretrained embeddings are loaded.")

    return vectors

In [72]:
def batchify(reader:typing.Generator, dictionary:Dictionary, batch_size:int, seq_len:int):
    """Turn a stream of text chunks into (x, y) next-token training batches.

    Every chunk taken from ``reader`` is encoded with
    ``dictionary.encode_text``, prefixed with the tokens carried over from the
    previous chunk, laid out as ``batch_size`` rows and cut into windows of
    ``seq_len`` columns.

    Args:
        reader: Iterable of text chunks. It is consumed as given; the caller
            decides which file it reads from.
        dictionary: Object whose ``encode_text(str)`` returns a list of ids.
        batch_size: Number of rows in every yielded array.
        seq_len: Number of columns in every yielded array.

    Yields:
        Tuples ``(x, y)`` of NumPy arrays, both of shape
        ``(batch_size, seq_len)``, where ``y`` is ``x`` shifted by one column
        to the right. Both are slices of the same underlying array.

    Note:
        Tokens that are still pending when ``reader`` is exhausted (fewer than
        one full batch) are never yielded.
    """
    tokens_per_batch = batch_size * seq_len
    left_from_previous_chunk = []

    for chunk in reader:

        encoded_text = left_from_previous_chunk + dictionary.encode_text(chunk)

        # One extra token per row is reserved so that the last window of each
        # row still has a target (y) for its final position.
        usable = len(encoded_text) - batch_size

        if usable < tokens_per_batch:
            # Not enough tokens for a single batch: keep all of them for the
            # next chunk instead of dropping any.
            left_from_previous_chunk = encoded_text
            continue

        left_over = usable % tokens_per_batch
        if left_over != 0:
            left_from_previous_chunk = encoded_text[-left_over:]
            encoded_text = encoded_text[:-left_over]
        else:
            left_from_previous_chunk = []

        # Resulting shape is (batch_size, n_batches * seq_len + 1).
        encoded_text = np.array(encoded_text).reshape((batch_size, -1))

        for i in range(0, encoded_text.shape[1] - 1, seq_len):

            x = encoded_text[:, i : i+seq_len]
            y = encoded_text[:, i+1 : i+seq_len+1]
            yield x, y
        