In [3]:
from pathlib import Path
import re
import spacy
from collections import Counter






In [4]:
train_data = Path('datasets/wikitext-103/wiki.train.tokens').read_text(encoding='utf-8')
val_data = Path('datasets/wikitext-103/wiki.valid.tokens').read_text(encoding='utf-8')
test_data = Path('datasets/wikitext-103/wiki.test.tokens').read_text(encoding='utf-8')

In [5]:
heading_pattern = '( \n \n = [^=]*[^=] = \n \n )'

In [6]:
# Split out train headings and articles
train_split = re.split(heading_pattern, train_data)
train_headings = [x[7:-7] for x in train_split[1::2]]
train_articles = [x for x in train_split[2::2]]

# Split out validation headings and articles
val_split = re.split(heading_pattern, val_data)
val_headings = [x[7:-7] for x in val_split[1::2]]
val_articles = [x for x in val_split[2::2]]

# Split out test headings and articles
test_split = re.split(heading_pattern, test_data)
test_headings = [x[7:-7] for x in test_split[1::2]]
test_articles = [x for x in test_split[2::2]]


train_data = [i.split('. ') for i in train_articles]
val_data = [i.split('. ') for i in val_articles]
test_data = [i.split('. ') for i in test_articles]



In [7]:
test_data = [item for sublist in test_data for item in sublist]
val_data = [item for sublist in val_data for item in sublist]
train_data = [item for sublist in train_data for item in sublist]

In [8]:
train_set, test_set = train_data, val_data

In [19]:
len(train_set+test_set)

['The Tower Building of the Little Rock Arsenal , also known as U.S',
 'Arsenal Building , is a building located in MacArthur Park in downtown Little Rock , Arkansas ',
 "Built in 1840 , it was part of Little Rock 's first military installation ",
 'Since its decommissioning , The Tower Building has housed two museums ',
 'It was home to the Arkansas Museum of Natural History and Antiquities from 1942 to 1997 and the MacArthur Museum of Arkansas Military History since 2001 ',
 'It has also been the headquarters of the Little Rock Æsthetic Club since 1894 ',
 '\n The building receives its name from its distinct octagonal tower ',
 'Besides being the last remaining structure of the original Little Rock Arsenal and one of the oldest buildings in central Arkansas , it was also the birthplace of General Douglas MacArthur , who became the supreme commander of US forces in the South Pacific during World War II ',
 'It was also the starting place of the Camden Expedition ',
 'In 2011 it was na

In [29]:
nlp = spacy.load('en_core_web_sm')
cw = Counter()
cl = Counter()
import time
# Count words:
i = 0
program_starts = time.time()
for sample in train_set[:10000]: 
    if len(sample) > 64:
        sample = sample[:64]
    for token in nlp(sample):
        cw += Counter([token.text])
        i+=1
    if i%1000 == 0:
        now = time.time()
        print("It has been {0} seconds since the loop started".format(now - program_starts))
# Count lemma:
for sample in train_set[:10000]: cl += Counter([token.lemma_ for token in nlp(sample)])

It has been 18.364887237548828 seconds since the loop started
It has been 27.958022117614746 seconds since the loop started
It has been 40.20744061470032 seconds since the loop started
It has been 44.20855665206909 seconds since the loop started
It has been 47.1556830406189 seconds since the loop started
It has been 66.557368516922 seconds since the loop started
It has been 73.77625155448914 seconds since the loop started
It has been 95.71356153488159 seconds since the loop started
It has been 115.01347851753235 seconds since the loop started
It has been 127.08464574813843 seconds since the loop started


In [32]:
cw

Counter({'The': 1952,
         'Tower': 15,
         'Building': 12,
         'of': 2890,
         'the': 5913,
         'Little': 27,
         'Rock': 24,
         'Arsenal': 7,
         ',': 5675,
         'also': 378,
         'known': 95,
         'as': 645,
         'U.': 1,
         'is': 997,
         'a': 2100,
         'building': 28,
         'located': 38,
         'in': 2020,
         'MacArthur': 3,
         'Park': 16,
         'do': 24,
         'Built': 3,
         '1840': 3,
         'it': 298,
         'was': 1550,
         'part': 76,
         "'s": 815,
         'first': 239,
         'military': 26,
         'ins': 4,
         'Since': 20,
         'its': 135,
         'decommissioning': 1,
         'has': 331,
         'housed': 4,
         'two': 196,
         'mu': 3,
         'It': 283,
         'home': 52,
         'to': 1788,
         'Arkansas': 9,
         'Museum': 14,
         'Natural': 5,
         'History': 23,
         'and': 1993,
         'Antiqu': 

In [35]:
import os
import pickle
import torch

class Vocab(object):

    def __init__(self, filename='', load=False, threshold=5):
        if load:
            assert os.path.exists(filename), "Vocab file does not exist at " + filename

            self.id2word, self.word2id = self.load(filename)
            self.size = len(self.id2word)
            self.threshold = threshold
            self.wordCounter = None
        else:
            self.id2word, self.word2id = {}, {}
            self.size = 0
            self.threshold = threshold
            # We always add some custom tokens into the vocabulary.
            self.add_words(
                {'<PAD>': float('inf'), '<UNK>': float('inf'),'<MSK>' : 103})
        self.word_embed = None

    def add_words(self, counterOfTokens):
        for item, value in counterOfTokens.items():
            if value >= self.threshold:
                if item not in self.word2id:
                    # add it to the vocab
                    self.word2id[item] = self.size
                    self.id2word[self.size] = item
                    self.size += 1

    def load(self, filename):
        with open(filename, 'rb') as infile:
            id2word = pickle.load(infile)
            word2id = {word:id for id, word in id2word.items()}
            self.id2word, self.word2id = id2word, word2id
            self.size = len(self.id2word)

        return id2word, word2id

    def save(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
           
        with open(filename, 'wb') as outfile:
            pickle.dump(self.id2word, outfile)

    def __len__(self):
        return self.size


    def init_word_embed(self, cfg, cache_dir='datasets/.word_vectors_cache'):
        if cfg['word_vectors'] == 'Word2Vec':
            from torchnlp.word_to_vector import FastText
            all_word_vector = FastText(language=cfg['language'], cache=cache_dir, aligned=True)
        else:
            raise NotImplementedError('No word_vectors found which are called {}.'.format(cfg['word_vectors']))

        # The the vectors only correspond to lower character words:
        all_words = [word.lower() for word in list(self.word2id.keys())]
        weights = all_word_vector[all_words]
        
        word_embed = torch.nn.Embedding(*weights.shape, _weight=weights)
        #if cfg['device'] == 'cuda':
        #    word_embed.cuda()

        self.word_embed = word_embed
        self.embed_size = weights.shape[1]

    def words2vecs(self, words: list):
        if not self.word_embed:
            raise AttributeError("The word embeddings aren't initialized yet.")
        else:
            vecs = self.word_embed(torch.tensor(self.map(words), requires_grad=False))
        return vecs

    def one_hot_ids2vecs(self, ids):
        vecs = self.word_embed(ids)
        return vecs

    def map(self, token_list):
        """
        Map a list of tokens to their ids.
        """
        return [self.word2id[w] if w in self.word2id else self.word2id['<UNK>'] for w in token_list]

    def unmap(self, idx_list):
        """
        Unmap ids back to tokens.
        """
        return [self.id2word[idx] for idx in idx_list]
    
def get_pos_vocab():
    """
    Function to set up a part of speech vocabulary handcrafed.
    """
    pos_id2word = {0: '<PAD>', 1: '<UNK>', 2: 'DET', 3: 'PROPN', 4: 'VERB', 5: 'PART', 6: 'ADJ', 7: 'PUNCT', 8: 'CCONJ',
                   9: 'ADP', 10: 'PRON', 11: 'NOUN', 12: 'ADV', 13: 'INTJ', 14: 'NUM', 15: 'X', 16: 'SYM'}
    pos_word2id = {word: id for id, word in pos_id2word.items()}
    pos_vocab = Vocab()
    pos_vocab.id2word = pos_id2word
    pos_vocab.word2id = pos_word2id
    pos_vocab.size = len(pos_vocab.id2word)
    
    return pos_vocab

In [34]:
vocab = Vocab()
lemma_vocab = Vocab()


# prepare vocab
vocab.add_words(cw)
#cfg['input_dimension'] = 300

lemma_vocab.add_words(cl)
pos_vocab = get_pos_vocab()

In [40]:
vo = Vocab()

In [41]:
vo.load('word_vocab.json')

({0: '<PAD>',
  1: '<UNK>',
  2: '<MSK>',
  3: 'The',
  4: 'Tower',
  5: 'Building',
  6: 'of',
  7: 'the',
  8: 'Little',
  9: 'Rock',
  10: 'Arsenal',
  11: ',',
  12: 'also',
  13: 'known',
  14: 'as',
  15: 'U.S',
  16: 'is',
  17: 'a',
  18: 'building',
  19: 'located',
  20: 'in',
  21: 'MacArthur',
  22: 'Park',
  23: 'downtown',
  24: 'Arkansas',
  25: 'Built',
  26: '1840',
  27: 'it',
  28: 'was',
  29: 'part',
  30: "'s",
  31: 'first',
  32: 'military',
  33: 'installation',
  34: 'Since',
  35: 'its',
  36: 'decommissioning',
  37: 'has',
  38: 'housed',
  39: 'two',
  40: 'museums',
  41: 'It',
  42: 'home',
  43: 'to',
  44: 'Museum',
  45: 'Natural',
  46: 'History',
  47: 'and',
  48: 'Antiquities',
  49: 'from',
  50: '1942',
  51: '1997',
  52: 'Military',
  53: 'since',
  54: '2001',
  55: 'been',
  56: 'headquarters',
  57: 'Æsthetic',
  58: 'Club',
  59: '1894',
  60: '\n ',
  61: 'receives',
  62: 'name',
  63: 'distinct',
  64: 'octagonal',
  65: 'tower',
  66: 

In [43]:
vo.__len__()

196145