In [10]:
from pathlib import Path
import re
import spacy
from collections import Counter






In [11]:
train_data = Path('datasets/wikitext-103/wiki.train.tokens').read_text(encoding='utf-8')
val_data = Path('datasets/wikitext-103/wiki.valid.tokens').read_text(encoding='utf-8')
test_data = Path('datasets/wikitext-103/wiki.test.tokens').read_text(encoding='utf-8')

In [12]:
heading_pattern = '( \n \n = [^=]*[^=] = \n \n )'

In [13]:
# Split out train headings and articles
train_split = re.split(heading_pattern, train_data)
train_headings = [x[7:-7] for x in train_split[1::2]]
train_articles = [x for x in train_split[2::2]]

# Split out validation headings and articles
val_split = re.split(heading_pattern, val_data)
val_headings = [x[7:-7] for x in val_split[1::2]]
val_articles = [x for x in val_split[2::2]]

# Split out test headings and articles
test_split = re.split(heading_pattern, test_data)
test_headings = [x[7:-7] for x in test_split[1::2]]
test_articles = [x for x in test_split[2::2]]


train_data = [i.split('. ') for i in train_articles]
val_data = [i.split('. ') for i in val_articles]
test_data = [i.split('. ') for i in test_articles]



In [14]:
test_data = [item for sublist in test_data for item in sublist]
val_data = [item for sublist in val_data for item in sublist]
train_data = [item for sublist in train_data for item in sublist]

In [15]:
train_set, test_set = train_data, val_data

In [16]:
nlp = spacy.load('en_core_web_sm')
cw = Counter()
cl = Counter()

# Count words:
for sample in train_set + test_set: cw += Counter([token.text for token in nlp(sample)])
# Count lemma:
for sample in train_set + test_set: cl += Counter([token.lemma_ for token in nlp(sample)])

In [59]:
import os
import pickle
import torch

class Vocab(object):

    def __init__(self, filename='', load=False, threshold=500):
        if load:
            assert os.path.exists(filename), "Vocab file does not exist at " + filename

            self.id2word, self.word2id = self.load(filename)
            self.size = len(self.id2word)
            self.threshold = threshold
            self.wordCounter = None
        else:
            self.id2word, self.word2id = {}, {}
            self.size = 0
            self.threshold = threshold
            # We always add some custom tokens into the vocabulary.
            self.add_words(
                {'<PAD>': float('inf'), '<UNK>': float('inf'),'<MSK>' : 103})
        self.word_embed = None

    def add_words(self, counterOfTokens):
        for item, value in counterOfTokens.items():
            if value >= self.threshold:
                if item not in self.word2id:
                    # add it to the vocab
                    self.word2id[item] = self.size
                    self.id2word[self.size] = item
                    self.size += 1

    def load(self, filename):
        with open(filename, 'rb') as infile:
            id2word = pickle.load(infile)
            word2id = {word:id for id, word in id2word.items()}
            self.id2word, self.word2id = id2word, word2id
            self.size = len(self.id2word)

        return id2word, word2id

    def save(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
           
        with open(filename, 'wb') as outfile:
            pickle.dump(self.id2word, outfile)

    def __len__(self):
        return self.size


    def init_word_embed(self, cfg, cache_dir='datasets/.word_vectors_cache'):
        if cfg['word_vectors'] == 'Word2Vec':
            from torchnlp.word_to_vector import FastText
            all_word_vector = FastText(language=cfg['language'], cache=cache_dir, aligned=True)
        else:
            raise NotImplementedError('No word_vectors found which are called {}.'.format(cfg['word_vectors']))

        # The the vectors only correspond to lower character words:
        all_words = [word.lower() for word in list(self.word2id.keys())]
        weights = all_word_vector[all_words]
        
        word_embed = torch.nn.Embedding(*weights.shape, _weight=weights)
        #if cfg['device'] == 'cuda':
        #    word_embed.cuda()

        self.word_embed = word_embed
        self.embed_size = weights.shape[1]

    def words2vecs(self, words: list):
        if not self.word_embed:
            raise AttributeError("The word embeddings aren't initialized yet.")
        else:
            vecs = self.word_embed(torch.tensor(self.map(words), requires_grad=False))
        return vecs

    def one_hot_ids2vecs(self, ids):
        vecs = self.word_embed(ids)
        return vecs

    def map(self, token_list):
        """
        Map a list of tokens to their ids.
        """
        return [self.word2id[w] if w in self.word2id else self.word2id['<UNK>'] for w in token_list]

    def unmap(self, idx_list):
        """
        Unmap ids back to tokens.
        """
        return [self.id2word[idx] for idx in idx_list]
    
def get_pos_vocab():
    """
    Function to set up a part of speech vocabulary handcrafed.
    """
    pos_id2word = {0: '<PAD>', 1: '<UNK>', 2: 'DET', 3: 'PROPN', 4: 'VERB', 5: 'PART', 6: 'ADJ', 7: 'PUNCT', 8: 'CCONJ',
                   9: 'ADP', 10: 'PRON', 11: 'NOUN', 12: 'ADV', 13: 'INTJ', 14: 'NUM', 15: 'X', 16: 'SYM'}
    pos_word2id = {word: id for id, word in pos_id2word.items()}
    pos_vocab = Vocab()
    pos_vocab.id2word = pos_id2word
    pos_vocab.word2id = pos_word2id
    pos_vocab.size = len(pos_vocab.id2word)
    
    return pos_vocab

In [60]:
vocab = Vocab()
lemma_vocab = Vocab()


# prepare vocab
vocab.add_words(cw)
#cfg['input_dimension'] = 300

lemma_vocab.add_words(cl)
pos_vocab = get_pos_vocab()

In [61]:
vocab.__len__()

13234

In [62]:
vocab.save('word_vocab_500.json')

In [63]:
lemma_vocab.save('lemma_vocab_500.json')

In [26]:
pos_vocab.save('pos_vocab.json')