In [1]:
import torch
import torch.nn.functional as F
import os
import re
import csv
import random

from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, ConcatDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from datetime import datetime
from utils import train, compute_cosine_sim

# Fix PyTorch's global RNG state so that repeated runs start identically.
seed = 265
torch.manual_seed(seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Training on device {device}.")

Training on device cpu.


In [2]:
# Directory prefix for every cached artefact written or read by this notebook.
PATH_GENERATED = './generated/'

# Default ``min_freq`` handed to the vocabulary builder.
MIN_FREQ = 100

# Tokenizer shared (as a default argument) by the text helpers below.
TOKENIZER = get_tokenizer('basic_english')

def read_files(datapath='./data_train/', encoding=None):
    """
    Read every ``.txt`` file in a directory and return all of their lines.

    Args:
        datapath: Directory to scan, with or without a trailing separator.
        encoding: Text encoding forwarded to ``open``. ``None`` (default)
            keeps the platform's default encoding.

    Returns:
        One flat list holding the lines (line endings included) of all
        ``.txt`` files, concatenated in alphabetical file-name order.
    """
    # os.listdir gives names in arbitrary order; sorting makes the resulting
    # corpus (and everything derived from it) identical across runs.
    filenames = sorted(f for f in os.listdir(datapath) if f.endswith('.txt'))

    texts = []
    for name in filenames:
        # os.path.join works whether or not datapath ends with a separator.
        with open(os.path.join(datapath, name), encoding=encoding) as f:
            texts.extend(f.readlines())
    return texts

def tokenize(texts, tokenizer=TOKENIZER):
    """
    Tokenize every text and return all tokens as one flat list,
    preserving the order of the texts and of the tokens within each text.
    """
    return [token for text in texts for token in tokenizer(text)]

def yield_tokens(texts, tokenizer=TOKENIZER):
    """
    Lazily clean and tokenize texts, yielding one token list per text.

    Before a text is tokenized:
      * every word containing a digit is replaced by a space,
      * every word containing an upper-case ASCII letter is replaced by a
        space,
      * each run of whitespace is collapsed to a single space.

    Args:
        texts: Iterable of strings.
        tokenizer: Callable turning a string into a list of tokens.

    Yields:
        The tokenizer's output for each cleaned text.
    """
    # Raw strings are required here: '\w' and '\s' inside a plain literal are
    # invalid escape sequences, which recent Python versions warn about.
    no_digits = re.compile(r'\w*[0-9]+\w*')
    no_names = re.compile(r'\w*[A-Z]+\w*')
    no_spaces = re.compile(r'\s+')

    for text in texts:
        text = no_digits.sub(' ', text)
        text = no_names.sub(' ', text)
        text = no_spaces.sub(' ', text)
        yield tokenizer(text)

def count_freqs(words, vocab):
    """
    Count how often each vocabulary index occurs in ``words``.

    Args:
        words: Iterable of tokens.
        vocab: Mapping-like object supporting ``vocab[token] -> index`` and
            ``len(vocab)``.

    Returns:
        A ``torch.int`` tensor of length ``len(vocab)`` whose entry ``i`` is
        the number of tokens in ``words`` that map to index ``i``.

    Raises:
        IndexError: If a token maps to an index outside the vocabulary.
    """
    indices = torch.tensor([vocab[w] for w in words], dtype=torch.long)
    # bincount would silently grow the result for too-large indices, so keep
    # the out-of-range failure that per-element indexing would produce.
    if indices.numel() > 0 and int(indices.max()) >= len(vocab):
        raise IndexError("token index out of range for the given vocabulary")
    # A single vectorized count replaces one tensor update per token.
    return torch.bincount(indices, minlength=len(vocab)).to(torch.int)

def create_vocabulary(lines, min_freq=MIN_FREQ):
    """
    Create a vocabulary (list of known tokens) from a list of strings.

    Args:
        lines: Iterable of raw text lines; they are cleaned and tokenized by
            ``yield_tokens`` before being counted.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        The vocabulary, with ``"<unk>"`` as a special token that is also used
        as the default index for unknown tokens.
    """
    vocab = build_vocab_from_iterator(yield_tokens(lines), min_freq=min_freq, specials=["<unk>"])
    # Upper case words like 'I' were removed so we should add it back again.
    # Only append when missing: a lower-case "i" may already have been kept,
    # and appending an existing token is an error.
    if "i" not in vocab:
        vocab.append_token("i")
    vocab.set_default_index(vocab["<unk>"])
    return vocab

def calculate_word_weights(freqs):
    """
    Calculate the weight of each word so that the loss function can weigh
    frequent words less and infrequent words more.

    The weight of word ``i`` is ``total / (n_words * freqs[i])``, where
    ``total`` is the sum of all frequencies and ``n_words`` is
    ``len(freqs)``.

    Args:
        freqs: 1-D tensor (or sequence) of per-word occurrence counts.

    Returns:
        A float tensor of the same length, placed on ``device``.
        NOTE: a count of zero produces an infinite weight.
    """
    counts = torch.as_tensor(freqs)
    if not counts.is_floating_point():
        # Widen integer counts: n_words * count can exceed the int32 range
        # on larger corpora.
        counts = counts.to(torch.int64)

    total_words = counts.sum()
    # One vectorized division instead of a Python loop over 0-d tensors.
    word_weights = total_words / (len(counts) * counts)
    return word_weights.to(dtype=torch.float, device=device)


In [3]:
# ----------------------- Tokenize texts -------------------------------

# torch.save does not create missing directories, so make sure the cache
# directory exists before anything is written into it.
os.makedirs(PATH_GENERATED, exist_ok=True)

# The raw training lines are only read when something has to be rebuilt.
lines_books_train = None

# Use the token cache only when all three files are present; otherwise
# rebuild all of them.
token_files = [PATH_GENERATED + name for name in ("words_train.pt", "words_val.pt", "words_test.pt")]
if all(os.path.isfile(path) for path in token_files):
    words_train = torch.load(PATH_GENERATED + "words_train.pt", map_location=torch.device(device))
    words_val   = torch.load(PATH_GENERATED + "words_val.pt", map_location=torch.device(device))
    words_test  = torch.load(PATH_GENERATED + "words_test.pt", map_location=torch.device(device))

else:
    lines_books_train = read_files('./data_train/')
    lines_books_val   = read_files('./data_val/')
    lines_books_test  = read_files('./data_test/')

    words_train = tokenize(lines_books_train)
    words_val   = tokenize(lines_books_val)
    words_test  = tokenize(lines_books_test)

    torch.save(words_train, PATH_GENERATED + "words_train.pt")
    torch.save(words_val, PATH_GENERATED + "words_val.pt")
    torch.save(words_test, PATH_GENERATED + "words_test.pt")



# ----------------------- Create vocabulary ----------------------------

VOCAB_FNAME = "vocabulary.pt"
if os.path.isfile(PATH_GENERATED + VOCAB_FNAME):
    vocab = torch.load(PATH_GENERATED + VOCAB_FNAME, map_location=torch.device(device))
else:
    # When the tokens came from the cache the raw lines were never read;
    # read them now instead of failing on an undefined name.
    if lines_books_train is None:
        lines_books_train = read_files('./data_train/')
    vocab = create_vocabulary(lines_books_train, min_freq=MIN_FREQ)
    torch.save(vocab, PATH_GENERATED + VOCAB_FNAME)



# ------------------------ Quick analysis ------------------------------
VOCAB_SIZE = len(vocab)
# Per-index occurrence counts of the vocabulary in the training tokens.
freqs = count_freqs(words_train, vocab)
# (count, token) pairs in vocabulary index order.
occurences = [(f.item(), w) for (f, w) in zip(freqs, vocab.lookup_tokens(range(VOCAB_SIZE)))]
# The misspelled name is kept on purpose: later cells refer to `word_weigts`.
word_weigts = calculate_word_weights(freqs)

In [4]:
print("Total number of words in the training dataset:     ", len(words_train))
print("Total number of words in the validation dataset:   ", len(words_val))
print("Total number of words in the test dataset:         ", len(words_test))
print("Number of distinct words in the training dataset:  ", len(set(words_train)))
print("Number of distinct words kept (vocabulary size):   ", VOCAB_SIZE)

# `occurences` is in vocabulary index order, which is not guaranteed to be
# sorted by these counts, so rank by count explicitly before slicing.
most_frequent = sorted(occurences, key=lambda pair: pair[0], reverse=True)[:5]
print(f"\nThe 5 most frequent words:\n {most_frequent}")

Total number of words in the training dataset:      2684706
Total number of words in the validation dataset:    49526
Total number of words in the test dataset:          124152
Number of distinct words in the training dataset:   52105
Number of distinct words kept (vocabulary size):    1880

The 5 most frequent words:
 [(433907, '<unk>'), (182537, ','), (151278, 'the'), (123727, '.'), (82289, 'and')]


In [5]:
# Number of preceding tokens that form the context of each target.
CONTEXT_SIZE = 5

# Tokens that are never used as prediction targets.
not_words = [
    ',', '.', '(', ')', '?', '!',
    '<unk>',
]

# ---------------- Define context / target pairs -----------------------
def create_dataset(text, vocab, context_size=CONTEXT_SIZE):
    """
    Create a pytorch dataset of context / target pairs from a text.

    Every position ``i`` contributes the pair
    ``(tokens[i : i + context_size], tokens[i + context_size])`` unless the
    target token is listed in ``not_words``.

    Args:
        text: Sequence of tokens.
        vocab: Vocabulary supporting ``vocab[token]``, ``token in vocab`` and
            ``len(vocab)``.
        context_size: Number of tokens preceding each target.

    Returns:
        ``TensorDataset(contexts, targets)`` on ``device``, where ``contexts``
        has shape ``(n_samples, context_size)`` and ``targets`` has shape
        ``(n_samples,)``. Both are empty (zero samples) when the text is too
        short or no target qualifies.
    """
    # Transform each word to its index in the vocabulary.
    indices = torch.tensor([vocab[w] for w in text], dtype=torch.long)

    # Boolean lookup table over vocabulary indices: True for real words.
    # Built once, instead of mapping every target back to its token.
    excluded = [vocab[w] for w in not_words if w in vocab]
    is_word = torch.ones(len(vocab), dtype=torch.bool)
    if excluded:
        is_word[excluded] = False

    if len(indices) <= context_size:
        contexts = torch.empty((0, context_size), dtype=torch.long)
        targets = torch.empty(0, dtype=torch.long)
    else:
        # All sliding windows of length context_size; the last window has no
        # following token and therefore no target, so it is dropped.
        windows = indices.unfold(0, context_size, 1)[:-1]
        candidates = indices[context_size:]

        # We only want to guess actual words.
        keep = is_word[candidates]
        contexts = windows[keep]
        targets = candidates[keep]

    return TensorDataset(contexts.to(device=device), targets.to(device=device))

In [6]:
def load_dataset(words, vocab, fname):
    """
    Return the dataset cached under ``PATH_GENERATED + fname``; when no such
    file exists, build the dataset from ``words`` and ``vocab``, cache it
    there, and return it.
    """
    cache_file = PATH_GENERATED + fname

    # Cache hit: nothing to compute.
    if os.path.isfile(cache_file):
        return torch.load(cache_file, map_location=torch.device(device))

    # Cache miss: build once and persist for later runs.
    dataset = create_dataset(words, vocab)
    torch.save(dataset, cache_file)
    return dataset

# Load (or build and cache) one dataset per split, in train / val / test order.
data_train, data_val, data_test = [
    load_dataset(split_words, vocab, split_fname)
    for split_words, split_fname in (
        (words_train, "data_train.pt"),
        (words_val, "data_val.pt"),
        (words_test, "data_test.pt"),
    )
]

In [7]:
class Word2Vec(nn.Module):
    """
    Feed-forward network that scores every vocabulary entry from a fixed
    window of context-token indices.

    The context embeddings are flattened into one vector, passed through a
    128-unit ReLU layer, and projected to one output per row of the
    embedding table.
    """

    def __init__(self, embedding, context_size=CONTEXT_SIZE):
        super().__init__()

        # The embedding table fixes both the output size and the input width.
        n_tokens, emb_dim = embedding.weight.shape
        self.embedding = embedding

        self.fc1 = nn.Linear(context_size * emb_dim, 128)
        self.fc2 = nn.Linear(128, n_tokens)

    def forward(self, x):
        embedded = self.embedding(x)
        # Keep the first dimension and merge all remaining ones.
        flat = embedded.flatten(start_dim=1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [8]:
embedding_dims = [10, 12, 16]

# One hyper-parameter dict per candidate embedding size.
hparams = [dict(embedding_dim=em) for em in embedding_dims]

print(f"We are testing {len(hparams)} different hyper parameters.")

# Every loader uses the same batching and shuffling options.
loader_options = dict(batch_size=128, shuffle=True)

train_loader = DataLoader(data_train, **loader_options)
val_loader   = DataLoader(data_val, **loader_options)
test_loader  = DataLoader(data_test, **loader_options)

# Training and validation data combined.
train_val_loader = DataLoader(ConcatDataset([data_train, data_val]), **loader_options)

We are testing 3 different hyper parameters.


In [11]:
def train_all_models(hparams=hparams, n_epochs=5):
    """
    Train one model per hyper-parameter setting and score each of them.

    For every entry of ``hparams`` a freshly seeded model is trained for
    ``n_epochs`` on ``train_loader`` and then scored with
    ``compute_cosine_sim`` on the training and validation loaders.

    Returns:
        ``(models, train_sims, val_sims)``: three lists aligned with
        ``hparams``.
    """
    # ---------------- Train many models -----------------------
    runs = []

    for setting in hparams:
        print(f'Now training with parameters {setting}')

        # Re-seed before each piece so every candidate starts from the same
        # random state.
        torch.manual_seed(seed)
        embedding = nn.Embedding(VOCAB_SIZE, setting['embedding_dim'])
        torch.manual_seed(seed)
        model = Word2Vec(embedding).to(device=device)

        loss_fn = nn.CrossEntropyLoss(weight=word_weigts)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        train(n_epochs, optimizer, model, loss_fn, train_loader, device)

        train_sim = compute_cosine_sim(model, train_loader, device)
        print(f'Train similarity: {train_sim}')
        val_sim = compute_cosine_sim(model, val_loader, device)
        print(f'Val similarity:   {val_sim}')
        print()

        runs.append((model, train_sim, val_sim))

    models = [run[0] for run in runs]
    train_sims = [run[1] for run in runs]
    val_sims = [run[2] for run in runs]
    return models, train_sims, val_sims

def select_best_model(models, val_sims, hparams=hparams, n_epochs=25):
    """
    Pick the hyper-parameters with the highest validation similarity and
    train a fresh model with them for longer on training + validation data.

    Args:
        models: Candidate models aligned with ``val_sims``. Accepted for
            interface compatibility; the returned model is newly trained,
            not one of these.
        val_sims: Validation similarity of each candidate.
        hparams: Hyper-parameter dicts aligned with ``val_sims``.
        n_epochs: Number of epochs for the final training run.

    Returns:
        The newly trained model.

    Raises:
        ValueError: If ``val_sims`` is empty.
    """
    if len(val_sims) == 0:
        raise ValueError("val_sims is empty: there is no model to select from.")

    # ---------------- Select the best model -----------------------
    best_idx   = val_sims.index(max(val_sims))
    best_param = hparams[best_idx]
    print(f'The best model had these parameters: {best_param}.')

    # ---------------- Retrain the best performing model for longer on more data -----------------------
    # Seed before creating the embedding as well as before the model, exactly
    # as train_all_models does; otherwise the embedding initialisation
    # depends on whatever random state earlier training left behind.
    torch.manual_seed(seed)
    embedding = nn.Embedding(VOCAB_SIZE, best_param['embedding_dim'])
    torch.manual_seed(seed)
    best_model = Word2Vec(embedding).to(device=device)
    optimizer = optim.Adam(best_model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss(weight=word_weigts)
    train(n_epochs, optimizer, best_model, loss_fn, train_val_loader, device)

    return best_model

In [17]:
# ----------------------- Best Model -------------------------------
best_model_path = PATH_GENERATED + 'best_model.pt'

if not os.path.isfile(best_model_path):
    # Nothing cached yet: run the hyper-parameter search, retrain, and cache.
    models, train_sims, val_sims = train_all_models()
    best_model = select_best_model(models, val_sims)
    torch.save(best_model, best_model_path)
else:
    best_model = torch.load(best_model_path, map_location=torch.device(device))

# The embedding layer is also stored on its own.
torch.save(best_model.embedding, PATH_GENERATED + 'embedding.pt')

# ----------------------- Evaluate Best Model -------------------------------
test_sim = compute_cosine_sim(best_model, test_loader, device)
test_sim_rounded = round(float(test_sim), 2)
print(f"Similarity on test set: {test_sim_rounded}")

Similarity on test set: 0.53


In [18]:
# ----------------------- Calculate Cosine Similarity Matrix -------------------------------
embedding = best_model.embedding
embedding_data = embedding.weight.data

cosineSimilarity = nn.CosineSimilarity(dim=2)
# A leading and a middle singleton axis let broadcasting pair every row of
# the embedding table with every other row.
cos_matrix = cosineSimilarity(embedding_data[None, :, :], embedding_data[:, None, :])

In [25]:
# ----------------------- Randomly select 10 not too frequent words -------------------------------
random.seed(seed)

# Candidates: vocabulary entries from index 100 on that are not in not_words.
candidate_words = [
    word
    for word in vocab.lookup_tokens(range(100,VOCAB_SIZE))
    if word not in not_words
]
selected_words = random.sample(candidate_words, 10)
selected_indecies = vocab.lookup_indices(selected_words)

similar_words = []
for idx in selected_indecies:
    # Work on a copy so the shared similarity matrix is left untouched.
    word_matrix = cos_matrix[idx].clone()
    # Every word is most like itself, so rule that entry out.
    word_matrix[idx] = -1
    nearest = torch.argmax(word_matrix)
    similar_words.append((vocab.lookup_token(nearest), torch.max(word_matrix)))

selected_similar_list = [
    (selected, similar, round(float(value), 2))
    for selected, (similar, value) in zip(selected_words, similar_words)
]

print("Ten randomly selected words with their most similar words:\n")
for w1, w2, sim in selected_similar_list:
    print(f"{w1}, {w2} with similarity {sim}")

Ten randomly selected words with their most similar words:

branch, details with similarity 0.8
thrown, begins with similarity 0.75
reach, filled with similarity 0.71
provided, beauty with similarity 0.8
week, sea with similarity 0.78
becomes, are with similarity 0.75
motionless, blue with similarity 0.79
repeated, promise with similarity 0.82
wounded, brother with similarity 0.76
finally, already with similarity 0.88


In [None]:
# ----------------------- Convert embedding to tsv files -------------------------------
# Both files are written with the same explicit encoding and line terminator
# so that row i of vocab.tsv and row i of embedding.tsv are formatted alike
# regardless of platform defaults.
with open(PATH_GENERATED + 'vocab.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    for word in vocab.lookup_tokens(range(VOCAB_SIZE)):
        writer.writerow([word])
with open(PATH_GENERATED + 'embedding.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    # tolist() turns each embedding row into plain Python floats.
    writer.writerows(embedding_data.tolist())