# GloVe

In [3]:
from copy import deepcopy
from functools import partial
from typing import Callable, Dict, Generator, List, Tuple

from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import torch
from torch import nn
from torchtext import vocab
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer

from tqdm.auto import tqdm


In [None]:
# Seed every RNG the notebook draws from so that "Restart & Run All" is
# reproducible. NumPy is seeded as well because `sklearn.utils.shuffle` (used
# by `data_generator`) falls back to NumPy's global RNG when no `random_state`
# is passed, so seeding torch alone would leave the batch order random.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

In [4]:
# Load the pre-trained GloVe "6B" vectors with 300 dimensions. On first use the
# archive is downloaded into `.vector_cache/` (see the progress output below).
glove = GloVe(name="6B", dim=300)
# Number of tokens in the string-to-index table.
len(glove.stoi)


.vector_cache/glove.6B.zip: 862MB [05:46, 2.49MB/s]                                
100%|█████████▉| 399999/400000 [00:29<00:00, 13399.35it/s]


400000

In [5]:
# Round trip between the two lookup tables: token -> index (`stoi`),
# index -> token (`itos`), and the shape of the vector stored for that token.
glove.stoi["cat"], glove.itos[5450], glove.vectors[glove.stoi["cat"]].shape

(5450, 'cat', torch.Size([300]))

In [6]:
# First five entries of the index-to-token list.
glove.itos[:5]

['the', ',', '.', 'of', 'to']

In [7]:
words = ["cat", "cats", "dog", "fridge"]
# Vector of the first word, shaped (1, dim); every other word is compared to it.
reference_vector = glove.vectors[glove.stoi[words[0]]].unsqueeze(0)
for word in words[1:]:
    word_vector = glove.vectors[glove.stoi[word]].unsqueeze(0)
    # .item() unwraps the single-element tensor into a plain Python float.
    similarity = torch.cosine_similarity(reference_vector, word_vector).item()
    print(f"{words[0]} x {word} = {similarity}")


cat x cats = 0.6815836429595947
cat x dog = 0.6816746592521667
cat x fridge = 0.09630905091762543


In [13]:
word_cat = "cat"


def closest_word(
    word: "str | None" = None, embeddings: "vocab.Vectors | None" = None
) -> Tuple[str, float]:
    """
    Find, print and return the vocabulary word most similar to a query word.

    Similarity is the cosine similarity between embedding vectors. The query
    word itself is excluded from the search. All similarities are computed in
    a single vectorised call instead of one call per vocabulary word.
    Args:
        word: the query word; defaults to the module-level `word_cat`
            (read at call time).
        embeddings: an object exposing `stoi` (token -> index), `itos`
            (index -> token) and `vectors` (one row per token); defaults to
            the module-level `glove`.
    Returns:
        A tuple (closest word, cosine similarity to the query word).
    Raises:
        KeyError: if the query word is not in `embeddings.stoi`.
        ValueError: if the vocabulary holds fewer than two words, so there
            is no other word to compare with.
    """
    query = word_cat if word is None else word
    table = glove if embeddings is None else embeddings
    if len(table.itos) < 2:
        raise ValueError("the vocabulary must contain at least two words")

    query_index = table.stoi[query]
    # Score the query against every row of the embedding matrix at once; the
    # (1, dim) query broadcasts over the (n_words, dim) matrix.
    similarities = torch.cosine_similarity(
        table.vectors, table.vectors[query_index].reshape(1, -1), dim=1
    )
    # Mask the query itself with -inf (not 0) so that the best candidate is
    # still found when every other similarity is negative.
    similarities[query_index] = float("-inf")

    best_index = int(torch.argmax(similarities))
    best_word = table.itos[best_index]
    best_similarity = similarities[best_index].item()
    print(f"{best_word} x {query} = {best_similarity}")
    return best_word, best_similarity
    

In [15]:
# Print the vocabulary word whose vector is most similar to that of `word_cat`.
closest_word()

dog x cat = 0.6816746592521667


In [14]:
# Load IMDB, then carve a label-stratified 20% validation split out of the
# original training set; the original test set is kept untouched.
dataset = load_dataset("imdb")
train_dataset = dataset["train"].train_test_split(
    test_size=0.2, stratify_by_column="label", seed=42
)
train_df, valid_df = train_dataset["train"], train_dataset["test"]
test_df = dataset["test"]
train_df.shape, valid_df.shape, test_df.shape

Found cached dataset imdb (/home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 79.02it/s]


((20000, 2), (5000, 2), (25000, 2))

In [84]:
# Tokenizer handed to `vectorize_text` (through `text_pipeline`) to split raw
# text into tokens.
tokenizer = get_tokenizer("basic_english", language="en")

def vectorize_text(
    text: str, vocabulary: "vocab.Vectors", tokenizer: Callable[[str], List[str]]
) -> torch.Tensor:
    """
    Turn a string into the average of the vectors of its tokens.
    Args:
        text: the input text.
        vocabulary: pre-trained vectors (e.g. GloVe) exposing
            `get_vecs_by_tokens` and `dim`.
        tokenizer: a tokenizer taking a text as input and returning a list of tokens.
    Returns:
        The average tensor over the tokens of the whole text, of shape
        (vocabulary.dim,). A text that produces no token at all (empty or
        whitespace-only) maps to the zero vector.
    """
    tokens = tokenizer(text)
    if not tokens:
        # No token means there is nothing to stack or average; return a zero
        # vector of the right size instead of failing on the empty list.
        return torch.zeros(vocabulary.dim)
    return vocabulary.get_vecs_by_tokens(tokens).mean(dim=0)

In [85]:
# Bind the GloVe vectors and the tokenizer so the pipeline only needs raw text.
text_pipeline = partial(vectorize_text, tokenizer=tokenizer, vocabulary=glove)
# Sanity check: one sentence maps to a single 300-dimensional vector.
sample_vector = text_pipeline("some text.")
assert sample_vector.shape == torch.Size([300])

In [86]:
# Show the shape of one vectorised sentence: a single 300-dimensional vector.
text_pipeline("some text.").shape

torch.Size([300])

In [87]:
# Vectorise every review of each split (train, then validation, then test);
# each split gets its own progress bar.
X_train, X_valid, X_test = (
    [text_pipeline(text) for text in tqdm(split["text"])]
    for split in (train_df, valid_df, test_df)
)
# Labels are taken as-is from the corresponding split.
y_train, y_valid, y_test = (
    split["label"] for split in (train_df, valid_df, test_df)
)

100%|██████████| 20000/20000 [00:37<00:00, 532.40it/s]
100%|██████████| 5000/5000 [00:09<00:00, 512.56it/s]
100%|██████████| 25000/25000 [00:43<00:00, 570.29it/s]


In [None]:
def data_generator(
    X: List[torch.Tensor], y: List[int], batch_size: int = 32
) -> Generator[Tuple[torch.Tensor, torch.Tensor], None, None]:
    """
    Yield shuffled mini-batches from the given input data and labels.

    Features and labels are shuffled together once, then cut into
    consecutive batches; the last batch may be smaller than `batch_size`.
    Args:
        X: a list of tensors (input features), all of the same shape.
        y: the corresponding labels.
        batch_size: the size of every batch [32].
    Yields:
        A tuple of tensors (features, labels): the features stacked along a
        new first dimension, and the labels as a 1-D tensor.
    Raises:
        ValueError: if `batch_size` is not positive (raised when the
            generator is first iterated).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    # Shuffle both sequences with the same permutation so pairs stay aligned.
    X, y = shuffle(X, y)
    for start in range(0, len(X), batch_size):
        stop = start + batch_size
        # Slicing past the end is safe: the final batch is simply shorter.
        yield torch.stack(list(X[start:stop])), torch.tensor(y[start:stop])
