In [1]:
%%capture
!pip install transformers datasets

In [2]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import DataLoader
from tabulate import tabulate
from datasets import load_dataset

from tqdm.notebook import tqdm
from transformers import BertTokenizer

This is a template of the notebook that you should complete and enrich with your own code.

First cells will be the same than the ones of the lab on text convolution.

# Data loading


In [3]:
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


# Pre-processing / Tokenization

This is a very important step. It maybe boring but very important. In this session we will be lazy, but in real life, the time spent on inspecting and cleaning data is never wasted. It is true for text, but also for everything.



In PyTorch, everything is tensor. Words are replaced by indices. A sentence, is therefore a sequence of indices (long integers). In the first HW, you constructed a `WhiteSpaceTokenizer`. Here we will use an already built tokenizer. It is more appropriate to transformers. It relies on sub-word units, and converts everything in lower case. This is not always the best choice, but here it will be sufficient. To quote the documentation, this tokenizer allows you to:
- Tokenize (splitting strings in sub-word token strings), converttokens strings to ids and back, and encoding/decoding (i.e., tokenizing and converting to integers).
- Add new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece…).
- Manage special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization.

Here we are going to use the tokenizer from the well known Bert model, that we can directly download.

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)




In [5]:
def preprocessing_fn(x, tokenizer):
    x["review_ids"] = tokenizer(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )["input_ids"]
    x["label"] = 0 if x["sentiment"] == "negative" else 1
    return x


Same celel than in the lab session.

🚧 **TODO** 🚧

Read the documentation about HuggingFace dataset and complete the code below.
You should:
- Shuffle the dataset
- For computational reasons, use only a total of **5000 samples**.
- Tokenize the dataset with the `preprocessing_fn`. (*Hint: use the `Dataset.map` method from HuggingFace*).
- Keep only columns `review_ids` and `label`.
- Make a train/validation split, (**80% / 20%**). Call these dataset `train_set` and `valid_set`.


## Q1

In [6]:
n_samples = 5000  # the number of training example

# We first shuffle the data !
data_shuffled = dataset.shuffle()

# Select 5000 samples
data_shuffled_sampled = data_shuffled.select(range(n_samples))

# Tokenize the dataset
data_tokenized = data_shuffled_sampled.map(lambda x: preprocessing_fn(x, tokenizer))

# Remove useless columns
data_tokenized = data_tokenized.remove_columns(["review", "sentiment"])

# Split the train and validation
train_set = data_tokenized.train_test_split(test_size=0.2)["train"]
valid_set = data_tokenized.train_test_split(test_size=0.2)["test"]

document_train_set = train_set["review_ids"]
document_valid_set = valid_set["review_ids"]


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

# Q2

In [7]:
#A revoir car les C n'ont pas toujours la même taille, pour l'instant je l'utilise pour avancer

def extract_words_contexts(w, R):
    ids = []
    pos_con = []
    n = len(w)

    for i in range(n):
        ids.append(w[i])
        context = []
        left_context = w[max(0, i - R):i]  # words before w[i]
        right_context = w[i + 1:min(n, i + R + 1)]  # words after w[i]
        context = left_context + right_context
        if len(context) < 2 * R:
          if isinstance(context, tuple):
                context = list(context)
          context += [0] * (2 * R - len(context))
        pos_con.append(context[:2 * R])

    return ids, pos_con


# Q3

In [8]:
def flatten_dataset_to_list(dataset, R):
    all_ids = []
    all_contexts = []

    for document in dataset:
        ids, contexts = extract_words_contexts(document, R)
        all_ids.extend(ids)
        all_contexts.extend(contexts)

    return all_ids, all_contexts

In [9]:
data_tokenized_flattened = flatten_dataset_to_list(data_tokenized["review_ids"], 10)

In [10]:
len(data_tokenized_flattened)

2

In [11]:
R, K = 10, 2

# Q4

In [12]:
train_ids, train_context = flatten_dataset_to_list(document_train_set, R)
valid_ids, valid_context = flatten_dataset_to_list(document_valid_set, R)

In [13]:
print(len(document_train_set))
print(len(document_valid_set))

4000
1000


# Q5

In [14]:
from torch.utils.data import Dataset


class sentiment(Dataset):
    def __init__(self, words, contexts):
      self.words = words
      self.contexts = contexts

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx: int):
        return self.words[idx], self.contexts[idx]

In [15]:
train_set = sentiment(train_ids, train_context)
valid_set = sentiment(valid_ids, valid_context)

# Q6

In [16]:
import random

def collate_fn(batch, vocab_size, R, K):
    word_ids, pos_con_ids = zip(*batch)

    # Convert word ids and positive context ids to tensors
    word_ids = torch.tensor(word_ids)
    pos_con_ids = torch.tensor(pos_con_ids)

    # Generate negative context by sampling from the vocabulary
    neg_con_ids = torch.tensor([random.sample(range(vocab_size), 2 * R * K) for _ in range(len(word_ids))])

    return {"word_id": word_ids, "positive_context_ids": pos_con_ids, "negative_context_ids": neg_con_ids}


# Q7

In [17]:
batch_size = 10
dataloader = DataLoader(
        dataset=data_tokenized_flattened, batch_size=batch_size, collate_fn= lambda x : collate_fn(x, R, K)
    )

In [18]:
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=lambda x: collate_fn(x, vocab_size=n_samples, R=R, K=5))

valid_loader = DataLoader(valid_set, batch_size=32, shuffle=False, collate_fn=lambda x: collate_fn(x, vocab_size=n_samples, R=R, K=5))

# Q8

In [19]:
i = 0
for batch_data in train_loader:
    print(f"R = {R}, K = {K}")
    print(batch_data)
    #print(f"Word IDs shape: {batch['word_id'].shape}")
    #print(f"Positive Context IDs shape: {batch['positive_context_ids'].shape}")
    #print(f"Negative Context IDs shape: {batch['negative_context_ids'].shape}")
    print("Word IDs:", batch_data['word_id'])
    print("Positive Context:", batch_data['positive_context_ids'])
    print("Negative Context:", batch_data['negative_context_ids'])
    i+=1
    if i == 2:
      break


R = 10, K = 2
{'word_id': tensor([ 2017,  2245,  2819,  1998,  1056,  2023,  2428,  5537,  1999,  1010,
         1998, 14726, 14976,   999,  2024,  3057,  1012,  1997,  1997,  2299,
         1011,  1037,  1037,  1010,  2017,  1013,  2155,  1996,  2062,  2100,
         5488,  1997]), 'positive_context_ids': tensor([[ 1012,  2045,  2024,  2035, 11901,  1997, 21635, 12817,  2000,  2562,
         11770,  1010,  1998,  1996,  1006,  2512,  1011,  6052,  1007,  6050],
        [ 2236,  1012,  2043,  1045,  2387,  2023,  2005,  5096,  1010,  1045,
          2009,  2001,  1037, 10036, 10973,  2125,  1997, 16113,  1012,  2053],
        [ 5064,  2028,  9020,  2000, 19815,  2039,  2006,  2585, 23680, 18083,
          1998,  2507,  2032,  1037,  9152,  2361,  1012,  2049,  2004,  2065],
        [ 8022,  1006,  2004,  6382, 21909,  1007,  1010,  1037, 10571,  9431,
          4770, 15876, 22516,  2099,  1012,  2002,  8480,  2007,  3376,  2684],
        [ 5691,  2006,  6833,  2066,  6480,  1010,  2087

# Q9



In [20]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, word_id, pos_context_ids, neg_context_ids):
        word_embed = self.word_embeddings(word_id)
        pos_context_embed = self.context_embeddings(pos_context_ids)
        neg_context_embed = self.context_embeddings(neg_context_ids)
        pos_dot_product = torch.bmm(pos_context_embed, word_embed.unsqueeze(2)).squeeze(2)

        neg_dot_product = torch.bmm(neg_context_embed, word_embed.unsqueeze(2)).squeeze(2)

        pos_similarity = torch.sigmoid(pos_dot_product)  # (batch_size, 2R)
        neg_similarity = torch.sigmoid(neg_dot_product)  # (batch_size, 2R * K)

        pos_loss = -torch.log(pos_similarity + 1e-8).sum(1)  # Sum over all positive contexts
        neg_loss = -torch.log(1 - neg_similarity + 1e-8).sum(1)  # Sum over all negative contexts
        loss = pos_loss + neg_loss

        return loss.mean()

# Q10

In [37]:
def train_model(model, train_loader, E, epochs, lr =0.001):
    model = Word2Vec(vocab_size=n_samples, embedding_dim=E)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            word_id = batch['word_id']
            positive_context_ids = batch['positive_context_ids']
            negative_context_ids = batch['negative_context_ids']

            word_id = word_id.clamp(0, vocab_size - 1) # Clip to the range [0, vocab_size-1]
            positive_context_ids = positive_context_ids.clamp(0, vocab_size - 1)
            negative_context_ids = negative_context_ids.clamp(0, vocab_size - 1)

            optimizer.zero_grad()
            loss = model(word_id, positive_context_ids, negative_context_ids)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {avg_loss:.4f}')


In [41]:
# Define hyperparameters
vocab_size = n_samples  # Size of the vocabulary
embedding_dim = 100  # Dimension of the word embeddings
batch_size = 64  # Batch size (B)
epochs = 10  # Number of epochs (E)

# Create the Word2Vec model
model = Word2Vec(vocab_size, embedding_dim)

# Assuming `train_dataset` is already prepared
# Train the model
train_model(model,train_loader, embedding_dim, epochs)


Epoch [1/10], Loss: 63.3057
Epoch [2/10], Loss: 20.7194
Epoch [3/10], Loss: 19.3998
Epoch [4/10], Loss: 19.0619
Epoch [5/10], Loss: 18.9144
Epoch [6/10], Loss: 18.8256
Epoch [7/10], Loss: 18.7714
Epoch [8/10], Loss: 18.7369
Epoch [9/10], Loss: 18.7125
Epoch [10/10], Loss: 18.6827


# Q11

In [57]:
import torch.nn.functional as F

def validate_word2vec(model, test_loader, R, K, B): # updated here
    """
    Validates the Word2Vec model on a test set by checking how well
    the embeddings of words align with their positive and negative contexts.

    Args:
        model (torch.nn.Module): The trained Word2Vec model.
        test_loader (torch.utils.data.DataLoader): The test data loader. # updated here
        R (int): Radius of the context window.
        K (int): Number of negative samples per positive context word.
        B (int): Batch size.

    Returns:
        float: The average cosine similarity for positive contexts (C+).
        float: The average cosine similarity for negative contexts (C-).
    """

    # test_loader = DataLoader(test_dataset, batch_size=B, shuffle=False, collate_fn=lambda batch: collate_fn(batch, R, K)) # removed this line

    model.eval()  # Set the model to evaluation mode

    total_pos_sim = 0
    total_neg_sim = 0
    pos_count = 0
    neg_count = 0

    with torch.no_grad():  # Disable gradient computation for validation
        for batch in test_loader:
            word_id = batch['word_id']
            positive_context_ids = batch['positive_context_ids']
            negative_context_ids = batch['negative_context_ids']

            # Ensure word indices are within the valid vocabulary range
            word_id = word_id.clamp(0, model.word_embeddings.num_embeddings - 1) # Clip to the range [0, vocab_size-1]
            positive_context_ids = positive_context_ids.clamp(0, model.context_embeddings.num_embeddings - 1)
            negative_context_ids = negative_context_ids.clamp(0, model.context_embeddings.num_embeddings - 1)

            # Get embeddings for words, positive contexts, and negative contexts
            word_embeddings = model.word_embeddings(word_id)
            positive_embeddings = model.context_embeddings(positive_context_ids)
            negative_embeddings = model.context_embeddings(negative_context_ids)

            # Cosine similarity for positive contexts
            pos_similarity = F.cosine_similarity(word_embeddings.unsqueeze(1), positive_embeddings, dim=-1)
            total_pos_sim += pos_similarity.sum().item()
            pos_count += pos_similarity.numel()

            # Cosine similarity for negative contexts
            neg_similarity = F.cosine_similarity(word_embeddings.unsqueeze(1), negative_embeddings, dim=-1)
            total_neg_sim += neg_similarity.sum().item()
            neg_count += neg_similarity.numel()

    # Average cosine similarities
    avg_pos_sim = total_pos_sim / pos_count
    avg_neg_sim = total_neg_sim / neg_count

    print(f"Avg Positive Context Similarity: {avg_pos_sim:.4f}")
    print(f"Avg Negative Context Similarity: {avg_neg_sim:.4f}")

    return avg_pos_sim, avg_neg_sim

In [58]:
avg_pos_sim, avg_neg_sim = validate_word2vec(model, valid_loader, R, K, batch_size)


Avg Positive Context Similarity: 0.0006
Avg Negative Context Similarity: 0.0003


# Q12

In [59]:
 d = n_samples
 torch.save(model.state_dict(), f"model_dim-{d}_radius-{R}_ratio-{K}_batch-{batch_size}_epoch-{epochs}.ckpt")