In [173]:
%%capture
!pip install transformers datasets tabulate

In [174]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import DataLoader
from tabulate import tabulate
from datasets import load_dataset

from tqdm.notebook import tqdm
from transformers import BertTokenizer
torch.cuda.is_available()

True

This is a template of the notebook that you should complete and enrich with your own code.

First cells will be the same than the ones of the lab on text convolution.

# Data loading


In [175]:
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


In [176]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


In [177]:
def preprocessing_fn(x, tokenizer):
    x["review_ids"] = tokenizer(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )["input_ids"]
    x["label"] = 0 if x["sentiment"] == "negative" else 1
    return x


In [178]:
n_samples = 10000  # the number of training example

# We first shuffle the data !
data_shuffled = dataset.shuffle()

# Select 5000 samples
data_shuffled_sampled = data_shuffled.select(range(n_samples))

# Tokenize the dataset
data_tokenized = data_shuffled_sampled.map(lambda x: preprocessing_fn(x, tokenizer))

# Remove useless columns
data_tokenized = data_tokenized.remove_columns(["review", "sentiment"])

# Split the train and validation
split = data_tokenized.train_test_split(test_size=0.2)
train_set = split['train']
valid_set = split["test"]


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [179]:
from torch.utils.data import Dataset

class sentiment(Dataset):
    def __init__(self, review_label_set):
      self.sentences = review_label_set["review_ids"]
      self.labels = review_label_set["label"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx: int):
        return self.sentences[idx], self.labels[idx]

In [180]:
def collate_fn(batch, max_size = 256):
    sentences_padded = []
    labels = []

    for sentence, label in batch:

        sentence_padded = [0]*max_size
        sentence_padded[:len(sentence)] = sentence
        sentences_padded.append(sentence_padded)
        labels.append(label)

    return {
        'sentences_padded': torch.tensor(sentences_padded, dtype=torch.long),
        'labels': torch.tensor(labels, dtype=torch.long),
    }

In [181]:
train_dataset = sentiment(train_set)
valid_dataset = sentiment(valid_set)

In [182]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [183]:
batch = next(iter(train_loader))
batch['sentences_padded'].shape


torch.Size([32, 256])

# Model

In [184]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim,device = 'cuda'):
        super(Word2Vec, self).__init__()
        self.device = device
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim).to(device)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim).to(device)

In [185]:
vocab_size = tokenizer.vocab_size
embedding_dim = 100
word2vec = Word2Vec(vocab_size,embedding_dim)
word2vec.load_state_dict(torch.load("model2_dim-100_radius-10_ratio-5_batch-1024_epoch-10_samples-10000.ckpt",weights_only=True))

<All keys matched successfully>

In [186]:
class CNNTextClassifier(nn.Module):
    def __init__(self,word2vec, vocab_size, embedding_dim, embedding_matrix, num_classes=1):
        super(CNNTextClassifier, self).__init__()
        
        # Embedding layer
        self.embedding = word2vec.word_embeddings
        #self.embedding.weight = nn.Parameter(embedding_matrix)
        #self.embedding.weight.requires_grad = learnable_embedding  # Do not train the embedding layer

        # First convolutional block
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=5)
        self.pool1 = nn.MaxPool1d(kernel_size=2)  # Reduce by half
        
        # Second convolutional block
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5)
        self.pool2 = nn.MaxPool1d(kernel_size=2)  # Reduce by half again
        
        # Third convolutional block (optional for deeper networks)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5)
        self.pool3 = nn.MaxPool1d(kernel_size=2)  # Reduce further
        
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 29, 10)  # Output after 3 max pools (256 -> 128 -> 64 -> 32 -> 29)
        self.fc2 = nn.Linear(10, num_classes)

    def forward(self, x):
        # Embedding: (batch_size, sequence_length, embedding_dim)
        x = self.embedding(x)
        
        # Permute for Conv1D: (batch_size, embedding_dim, sequence_length)
        x = x.permute(0, 2, 1)
        
        # Convolutional Block 1: Conv1D + MaxPool
        x = torch.relu(self.conv1(x))  # (batch_size, 128, 252)
        x = self.pool1(x)              # (batch_size, 128, 126)
        
        # Convolutional Block 2: Conv1D + MaxPool
        x = torch.relu(self.conv2(x))  # (batch_size, 128, 122)
        x = self.pool2(x)              # (batch_size, 128, 61)
        
        # Convolutional Block 3: Conv1D + MaxPool (optional)
        x = torch.relu(self.conv3(x))  # (batch_size, 128, 57)
        x = self.pool3(x)              # (batch_size, 128, 29)
        
        # Flatten the output for the fully connected layer
        x = x.view(x.size(0), -1)      # (batch_size, 128 * 29)
        
        # Fully connected layers
        x = torch.relu(self.fc1(x))    # (batch_size, 10)
        x = torch.sigmoid(self.fc2(x)) # (batch_size, num_classes)
        
        return x

In [187]:
model = CNNTextClassifier(word2vec,vocab_size, embedding_dim, embedding_matrix)

In [188]:
criterion = nn.BCELoss()  # Binary Cross-Entropy for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}')

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not str