# goal: Implement sentiment classifier using transformer architecture

In [1]:
# Import your pytorch convolution tools
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# load gensim google vectors
# import gensim.downloader as api
# word_vectors = api.load('word2vec-google-news-300')



In [2]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')


device = get_default_device()
print(device)


def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

cuda


# data download

Download data from [IMDB Dataset of 50K Movie Reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

# preprocessing

In [3]:
import re
from tqdm import tqdm
# data loading
from nltk.tokenize import TreebankWordTokenizer


# # load gensim google vectors
# word_vectors = api.load('word2vec-google-news-300')


def preprocess_data(filepath):
    """
    load data from file. convert labels from string to numbers
    """
    dataset = pd.read_csv(filepath, skiprows=0)
    # modify  dataset[1] such that positive = 1, negative=0
    dataset["sentiment"] = dataset["sentiment"].map({"positive": 1, "negative": 0})
    return dataset


def clean_and_tokenize_sample(text):
    """
    takes text as input and return tokens as output
    """
    # clean text
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # lower case
    text = text.lower()
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens
    # remove punctuations


dataset = preprocess_data("data/IMDB_Dataset.csv")


In [4]:
from collections import defaultdict

# vocabulary management
samples = [sample[0] for i, sample in dataset.iloc[::].iterrows()]
# print(samples[0])
word_count = defaultdict(int)
word2idx = {}
idx2word = {}
# loop over samples to create vocabulary of size 20000
for sample in tqdm(samples):
    for word in clean_and_tokenize_sample(sample):
        word_count[word] += 1
# sort word_count by value
sorted_word_count = sorted(word_count.items(), key=lambda kv: kv[1], reverse=True)
print(type(sorted_word_count))




100%|██████████████████████████████████████████████████| 50000/50000 [00:18<00:00, 2633.69it/s]

<class 'list'>





In [26]:
# create word2idx and idx2word
word2idx['<PAD>'] = 0
idx2word[0] = '<PAD>'
word2idx['[UNK]'] = 1
idx2word[1] = '[UNK]'

for idx, (word, count) in enumerate(sorted_word_count[:20000], start=2):
    word2idx[word] = idx
    idx2word[idx] = word

In [27]:
def vectorize_sample(text, seq_length):
    # tokenize text
    tokens = clean_and_tokenize_sample(text)
    token_idx = [word2idx.get(token, 1) for token in tokens]
    padding_numbers = [0] * (seq_length - len(token_idx))
    token_idx = token_idx + padding_numbers
    return token_idx[:seq_length]

# vectorize_sample(samples[0], seq_length=400)

# test train split

In [28]:

split_point = int(len(dataset) * .8)

x_train = [sample[0] for i, sample in dataset.iloc[1:split_point, :].iterrows()]
y_train = [sample[1] for i, sample in dataset.iloc[1:split_point, :].iterrows()]

x_test = [sample[0] for i, sample in dataset.iloc[split_point:, :].iterrows()]
y_test = [sample[1] for i, sample in dataset.iloc[split_point:, :].iterrows()]




In [29]:
# tokenize_and_vectorize_sample(x_train[0])
print(len(y_train))

39999


In [43]:
def generate_batch(x_train, y_train, batch_size):
    next_x_batch, next_y_batch = [], []
    with tqdm(total=len(x_train), position=0, leave=True) as pbar:
        for ip, output in zip(x_train, y_train):
            next_x_batch.append(ip)
            next_y_batch.append(output)
            if len(next_x_batch) == batch_size:
                yield next_x_batch, next_y_batch
                next_x_batch, next_y_batch = [], []
                pbar.update(batch_size)

In [75]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# from torch.utils.data import DataLoader

# Define the PositionalEmbedding layer
class PositionalEmbedding(nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super(PositionalEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Embedding(sequence_length, embed_dim)

    def forward(self, x):
        x = self.embedding(x)
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        x = x + self.positional_encoding(positions)
        return x


# Define the TransformerEncoder layer
class TransformerEncoder(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads):
        super(TransformerEncoder, self).__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, dense_dim),
            nn.ReLU(),
            nn.Linear(dense_dim, dense_dim)
        )

    def forward(self, x):
        attn_output, _ = self.self_attn(x, x, x)
        x = x + attn_output
        x = self.feed_forward(x)
        return x


# Define the model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, dense_dim, num_heads):
        super(TransformerModel, self).__init__()
        self.embedding = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(dense_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, padding_mask=True):
        # Apply padding mask
        # print(x)
        if padding_mask is not None:
            # print(f"before: x.shape: {x.shape}")
            # remove 0 padded values from tensor x
            x= x[:, x.sum(axis=0) != 0]
            # print(f"after: x.shape: {x.shape}")
            # x = x.masked_fill(padding_mask.unsqueeze(1).unsqueeze(2), 0.0)

        x = self.embedding(x)
        x = self.encoder(x)
        x = x.permute(0, 2, 1)
        x = self.pooling(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x



In [96]:
# Create the model
vocab_size = 20000
sequence_length = 400
embed_dim = 256
num_heads = 2
dense_dim = 64
batch_size = 100

t_model = TransformerModel(vocab_size, sequence_length, embed_dim, dense_dim, num_heads)
t_model = to_device(t_model, device)
# Define loss and optimizer
criterion = nn.BCELoss()
# optimizer = optim.RMSprop(t_model.parameters())
optimizer = optim.Adam(t_model.parameters())

# Print model summary (not as detailed as Keras)
print(t_model)


TransformerModel(
  (embedding): PositionalEmbedding(
    (embedding): Embedding(20000, 256)
    (positional_encoding): Embedding(400, 256)
  )
  (encoder): TransformerEncoder(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (feed_forward): Sequential(
      (0): Linear(in_features=256, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
    )
  )
  (pooling): AdaptiveMaxPool1d(output_size=1)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [97]:
# Training loop
epochs = 25
# for epoch in range(epochs):
#     model.train()
#     for data, target in int_train_ds:
#         optimizer.zero_grad()
#         output = model(data)
#         loss = criterion(output, target)
#         loss.backward()
#         optimizer.step()


# Training loop
num_epochs = epochs  # Example value for epochs
for epoch in tqdm(range(num_epochs)):
    t_model.train()
    loss_val = 0
    for i, (x_batch, y_batch) in enumerate(generate_batch(x_train, y_train, batch_size=batch_size)):
        # print(i)
        x_batch = [vectorize_sample(sample, seq_length=sequence_length) for sample in x_batch]
        # print(f"len(x_batch: {len(x_batch)}")
        # print(f"[len(x) for x in x_batch]: {[len(x) for x in x_batch]}")
        # print(f"len(x_batch[0][0]: {len(x_batch[0][0])}")
        x_batch = Variable(torch.LongTensor(x_batch))
        x_batch = to_device(x_batch, device)
        # x_batch = x_batch.permute(0, 2, 1)
        # print(x_batch.shape)
        y_batch = to_device(Variable(torch.FloatTensor([y_batch])), device)
        y_batch = y_batch.reshape(batch_size, 1)
        outputs = t_model(x_batch)
        # print(outputs)
        # print(y_batch)
        # print(f"outputs[:5]: {outputs[:5]}")
        # print(f"y_batch[:5]: {y_batch[:5]}")
        loss = criterion(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        loss_val += loss.item()
        optimizer.step()
        # if i==200:
        #     break
    print(f"epoch({epoch}): total_loss={loss_val}")
    loss_val = 0


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:53<00:00, 742.13it/s]
  4%|██▎                                                        | 1/25 [00:53<21:30, 53.77s/it]

epoch(0): total_loss=251.29403686523438


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:55<00:00, 715.18it/s]
  8%|████▋                                                      | 2/25 [01:49<21:04, 54.96s/it]

epoch(1): total_loss=208.88239336013794


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:56<00:00, 705.22it/s]
 12%|███████                                                    | 3/25 [02:46<20:25, 55.70s/it]

epoch(2): total_loss=184.27316710352898


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:57<00:00, 695.26it/s]
 16%|█████████▍                                                 | 4/25 [03:43<19:43, 56.37s/it]

epoch(3): total_loss=166.67442199587822


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:57<00:00, 695.51it/s]
 20%|███████████▊                                               | 5/25 [04:40<18:54, 56.73s/it]

epoch(4): total_loss=155.39639689028263


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:57<00:00, 690.40it/s]
 24%|██████████████▏                                            | 6/25 [05:38<18:04, 57.09s/it]

epoch(5): total_loss=145.0745263695717


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:57<00:00, 692.44it/s]
 28%|████████████████▌                                          | 7/25 [06:36<17:10, 57.27s/it]

epoch(6): total_loss=134.1128852814436


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:57<00:00, 692.22it/s]
 32%|██████████████████▉                                        | 8/25 [07:33<16:15, 57.39s/it]

epoch(7): total_loss=124.08535158634186


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:57<00:00, 691.55it/s]
 36%|█████████████████████▏                                     | 9/25 [08:31<15:19, 57.49s/it]

epoch(8): total_loss=118.78488986194134


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:57<00:00, 691.68it/s]
 40%|███████████████████████▏                                  | 10/25 [09:29<14:23, 57.55s/it]

epoch(9): total_loss=108.06754359602928


100%|██████████████████████████████████████████████████▊| 39900/39999 [01:58<00:00, 335.66it/s]
 44%|█████████████████████████▌                                | 11/25 [11:28<17:48, 76.32s/it]

epoch(10): total_loss=99.30397672951221


100%|██████████████████████████████████████████████████▊| 39900/39999 [00:50<00:00, 788.42it/s]
 48%|███████████████████████████▊                              | 12/25 [12:18<14:50, 68.50s/it]

epoch(11): total_loss=99.04220001399517


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:33<00:00, 1185.63it/s]
 52%|██████████████████████████████▏                           | 13/25 [12:52<11:35, 57.94s/it]

epoch(12): total_loss=91.42107579112053


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:33<00:00, 1182.71it/s]
 56%|████████████████████████████████▍                         | 14/25 [13:26<09:16, 50.63s/it]

epoch(13): total_loss=81.95591749250889


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:33<00:00, 1181.92it/s]
 60%|██████████████████████████████████▊                       | 15/25 [14:00<07:35, 45.55s/it]

epoch(14): total_loss=74.065638191998


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:33<00:00, 1179.37it/s]
 64%|█████████████████████████████████████                     | 16/25 [14:33<06:18, 42.02s/it]

epoch(15): total_loss=69.23877700045705


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:33<00:00, 1176.15it/s]
 68%|███████████████████████████████████████▍                  | 17/25 [15:07<05:16, 39.59s/it]

epoch(16): total_loss=68.51140880957246


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1168.02it/s]
 72%|█████████████████████████████████████████▊                | 18/25 [15:41<04:25, 37.96s/it]

epoch(17): total_loss=66.5126073770225


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1172.53it/s]
 76%|████████████████████████████████████████████              | 19/25 [16:15<03:40, 36.78s/it]

epoch(18): total_loss=61.380131382495165


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1172.02it/s]
 80%|██████████████████████████████████████████████▍           | 20/25 [16:50<02:59, 35.96s/it]

epoch(19): total_loss=59.749193392693996


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1166.24it/s]
 84%|████████████████████████████████████████████████▋         | 21/25 [17:24<02:21, 35.44s/it]

epoch(20): total_loss=58.772314205765724


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1161.81it/s]
 88%|███████████████████████████████████████████████████       | 22/25 [17:58<01:45, 35.11s/it]

epoch(21): total_loss=53.74704580195248


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1161.34it/s]
 92%|█████████████████████████████████████████████████████▎    | 23/25 [18:32<01:09, 34.89s/it]

epoch(22): total_loss=52.435563907027245


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1156.44it/s]
 96%|███████████████████████████████████████████████████████▋  | 24/25 [19:07<00:34, 34.77s/it]

epoch(23): total_loss=58.75098267570138


100%|█████████████████████████████████████████████████▉| 39900/39999 [00:34<00:00, 1156.78it/s]
100%|██████████████████████████████████████████████████████████| 25/25 [19:41<00:00, 47.28s/it]

epoch(24): total_loss=50.378406984731555





In [98]:
# Save the model
model_name = f"transformer_{vocab_size}_{sequence_length}_{embed_dim}_{dense_dim}_{num_heads}.pth"
torch.save(t_model.state_dict(), model_name)




In [99]:
# Load the model
model_name = f"transformer_{vocab_size}_{sequence_length}_{embed_dim}_{dense_dim}_{num_heads}.pth"
loaded_model = TransformerModel(vocab_size, sequence_length, embed_dim, dense_dim, num_heads)
loaded_model = to_device(loaded_model, device)
loaded_model.load_state_dict(torch.load(model_name))
loaded_model.eval()



TransformerModel(
  (embedding): PositionalEmbedding(
    (embedding): Embedding(20000, 256)
    (positional_encoding): Embedding(400, 256)
  )
  (encoder): TransformerEncoder(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (feed_forward): Sequential(
      (0): Linear(in_features=256, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
    )
  )
  (pooling): AdaptiveMaxPool1d(output_size=1)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [100]:
# evaluation
import torch
from sklearn.metrics import accuracy_score


# Load the model and prepare input data (as shown in the previous responses)

def evaluate(x_test, y_test, batch_size=1):
    print(f"len(x_test) == len(y_test): {len(x_test)} == {len(y_test)}")
    predictions = []
    
    batches = int(len(x_test) / batch_size) + 1
    for i in tqdm(range(batches)):
        x_batch = x_test[i:i + batch_size]
        y_batch = y_test[i:i + batch_size]

        if not x_batch or not y_batch:
            break

        x_batch = [vectorize_sample(sample, seq_length=sequence_length) for sample in x_batch]

        x_batch = Variable(torch.LongTensor(x_batch))
        x_batch = to_device(x_batch, device)

        y_batch = to_device(Variable(torch.FloatTensor([y_batch])), device)
        y_batch = y_batch.reshape(batch_size, 1)
        y_batch = to_device(y_batch, device)
        # print(x_batch.shape)
        # Perform inference on the test data
        with torch.no_grad():
            # Forward pass to get predictions
            batch_predictions = loaded_model(x_batch)
            # Assuming 'predictions' is the model's predictions (binary values)
            # print(predictions)
            # Convert predictions to binary values based on a threshold (e.g., 0.5 for binary classification)
            print(f"batch_predictions: {batch_predictions}")
            
            threshold = 0.5
            binary_predictions = (batch_predictions > threshold).float()
            
            # print(f"binary_predictions: {binary_predictions}")
            binary_predictions = [bp.squeeze(0).cpu() for bp in binary_predictions]
            # print(f"binary_predictions: {binary_predictions}")
            predictions.extend(binary_predictions)

            # print(f"len(binary_predictions):{len(binary_predictions)}")

    total = min(len(y_test), len(predictions))
    # print(y_test[:total])
    # print(predictions[:total])
    accuracy = accuracy_score(y_test[:total], predictions[:total])

    print("Accuracy:", accuracy)


# evaluate(x_test, y_test)
x_positive_train = [x for i,x in enumerate(x_train) if y_train[i]==1]
y_positive_train = [y for y in y_train if y==1]

x_positive_test = [x for i,x in enumerate(x_test) if y_train[i]==1]
y_positive_test = [y for y in y_test if y==1]
# evaluate(x_train, y_train)
evaluate(x_positive_test[:100], y_positive_test[:100])
# print(len([y for y in y_test if y==1]))
# y_test

len(x_test) == len(y_test): 100 == 100


  0%|                                                                  | 0/101 [00:00<?, ?it/s]

batch_predictions: tensor([[0.0505]], device='cuda:0')
batch_predictions: tensor([[0.1124]], device='cuda:0')
batch_predictions: tensor([[0.1421]], device='cuda:0')
batch_predictions: tensor([[0.5070]], device='cuda:0')
batch_predictions: tensor([[0.1407]], device='cuda:0')
batch_predictions: tensor([[0.2843]], device='cuda:0')
batch_predictions: tensor([[0.5601]], device='cuda:0')
batch_predictions: tensor([[0.2724]], device='cuda:0')
batch_predictions: tensor([[0.0071]], device='cuda:0')
batch_predictions: tensor([[0.1412]], device='cuda:0')
batch_predictions: tensor([[0.0731]], device='cuda:0')
batch_predictions: tensor([[0.2521]], device='cuda:0')
batch_predictions: tensor([[0.1214]], device='cuda:0')
batch_predictions: tensor([[0.0571]], device='cuda:0')
batch_predictions: tensor([[0.0105]], device='cuda:0')
batch_predictions: tensor([[0.2335]], device='cuda:0')
batch_predictions: tensor([[0.0139]], device='cuda:0')
batch_predictions: tensor([[0.0680]], device='cuda:0')
batch_pred

 38%|█████████████████████                                   | 38/101 [00:00<00:00, 379.06it/s]

batch_predictions: tensor([[0.0090]], device='cuda:0')
batch_predictions: tensor([[0.3203]], device='cuda:0')
batch_predictions: tensor([[0.0616]], device='cuda:0')
batch_predictions: tensor([[0.2738]], device='cuda:0')
batch_predictions: tensor([[0.0689]], device='cuda:0')
batch_predictions: tensor([[0.0510]], device='cuda:0')
batch_predictions: tensor([[0.0119]], device='cuda:0')
batch_predictions: tensor([[0.0822]], device='cuda:0')
batch_predictions: tensor([[0.1129]], device='cuda:0')
batch_predictions: tensor([[0.0128]], device='cuda:0')
batch_predictions: tensor([[0.1963]], device='cuda:0')
batch_predictions: tensor([[0.1965]], device='cuda:0')
batch_predictions: tensor([[0.0694]], device='cuda:0')
batch_predictions: tensor([[0.0975]], device='cuda:0')
batch_predictions: tensor([[0.0102]], device='cuda:0')
batch_predictions: tensor([[0.0261]], device='cuda:0')
batch_predictions: tensor([[0.6683]], device='cuda:0')
batch_predictions: tensor([[0.0019]], device='cuda:0')
batch_pred

 77%|███████████████████████████████████████████▏            | 78/101 [00:00<00:00, 388.31it/s]

batch_predictions: tensor([[0.0712]], device='cuda:0')


 99%|██████████████████████████████████████████████████████▍| 100/101 [00:00<00:00, 387.19it/s]

batch_predictions: tensor([[0.5114]], device='cuda:0')
batch_predictions: tensor([[0.1105]], device='cuda:0')
batch_predictions: tensor([[0.0154]], device='cuda:0')
batch_predictions: tensor([[0.7924]], device='cuda:0')
batch_predictions: tensor([[0.1584]], device='cuda:0')
batch_predictions: tensor([[0.0186]], device='cuda:0')
batch_predictions: tensor([[0.1243]], device='cuda:0')
batch_predictions: tensor([[0.0027]], device='cuda:0')
batch_predictions: tensor([[0.2192]], device='cuda:0')
batch_predictions: tensor([[0.8620]], device='cuda:0')
batch_predictions: tensor([[0.0695]], device='cuda:0')
batch_predictions: tensor([[0.0456]], device='cuda:0')
batch_predictions: tensor([[0.0346]], device='cuda:0')
batch_predictions: tensor([[0.0198]], device='cuda:0')
batch_predictions: tensor([[0.0594]], device='cuda:0')
batch_predictions: tensor([[0.8122]], device='cuda:0')
batch_predictions: tensor([[0.0459]], device='cuda:0')
batch_predictions: tensor([[0.0576]], device='cuda:0')
batch_pred




# evaluation

In [101]:
# inference