# goal: Implement sentiment classifier using transformer architecture

In [1]:
# Import your pytorch convolution tools
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# load gensim google vectors
import gensim.downloader as api
word_vectors = api.load('word2vec-google-news-300')



In [2]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
device = get_default_device()
print(device)


def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

cuda


# data download

Download data from [IMDB Dataset of 50K Movie Reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

# preprocessing

In [3]:
from tqdm import tqdm
# data loading
from nltk.tokenize import TreebankWordTokenizer


# # load gensim google vectors
# word_vectors = api.load('word2vec-google-news-300')


def preprocess_data(filepath):
    """
    load data from file. convert labels from string to numbers
    """
    dataset = pd.read_csv(filepath,skiprows=0)
    # modify  dataset[1] such that positive = 1, negative=0
    dataset["sentiment"] = dataset["sentiment"].map({"positive": 1, "negative": 0})
    return dataset


def tokenize_and_vectorize_sample(sample, max_len=400, embedding_dims=300):
    """
    takes text as input and return word vectors as output
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    tokens = tokenizer.tokenize(sample)
    sample_vecs = []
    zero_vector = []
    for _ in range(embedding_dims):
        zero_vector.append(0.0)
    
    for token in tokens:
        try:
            sample_vecs.append(word_vectors[token])
            if len(sample_vecs)>= max_len:
                return sample_vecs
            # print(f"keeping: {token}")
        except KeyError:
            # print(f"skipping: {token}")
            pass  # No matching token in the Google w2v vocab

    additional_elems = max((max_len - len(sample_vecs)),0)
    # print(f"max(({max_len} - {len(sample_vecs)}),0):{additional_elems}")
    for _ in range(additional_elems):
        sample_vecs.append(zero_vector)
    return sample_vecs


dataset = preprocess_data("data/IMDB_Dataset.csv")


# test train split

In [4]:

split_point = int(len(dataset)*.8)

x_train = [sample[0] for i, sample in dataset.iloc[1:split_point,:].iterrows()]
y_train = [sample[1] for i, sample in dataset.iloc[1:split_point,:].iterrows()]

x_test = [sample[0] for i, sample in dataset.iloc[split_point:,:].iterrows()]
y_test = [sample[1] for i, sample in dataset.iloc[split_point:,:].iterrows()]


In [5]:
# tokenize_and_vectorize_sample(x_train[0])
print(len(x_train[0].split()))

162


In [6]:
def generate_batch(x_train, y_train, batch_size):
    next_x_batch, next_y_batch = [], []
    with tqdm(total=len(x_train), position=0, leave=True) as pbar:
        for ip, output in zip(x_train, y_train):
            next_x_batch.append(ip)
            next_y_batch.append(output)
            if len(next_x_batch) == batch_size:
                yield next_x_batch, next_y_batch
                next_x_batch, next_y_batch = [], []
                pbar.update(batch_size)

In [7]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# from torch.utils.data import DataLoader

# Define the PositionalEmbedding layer
class PositionalEmbedding(nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super(PositionalEmbedding, self).__init__()
        self.positional_encoding = nn.Embedding(sequence_length, embed_dim)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        x = x + self.positional_encoding(positions)
        return x

# Define the TransformerEncoder layer
class TransformerEncoder(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads):
        super(TransformerEncoder, self).__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, dense_dim),
            nn.ReLU(),
            nn.Linear(dense_dim, embed_dim)
        )

    def forward(self, x):
        attn_output, _ = self.self_attn(x, x, x)
        x = x + attn_output
        x = self.feed_forward(x)
        return x

# Define the model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, sequence_length, embed_dim, dense_dim, num_heads):
        super(TransformerModel, self).__init__()
        self.embedding = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x,padding_mask=True):
        # Apply padding mask
        if padding_mask is not None:
            # print(f"before: x.shape: {x.shape}")
            idx = (x != 0.).all(2).all(0)
            x = x[:, idx]
            # print(f"after: x.shape: {x.shape}")
            # x = x.masked_fill(padding_mask.unsqueeze(1).unsqueeze(2), 0.0)
            
        
        x = self.embedding(x)
        x = self.encoder(x)
        x = x.permute(0, 2, 1)
        x = self.pooling(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x



In [18]:
# Create the model
vocab_size = 20000
sequence_length = 400
embed_dim = 300
num_heads = 2
dense_dim = 64
batch_size=1

t_model = TransformerModel(vocab_size, sequence_length, embed_dim, dense_dim, num_heads)
t_model = to_device(t_model, device)
# Define loss and optimizer
criterion = nn.BCELoss()
# optimizer = optim.RMSprop(t_model.parameters())
optimizer = optim.Adam(t_model.parameters())

# Print model summary (not as detailed as Keras)
print(t_model)


TransformerModel(
  (embedding): PositionalEmbedding(
    (positional_encoding): Embedding(400, 300)
  )
  (encoder): TransformerEncoder(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
    )
    (feed_forward): Sequential(
      (0): Linear(in_features=300, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=300, bias=True)
    )
  )
  (pooling): AdaptiveMaxPool1d(output_size=1)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [None]:
# Training loop
epochs = 10
# for epoch in range(epochs):
#     model.train()
#     for data, target in int_train_ds:
#         optimizer.zero_grad()
#         output = model(data)
#         loss = criterion(output, target)
#         loss.backward()
#         optimizer.step()


# Training loop
num_epochs = epochs  # Example value for epochs
for epoch in tqdm(range(num_epochs)):
    t_model.train()     
    loss_val = 0
    for i, (x_batch, y_batch) in enumerate(generate_batch(x_train, y_train, batch_size=batch_size)):
        # print(i)
        x_batch = [tokenize_and_vectorize_sample(sample, max_len=sequence_length) for sample in x_batch]
        # print(f"len(x_batch: {len(x_batch)}")
        # print(f"[len(x) for x in x_batch]: {[len(x) for x in x_batch]}")
        # print(f"len(x_batch[0][0]: {len(x_batch[0][0])}")
        x_batch = Variable(torch.FloatTensor(x_batch))
        x_batch = to_device(x_batch, device)
        # x_batch = x_batch.permute(0, 2, 1)
        # print(x_batch.shape)
        y_batch = to_device(Variable(torch.FloatTensor([y_batch])), device)
        y_batch = y_batch.reshape(batch_size,1)
        outputs = t_model(x_batch)
        # print(outputs)
        # print(y_batch)
        # print(f"outputs[:5]: {outputs[:5]}")
        # print(f"y_batch[:5]: {y_batch[:5]}")
        loss = criterion(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        loss_val += loss.item()
        optimizer.step()
        # if i==200:
        #     break
    print(f"epoch({epoch}): total_loss={loss_val}")
    loss_val=0


100%|███████████████████████████████████████████████████| 39999/39999 [04:32<00:00, 146.56it/s]
 10%|█████▊                                                    | 1/10 [04:32<40:56, 272.92s/it]

epoch(0): total_loss=1223987.9144337692


100%|███████████████████████████████████████████████████| 39999/39999 [04:27<00:00, 149.47it/s]
 20%|███████████▌                                              | 2/10 [09:00<35:58, 269.80s/it]

epoch(1): total_loss=2000700.0


100%|███████████████████████████████████████████████████| 39999/39999 [04:24<00:00, 151.07it/s]
 30%|█████████████████▍                                        | 3/10 [13:25<31:12, 267.50s/it]

epoch(2): total_loss=2000700.0


100%|███████████████████████████████████████████████████| 39999/39999 [04:26<00:00, 149.99it/s]
 40%|███████████████████████▏                                  | 4/10 [17:51<26:43, 267.18s/it]

epoch(3): total_loss=2000700.0


100%|███████████████████████████████████████████████████| 39999/39999 [04:24<00:00, 151.14it/s]
 50%|█████████████████████████████                             | 5/10 [22:16<22:11, 266.27s/it]

epoch(4): total_loss=2000700.0


100%|███████████████████████████████████████████████████| 39999/39999 [04:22<00:00, 152.23it/s]
 60%|██████████████████████████████████▊                       | 6/10 [26:39<17:40, 265.07s/it]

epoch(5): total_loss=2000700.0


100%|███████████████████████████████████████████████████| 39999/39999 [04:24<00:00, 151.01it/s]
 70%|████████████████████████████████████████▌                 | 7/10 [31:04<13:15, 265.01s/it]

epoch(6): total_loss=2000700.0


100%|███████████████████████████████████████████████████| 39999/39999 [04:24<00:00, 151.18it/s]
 80%|██████████████████████████████████████████████▍           | 8/10 [35:28<08:49, 264.87s/it]

epoch(7): total_loss=2000700.0


 34%|█████████████████▏                                 | 13476/39999 [01:31<03:05, 143.37it/s]

In [None]:
# Save the model
model_name = f"transformer_{vocab_size}_{sequence_length}_{embed_dim}_{dense_dim}_{num_heads}.pth"
torch.save(t_model.state_dict(), model_name)

# Load the model
loaded_model = TransformerModel(vocab_size, sequence_length, embed_dim, dense_dim, num_heads)
loaded_model = to_device(loaded_model, device)
loaded_model.load_state_dict(torch.load(model_name))
loaded_model.eval()




In [None]:
# evaluation
import torch
from sklearn.metrics import accuracy_score

# Load the model and prepare input data (as shown in the previous responses)

def evaluate(x_test, y_test, batch_size=1):
    print(f"len(x_test) == len(y_test): {len(x_test)} == {len(y_test)}")
    predictions = []
    batches = int(len(x_test)/batch_size) +1
    for i in tqdm(range(batches)):
        x_batch = x_test[i:i+batch_size]
        y_batch = y_test[i:i+batch_size]
        
        if not x_batch or not y_batch:
            break
        
        x_batch = [tokenize_and_vectorize_sample(sample, max_len=sequence_length) for sample in x_batch]
        
        x_batch = Variable(torch.FloatTensor(x_batch))
        x_batch = to_device(x_batch, device)
        
        
        y_batch = to_device(Variable(torch.FloatTensor([y_batch])), device)
        y_batch = y_batch.reshape(batch_size,1)
        y_batch = to_device(y_batch, device)
        # print(x_batch.shape)
        # Perform inference on the test data
        with torch.no_grad():
            # Forward pass to get predictions
            batch_predictions = loaded_model(x_batch)
            # print(f"batch_predictions: {batch_predictions}")
            # Assuming 'predictions' is the model's predictions (binary values)
            # print(predictions)
            # Convert predictions to binary values based on a threshold (e.g., 0.5 for binary classification)
            threshold = 0.5
            binary_predictions = (batch_predictions > threshold).float()
            binary_predictions = [bp.squeeze(0).cpu() for bp in binary_predictions]
            # print(f"binary_predictions.squeeze(): {binary_predictions.squeeze()}")
            predictions.extend(binary_predictions)
            
            # print(f"len(binary_predictions):{len(binary_predictions)}")
    
    total = min(len(y_test), len(predictions))
    # print(y_test[:total])
    # print(predictions[:total])
    accuracy = accuracy_score(y_test[:total], predictions[:total])
    
    print("Accuracy:", accuracy)

evaluate(x_train, y_train)
# print(type(x_test))

# evaluation

# inference