###   imdb_dense_torch_001.ipynb

1. Run under a PyTorch virtual env

2. Get the imdb_data from the files created in imdb_dense_torch_0000.ipynb

3. Modify the data structures as necessary

4. Use the data to test a PyTorch Dense model with parameters similar to the ones we used under TF/Keras

5. Compare results vis-a vis the results using TF Dense Models 





In [1]:
import numpy as np
import pickle

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


In [2]:
# Mimic the IMDB Data, which is typically set as list of (review, label) tuples.
# The reviews are lists of word indices, from 1...10000, going from more important to less important
# The labels are 0 or 1, indicating a bad or good review, respectively.

# I created the data manually, to ensure that the data preprocessing, the models etc, work well.
# There are only three different lists in this data to ensure that models can learn and predict
# with high accuracy. Results with low accuracy will reveal problems in the data pre-processing,
# the models configurations, or both...

imdb_fake_data = [
    ([1, 2, 3, 4, 5], 1),  # Review: [word1, word2, ...], Label: 1
    ([6, 7, 8], 0),        # Review: [word1, word2, ...], Label: 0
    ([1, 2, 3, 4, 5], 1),
    ([1, 2, 3, 4, 5], 1),
    ([1, 2, 3, 4, 5], 1),
    ([1, 2, 3, 4, 5], 1),
    ([1, 2, 3, 4, 5], 1),
    ([1, 2, 3, 4, 5], 1),
    ([6, 7, 8], 0),
    ([6, 7, 8], 0),
    ([6, 7, 8], 0),
    ([6, 7, 8], 0),
    ([6, 7, 8], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    ([15, 16], 0),
    # ... more data
]


In [3]:
# Use pickle to write/read imdb_fake data to/from disk
# import pickle

# Save to disk
with open("./imdb_fake_data.pkl", "wb") as f:
   pickle.dump(imdb_fake_data, f)

# Load from disk
with open("./imdb_fake_data.pkl", "rb") as f:
   imdb_fake_ragged_data = pickle.load(f)

print("Orig fake data")
print(imdb_fake_data)
print("fake data from disk")
print(imdb_fake_ragged_data)   


#-----------------------------------------------------------------
with open("./ragged_array.pkl", "rb") as f:
   tf_ragged_data = pickle.load(f)

print("TF ragged data from disk")
print(tf_ragged_data)

print("lofas" )
for i in tf_ragged_data :
   print( i )


with open("./train_data_as_ragged_array.pkl", "rb") as f:
   tf_train_data_as_ragged_array = pickle.load(f)

with open("./train_labels_as_ragged_array.pkl", "rb") as f:
   tf_train_labels_as_ragged_array = pickle.load(f)




Orig fake data
[([1, 2, 3, 4, 5], 1), ([6, 7, 8], 0), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([6, 7, 8], 0), ([6, 7, 8], 0), ([6, 7, 8], 0), ([6, 7, 8], 0), ([6, 7, 8], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0)]
fake data from disk
[([1, 2, 3, 4, 5], 1), ([6, 7, 8], 0), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([1, 2, 3, 4, 5], 1), ([6, 7, 8], 0), ([6, 7, 8], 0), ([6, 7, 8], 0), ([6, 7, 8], 0), ([6, 7, 8], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0), ([15, 16], 0)]
TF ragged data from disk
[list([1, 14, 22, 16, 43, 530, 32])
 list([1, 194, 1153, 194, 8255, 78, 95])
 list([1, 14, 47, 8, 30, 31, 7, 4, 249, 7, 129, 13])]
lofas
[1, 14, 22, 16, 43, 530, 32]
[1

In [4]:
class ReviewData:  # Custom class to control printing
    def __init__(self, data):
        self.data = np.array(data, dtype=object)

    def __str__(self):  # Override the string representation
        return "[" + " ".join(str(tuple(item)) for item in self.data) + "]"

    def __getitem__(self, index):  # Allow indexing
        return self.data[index]
    
    

In [5]:
data1=ReviewData(imdb_fake_data)
print(data1.__getitem__(1))

[list([6, 7, 8]) 0]


In [6]:

tf_train_data  = tf_train_data_as_ragged_array
tf_train_labels = tf_train_labels_as_ragged_array 
tf_train_labels = tf_train_labels.astype(object)

nRows = 25000
nCols = 2
imdb_pt_data = np.empty(0, dtype=object)
imdb_pt_data.resize(nRows)

# note: since the revies are long, placed the labels first when I was verifying that the data is being set up correctly
for i in range(nRows):
  d = tf_train_data[i]
  l = tf_train_labels[i]
  #print("label[",i, "] = ", l)
  #print("data [",i, "] = ", d )
  ll = (d,l)
  imdb_pt_data[i] = ll
  #print("ll = ", ll )
  #print("")


for i in range(3):
  print("i")
  print( imdb_pt_data[i])



i
([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32], 1)
i
([1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 436

In [8]:
# 2. Custom Dataset Class
class IMDBDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review, label = self.data[idx]
        return torch.tensor(review), torch.tensor(label)  # Convert to tensors

In [7]:
# 3. Data Loading with Padding
def collate_fn(batch):
    reviews, labels = zip(*batch)
    review_lengths = torch.tensor([len(r) for r in reviews])  # Store original lengths
    padded_reviews = pad_sequence(reviews, batch_first=True, padding_value=0) # Pad to max length in batch
    labels = torch.stack(labels) # Stack labels
    return padded_reviews, labels, review_lengths

In [10]:
# I am using data from imdb_pt_data. Note that the array has a few data poits (25) because 
# I am testing if the data transformation from tf to pt was done OK

# Split training and test data. Training gets 80% of the data

# 

# train_data, test_data = train_test_split(imdb_fake_data, test_size=0.2, random_state=42)
train_data, test_data = train_test_split(imdb_pt_data, test_size=0.2, random_state=42)

train_dataset = IMDBDataset(train_data)
test_dataset = IMDBDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [29]:
 #4. Define a Pytorch Dense Model

class DenseModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super(DenseModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)  # Word embeddings
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # self.f = open("./IMDB_outputs.txt", "a")

    def forward(self, x, lengths):
        
        with open("./x1.pkl", "wb") as f:
            pickle.dump(x, f)
        
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        
        with open("./embedded.pkl", "wb") as f:
            pickle.dump(embedded, f)
        # Create mask *before* embedding. 1 for non-padding, 0 for padding
        mask = x != 0  # (batch_size, seq_len) - True for non-padding, False for padding
        masked_embedded = embedded * mask.unsqueeze(-1).float() # Apply mask to embeddings

        # Average pooling over sequence length (handles variable lengths)
        lengths = lengths.unsqueeze(1).float()  # (batch_size, 1)
        pooled = masked_embedded.sum(dim=1) / lengths  # Average pool
        
        
        with open("./pooled.pkl", "wb") as f:
            pickle.dump(pooled, f)
        x = self.fc1(pooled)
        
        
        with open("./linear1.pkl", "wb") as f:
            pickle.dump(x, f)
        x = self.relu(x)
        
        
        with open("./relu.pkl", "wb") as f:
            pickle.dump(x, f)
        x = self.fc2(x)
        
        
        with open("./linear2.pkl", "wb") as f:
            pickle.dump(x, f)
        
        return x

In [30]:
# 5. Training and Validation Loops
input_dim = 10000  # Adjust size of vocabulary to 10000 
embedding_dim = 128  # Size of word embeddings
hidden_dim = 256
output_dim = 2  # Binary classification (0 or 1)


model = DenseModel(input_dim, embedding_dim, hidden_dim, output_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use gpu if available
model.to(device) # move model to gpu

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [31]:
# from torchsummary import summary
# summary(model,(3, 224))
print(model)

DenseModel(
  (embedding): Embedding(10000, 128)
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=256, out_features=2, bias=True)
)


In [32]:
# change number of epochs as necessary with the real IMDB data.
#  Maybe just 10 will be enough 
num_epochs = 1

for epoch in range(num_epochs):
    # Training
    model.train()  # Set model to training mode
    for padded_reviews, labels, lengths in train_loader:
      padded_reviews = padded_reviews.to(device) # move data to gpu
      labels = labels.to(device) # move data to gpu
      lengths = lengths.to(device) # move data to gpu
      optimizer.zero_grad()
      outputs = model(padded_reviews, lengths)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    # Validation
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradients during validation
        for padded_reviews, labels, lengths in test_loader:
          padded_reviews = padded_reviews.to(device) # move data to gpu
          labels = labels.to(device) # move data to gpu
          lengths = lengths.to(device) # move data to gpu
          outputs = model(padded_reviews, lengths)
          _, predicted = torch.max(outputs.data, 1) # get the prediction
          total += labels.size(0)
          correct += (predicted == labels).sum().item() # count correct predictions

    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%")


Epoch [1/1], Loss: 0.2731, Accuracy: 85.52%


In [33]:
# Load from disk
with open("./x1.pkl", "rb") as f:
   x1 = pickle.load(f)
print("x1",x1)

with open("./embedded.pkl", "rb") as f:
   embedded = pickle.load(f)
print("embedded",embedded)

with open("./pooled.pkl", "rb") as f:
   pooled = pickle.load(f)
print("pooled",pooled)






x1 tensor([[   1,   65,  517,  ...,    0,    0,    0],
        [   1,  526,   34,  ...,    0,    0,    0],
        [   1,    5, 1977,  ...,    0,    0,    0],
        ...,
        [   1,   14,   22,  ...,    0,    0,    0],
        [   1,   14,   20,  ...,    0,    0,    0],
        [   1,   13,  566,  ...,    0,    0,    0]])
embedded tensor([[[ 1.2617,  0.0112,  1.1848,  ...,  1.4919, -0.4083, -0.6236],
         [-1.4636, -0.1120,  0.3118,  ...,  0.0943, -0.5127,  1.7256],
         [-1.4218,  0.9444, -0.5894,  ..., -1.1257, -1.1973, -1.6173],
         ...,
         [ 1.0168, -0.8572, -0.0812,  ..., -0.1236, -1.3896, -1.8434],
         [ 1.0168, -0.8572, -0.0812,  ..., -0.1236, -1.3896, -1.8434],
         [ 1.0168, -0.8572, -0.0812,  ..., -0.1236, -1.3896, -1.8434]],

        [[ 1.2617,  0.0112,  1.1848,  ...,  1.4919, -0.4083, -0.6236],
         [ 0.1977, -0.9578, -1.0020,  ...,  0.9786, -0.8330,  0.4770],
         [-1.1927,  0.1781, -0.8170,  ..., -0.9214,  1.0342,  0.8680],
       

In [None]:

with open("./linear1.pkl", "rb") as f:
   linear1 = pickle.load(f)
print("linear1",linear1)

with open("./relu.pkl", "rb") as f:
   relu = pickle.load(f)
print("relu",relu)

with open("./linear2.pkl", "rb") as f:
   linear2 = pickle.load(f)
print("linear2",linear2)

relu tensor([[0.1432, 0.1074, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0339, 0.0008, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0827, 0.0917, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2167, 0.0887, 0.0000,  ..., 0.0000, 0.0000, 0.2044],
        [0.0180, 0.0154, 0.0000,  ..., 0.0000, 0.0000, 0.1018]])
linear2 tensor([[-0.7950,  0.7901],
        [-2.6965,  2.8278],
        [-1.0879,  1.1277],
        [-0.1420,  0.1573],
        [ 0.2944, -0.2822],
        [ 0.4024, -0.3651],
        [ 0.7997, -0.8183],
        [-1.1663,  1.2573]])


In [None]:
# 6. Evaluation (After Training) - More Detailed
model.eval()  # Set to evaluation mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use gpu if available
model.to(device) # move model to gpu

correct = 0
total = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for padded_reviews, labels, lengths in test_loader:
        padded_reviews = padded_reviews.to(device) # move data to gpu
        labels = labels.to(device) # move data to gpu
        lengths = lengths.to(device) # move data to gpu
        outputs = model(padded_reviews, lengths)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        all_predictions.extend(predicted.cpu().numpy())  # Store predictions for later analysis
        all_labels.extend(labels.cpu().numpy())      # Store true labels

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

# 7. Additional Evaluation Metrics (using scikit-learn)
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
print(classification_report(all_labels, all_predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_predictions))


# Example of getting predictions for a single review
def predict(review_indices):
    model.eval()
    review_tensor = torch.tensor([review_indices]).to(device)  # Add batch dimension
    review_length = torch.tensor([len(review_indices)]).to(device)
    with torch.no_grad():
        output = model(review_tensor, review_length)
        _, predicted = torch.max(output, 1)
    return predicted.item()
