In [None]:
import ast
import torch
import collections
import numpy as np
import pandas as pd
import gdown  # For downloading from Google Drive
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.optim as optim

# Seed PyTorch's global random number generator so repeated runs are reproducible.
torch.manual_seed(42)

<torch._C.Generator at 0x7e6d4cdc1850>

In [None]:
# Load the preprocessed reviews and hold out 20% of the rows as a test set.
# random_state is fixed so the same split is produced on every run.
df = pd.read_csv("preprocessed_yelp_data.csv", delimiter=',')
train_df, test_df = train_test_split(df, test_size = 0.2, random_state = 42)
# Display both shapes as a quick sanity check on the split.
train_df.shape, test_df.shape

((800, 2), (200, 2))

In [None]:
def build_vocabulary(texts:list, max_size:int=10000, min_freq:int=2):
  """
  Build a word -> index vocabulary from tokenised texts.

  Parameters:
  - texts (list): list of token lists (one list of words per document).
  - max_size (int): upper bound on the total vocabulary size, including the
    two reserved entries '<UKN>' and '<PAD>'.
  - min_freq (int): words that occur fewer than `min_freq` times are left out.

  Returns:
  - dict: maps each kept word to an integer index. '<UKN>' is 0, '<PAD>' is 1
    and real words are numbered from 2 in the order they were first seen.

  When more words pass `min_freq` than fit under `max_size`, the most frequent
  ones are kept (ties are resolved in favour of the word seen first).
  """
  word_count = collections.Counter()
  for text in texts:
      word_count.update(text)
  print("No. of unique words (without any filters): ",len(word_count))
  vocabulary = {
      '<UKN>': 0,
      '<PAD>': 1,
  }
  # Number of slots left for real words once the reserved tokens are counted.
  capacity = max(max_size - len(vocabulary), 0)

  # Rank eligible words by frequency; most_common() keeps first-seen order for
  # equal counts. Reserved token names are skipped so they are never overwritten.
  ranked = [
      word
      for word, count in word_count.most_common()
      if count >= min_freq and word not in vocabulary
  ]
  kept = set(ranked[:capacity])

  # Assign indices in first-seen order so the numbering does not depend on
  # frequency ranking when the cap is not reached.
  next_idx = len(vocabulary)
  for word in word_count:
    if word in kept:
      vocabulary[word] = next_idx
      next_idx += 1
  return vocabulary
# Convert each row of the 'text' column from its string form back into a Python
# list of words (ast.literal_eval parses the literal without executing code).
train_texts = train_df['text'].tolist()
train_texts = [ast.literal_eval(text) for text in train_texts]
test_texts = test_df['text'].tolist()
test_texts = [ast.literal_eval(text) for text in test_texts]
# The vocabulary is built from the training texts only; test texts are not passed in.
vocab = build_vocabulary(train_texts)
print("len of vocabulary: ", len(vocab))

No. of unique words (without any filters):  1826
len of vocabulary:  762


In [None]:
# One slot per document; dtype=object lets each slot hold its own index array.
train_numeric = np.empty((len(train_texts),), dtype=object)
test_numeric = np.empty((len(test_texts),), dtype=object)

def numericalize_text(word_list:list, vocab:dict):
  """
  Map a list of words to their vocabulary indices.

  Parameters:
  - word_list (list): words of one document.
  - vocab (dict): word -> index mapping.

  Returns:
  - np.ndarray of dtype int64 with one index per word. Words missing from
    `vocab` get the index stored under '<UKN>' (0 if that key is absent).
  """
  unknown_idx = vocab.get('<UKN>', 0)
  # An explicit dtype keeps the result integer even for an empty word list
  # (np.array([]) would otherwise default to float64).
  return np.array([vocab.get(word, unknown_idx) for word in word_list], dtype=np.int64)


###################
# Pad (or truncate) every document to a fixed length
###################
MAX_SEQ_LEN = 100  # every document is brought to exactly this many indices
PAD_IDX = vocab.get('<PAD>', 1)  # index reserved for padding in the vocabulary


def pad_or_truncate(token_ids, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX):
  """
  Return `token_ids` as an int64 array of exactly `max_len` entries.

  Shorter inputs are right-padded with `pad_idx`; longer inputs are cut to
  their first `max_len` entries so that all rows can later be stacked into
  one rectangular tensor.
  """
  token_ids = np.asarray(token_ids, dtype=np.int64)[:max_len]
  # np.full with an integer dtype keeps the indices integral
  # (np.ones would silently promote the whole row to float64).
  padding = np.full(max_len - len(token_ids), pad_idx, dtype=np.int64)
  return np.concatenate((token_ids, padding))


for i, words in enumerate(train_texts):
  train_numeric[i] = pad_or_truncate(numericalize_text(words, vocab))

for i, words in enumerate(test_texts):
  test_numeric[i] = pad_or_truncate(numericalize_text(words, vocab))


In [None]:
# Stack the equal-length index rows into one 2-D array and convert it once.
# Indices are stored as int64 (torch.long) because nn.Embedding only accepts
# integer inputs.
train_tensor = torch.tensor(np.stack(list(train_numeric)), dtype=torch.long)
test_tensor = torch.tensor(np.stack(list(test_numeric)), dtype=torch.long)



In [None]:
# Inspect the training tensor built in the previous cell instead of rebuilding it.
train_tensor

tensor([[  2.,   3.,   4.,  ...,   1.,   1.,   1.],
        [  7.,   8.,   9.,  ...,   1.,   1.,   1.],
        [ 12.,  13.,  14.,  ...,   1.,   1.,   1.],
        ...,
        [ 19.,  77., 106.,  ...,   1.,   1.,   1.],
        [ 71.,   4.,  29.,  ...,   1.,   1.,   1.],
        [364., 542., 147.,  ...,   1.,   1.,   1.]], dtype=torch.float64)

In [None]:
import torch
import torch.nn as nn

# Demo embedding layer: 101 rows (valid indices 0-100), each a 100-dimensional vector
embedding_layer = nn.Embedding(101, 100)

# Example input: a sequence of word indices (100 is the largest index this layer accepts)
word_indices = torch.tensor([1, 4, 2, 8, 99, 100])  # Example indices

# Look up the embedding row for each index
embeddings = embedding_layer(word_indices)

print("Word indices:", word_indices)
print("Embeddings:", embeddings)
print("Shape of embeddings:", embeddings.shape)


Word indices: tensor([  1,   4,   2,   8,  99, 100])
Embeddings: tensor([[ 6.4076e-01,  5.8325e-01,  1.0669e+00, -4.5015e-01, -1.8527e-01,
          7.5276e-01,  4.0476e-01,  1.7847e-01,  2.6491e-01,  1.2732e+00,
         -1.3109e-03, -3.0360e-01, -1.4570e+00, -1.0234e-01, -5.9915e-01,
          4.7706e-01,  7.2618e-01,  9.1152e-02, -3.8907e-01,  5.2792e-01,
         -1.2685e-02,  2.4084e-01,  1.3254e-01,  7.6424e-01,  1.0950e+00,
          3.3989e-01,  7.1997e-01,  4.1141e-01,  1.9312e+00,  1.0119e+00,
         -1.4364e+00, -1.1299e+00, -1.3603e-01,  1.6354e+00,  6.5474e-01,
          5.7600e-01,  1.1415e+00,  1.8565e-02, -1.8058e+00,  9.2543e-01,
         -3.7534e-01,  1.0331e+00, -6.8665e-01,  6.3681e-01, -9.7267e-01,
          9.5846e-01,  1.6192e+00,  1.4506e+00,  2.6948e-01, -2.1038e-01,
         -7.3280e-01,  1.0430e-01,  3.4875e-01,  9.6759e-01, -4.6569e-01,
          1.6048e+00, -2.4801e+00, -4.1754e-01, -1.1955e+00,  8.1234e-01,
         -1.9006e+00,  2.2858e-01,  2.4859e-02,

In [None]:
# Now, let's create the RNNModel class with the specified layers and forward pass.

class RNNModel(nn.Module):
    """
    Embedding -> multi-layer RNN -> linear head.

    Parameters:
    - input_size (int): number of rows in the embedding table (vocabulary size).
    - embedding_dim (int): size of each embedding vector.
    - hidden_size (int): size of the RNN hidden state.
    - num_layers (int): number of stacked RNN layers.
    - output_size (int): number of values produced per input sequence.
    - padding_idx (int or None): optional index whose embedding is fixed at
      zero and receives no gradient. Defaults to None (no special handling).

    Note: the constructor calls torch.manual_seed(42), which resets the global
    torch RNG so that models built with the same arguments start from
    identical weights.
    """

    def __init__(self, input_size, embedding_dim, hidden_size, num_layers, output_size,
                 padding_idx=None):
        # Re-seed before any layer is created so initial weights are repeatable.
        torch.manual_seed(42)
        super(RNNModel, self).__init__()

        # Lookup table turning integer indices into dense vectors.
        self.embedding = nn.Embedding(input_size, embedding_dim, padding_idx=padding_idx)

        # batch_first=True: inputs/outputs are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)

        # Maps the final hidden state to the output values.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Run a batch of index sequences through the network.

        Parameters:
        - x: integer tensor of indices, one row per sequence.

        Returns:
        - tensor with `output_size` values per input sequence.
        """
        embedded = self.embedding(x)

        # The RNN returns (all per-step outputs, final hidden state per layer);
        # only the final hidden states are needed here.
        _, hidden = self.rnn(embedded)

        # hidden[-1] is the final hidden state of the top RNN layer.
        return self.fc(hidden[-1])


In [None]:
# Vocabulary size, including the reserved '<UKN>' and '<PAD>' entries.
len(vocab)

762

In [None]:
# Instantiate the model with the given parameters
# The embedding table needs one row per vocabulary entry, so derive the size
# from the vocabulary instead of hard-coding it.
input_size = len(vocab)
embedding_dim = 100
hidden_size = 256
num_layers = 2
output_size = 1

model = RNNModel(input_size, embedding_dim, hidden_size, num_layers, output_size)



In [None]:
# Count the elements of every parameter tensor in the model.
total_params = sum(p.numel() for p in model.parameters())

# NOTE(review): only the last expression of a cell is displayed, so this bare
# name produces no output here.
total_params

# Shape of the training tensor (displayed as the cell output).
train_tensor.shape

torch.Size([800, 100])

In [None]:
# Count only the trainable parameters (those with requires_grad=True)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Total number of parameters in the model:", total_params)

Total number of parameters in the model: 299689


In [None]:
# BCEWithLogitsLoss applies the sigmoid itself, so the model is expected to
# output raw logits rather than probabilities.
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy Loss with logits
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with learning rate 0.001

In [None]:
class ArshisDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs `data[idx]` with `labels[idx]`.

    Parameters:
    - size (int): number of samples the dataset exposes through `len()`.
    - data: indexable collection of samples.
    - labels: indexable collection of labels, aligned with `data`.

    Raises:
    - ValueError: if `size` is negative or larger than `data` or `labels`,
      which would otherwise only fail later, in the middle of iteration.
    """

    def __init__(self, size, data, labels):
        if size < 0:
            raise ValueError(f"size must be non-negative, got {size}")
        if size > len(data) or size > len(labels):
            raise ValueError(
                f"size ({size}) exceeds available data ({len(data)}) or labels ({len(labels)})"
            )
        self.size = size
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples declared at construction time."""
        return self.size

    def __getitem__(self, idx):
        """Return the (sample, label) pair stored at position `idx`."""
        return self.data[idx], self.labels[idx]



# Labels are stored as float32 so they share the dtype of the model's logits
# when passed to BCEWithLogitsLoss. Dataset sizes are taken from the tensors
# themselves rather than hard-coded.
train_dataset = ArshisDataset(
    len(train_tensor),
    train_tensor.long(),
    torch.tensor(train_df['label'].values, dtype=torch.float32),
)
test_dataset = ArshisDataset(
    len(test_tensor),
    test_tensor.long(),
    torch.tensor(test_df['label'].values, dtype=torch.float32),
)

# Shuffle only the training batches; keep the test order fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)


train_loader

<torch.utils.data.dataloader.DataLoader at 0x7e6c5669ac20>

In [None]:
# Sanity check: print every batch of test labels together with the length of
# the first input sequence in that batch.
for inputs, labels in test_loader:
  print(labels)
  print(len(inputs[0]))

tensor([1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1.],
       dtype=torch.float64)
100
tensor([0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0.,
        0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1.],
       dtype=torch.float64)
100
tensor([1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
        0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1.],
       dtype=torch.float64)
100
tensor([1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
        0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1.],
       dtype=torch.float64)
100
tensor([1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0.,
        0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0.],
       dtype=torch.float64)
100
tensor([1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0.,
        0., 0., 1., 0., 1., 0., 0.,

In [None]:
def train_model(model, data_loader, criterion, optimizer, epochs=2):
    """
    Train `model` for `epochs` passes over `data_loader`.

    Parameters:
    - model (nn.Module): The PyTorch model to be trained.
    - data_loader (iterable): Yields `(inputs, labels)` batches, e.g. a DataLoader.
    - criterion (nn.Module): The loss function used to compute the loss.
    - optimizer (torch.optim.Optimizer): The optimizer used to update model parameters.
    - epochs (int): The number of epochs to train the model for.

    Each batch goes through a forward pass, loss computation, backpropagation
    and an optimizer step. After every epoch the mean of the per-batch losses
    is printed (nan if the loader produced no batches). Returns None.
    """
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        model.train()  # Set the model to training mode

        for inputs, labels in data_loader:
            optimizer.zero_grad()  # Clear gradients from the previous step

            # Forward pass to get predictions
            outputs = model(inputs)

            # Match the logits to the label shape explicitly. A bare squeeze()
            # would also drop the batch dimension for a batch of one sample and
            # make the loss fail on a shape mismatch.
            logits = outputs.reshape(labels.shape)
            loss = criterion(logits, labels)

            # Backpropagation
            loss.backward()
            optimizer.step()  # Update model parameters

            total_loss += loss.item()  # Accumulate total loss
            num_batches += 1

        # Counting batches while iterating avoids dividing by zero on an empty
        # loader and does not require the loader to support len().
        average_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}")

# Train the model on the training loader for 2 epochs; the per-epoch loss is printed by train_model
train_model(model, train_loader, criterion, optimizer, epochs=2)


Epoch 1/2, Loss: 0.7063
Epoch 2/2, Loss: 0.6987


In [None]:
import torch

# Function to evaluate a model's performance
def evaluate_model(model, data_loader, criterion):
    """
    Evaluate `model` on `data_loader` without updating any parameters.

    Parameters:
    - model (torch.nn.Module): The trained model to evaluate.
    - data_loader (iterable): Yields `(inputs, labels)` batches, e.g. a DataLoader.
    - criterion (torch.nn.Module): The loss function used to compute the loss.

    Returns:
    - average_loss (float): Mean of the per-batch losses.
    - accuracy (float): Fraction of samples whose rounded sigmoid output
      equals the label.

    Both values are nan when the loader yields no batches.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    num_batches = 0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation
        for inputs, labels in data_loader:
            # Forward pass to compute predictions
            outputs = model(inputs)

            # Match the logits to the label shape explicitly. A bare squeeze()
            # would also drop the batch dimension for a batch of one sample.
            logits = outputs.reshape(labels.shape)

            loss = criterion(logits, labels)
            total_loss += loss.item()
            num_batches += 1

            # Sigmoid turns logits into probabilities; rounding gives 0/1 predictions.
            predicted = torch.round(torch.sigmoid(logits))
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # Guard against an empty loader instead of raising ZeroDivisionError.
    if num_batches == 0 or total == 0:
        return float("nan"), float("nan")

    average_loss = total_loss / num_batches
    accuracy = correct / total  # Accuracy as a fraction

    return average_loss, accuracy


# Evaluate the trained model on the held-out test loader
average_loss, accuracy = evaluate_model(model, test_loader, criterion)

# Report both metrics to four decimal places
print(f"Average Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}")


Average Loss: 0.6913, Accuracy: 0.5200


In [None]:
# Scratch cell: uniformly random tensors used for experimenting, unrelated to the review data.
train_test = torch.rand(800, 10)  # Example data tensor with 800 samples, 10 features
train_label_ = torch.rand(800)  # Example label tensor with 800 samples

# train_dataset = ArshisDataset(train_tensor, train_label)
# Display the random labels as the cell output.
train_label_



In [None]:
class ArshisDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs each sample with the label at the same position.

    NOTE: this redefines the earlier ArshisDataset(size, data, labels) with a
    two-argument constructor; the length now comes from `data` itself.
    """

    def __init__(self, data, labels):
        # Both collections must line up one-to-one.
        n_samples, n_labels = len(data), len(labels)
        assert n_samples == n_labels, "Data and labels must have the same length"
        self.labels = labels
        self.data = data

    def __getitem__(self, idx):
        """Return the (sample, label) pair stored at position `idx`."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)


# Demo of the dataset/loader classes on random data. The demo_* names keep the
# real train_tensor / train_dataset / train_loader built earlier in the
# notebook from being overwritten with random floats.
demo_tensor = torch.rand(800, 10)  # Example data tensor with 800 samples, 10 features
demo_labels = torch.rand(800)  # Example label tensor with 800 samples

demo_dataset = ArshisDataset(demo_tensor, demo_labels)

from torch.utils.data import DataLoader

demo_loader = DataLoader(demo_dataset, batch_size=32, shuffle=True)

# Print the first batch
for data, labels in demo_loader:
    print("Data:", data)
    print("Labels:", labels)
    break


Data: tensor([[0.0735, 0.4385, 0.5392, 0.3614, 0.2037, 0.8320, 0.4848, 0.4929, 0.7033,
         0.5768],
        [0.9705, 0.4782, 0.8531, 0.7704, 0.5869, 0.3167, 0.2523, 0.3324, 0.9485,
         0.2181],
        [0.8717, 0.7381, 0.0389, 0.7050, 0.8945, 0.5052, 0.7315, 0.8322, 0.4007,
         0.6886],
        [0.4616, 0.0584, 0.9420, 0.0287, 0.9097, 0.5480, 0.1629, 0.1288, 0.9505,
         0.6913],
        [0.6536, 0.1348, 0.0302, 0.4239, 0.0759, 0.7367, 0.3147, 0.3615, 0.3551,
         0.0914],
        [0.2778, 0.1824, 0.0697, 0.0777, 0.9462, 0.3664, 0.9064, 0.8448, 0.9228,
         0.3518],
        [0.4971, 0.1723, 0.3808, 0.9025, 0.4658, 0.5968, 0.8892, 0.4174, 0.1067,
         0.8236],
        [0.2486, 0.1902, 0.4287, 0.7010, 0.5473, 0.6647, 0.9828, 0.5185, 0.1332,
         0.6606],
        [0.7847, 0.0241, 0.8566, 0.8331, 0.6656, 0.3612, 0.1110, 0.9081, 0.7706,
         0.0075],
        [0.3872, 0.6572, 0.2446, 0.6970, 0.4065, 0.7295, 0.4252, 0.1582, 0.6111,
         0.8187],
    

In [None]:
import torch
# The nested literal below has shape (2, 3, 1).
tensor = torch.tensor([[[1], [2], [3]], [[4], [5], [6]]])
# squeeze(1) only drops dimension 1 when its size is 1. Here dimension 1 has
# size 3, so the call is a no-op and the result keeps shape (2, 3, 1).
squeezed_tensor = tensor.squeeze(1)  # No-op here: dim 1 is not a singleton dimension


In [None]:
# Display the result of squeeze(1); it is unchanged from the input tensor.
squeezed_tensor

tensor([[[1],
         [2],
         [3]],

        [[4],
         [5],
         [6]]])