### RNN with pytorch


In [None]:
!pip show torch torchtext

Name: torch
Version: 2.0.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, jinja2, networkx, nvidia-cublas-cu11, nvidia-cuda-cupti-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-runtime-cu11, nvidia-cudnn-cu11, nvidia-cufft-cu11, nvidia-curand-cu11, nvidia-cusolver-cu11, nvidia-cusparse-cu11, nvidia-nccl-cu11, nvidia-nvtx-cu11, sympy, triton, typing-extensions
Required-by: accelerate, fastai, peft, sentence-transformers, timm, torchaudio, torchdata, torchtext, torchvision, triton
---
Name: torchtext
Version: 0.15.2
Summary: Text utilities and datasets for PyTorch
Home-page: https://github.com/pytorch/text
Author: PyTorch core devs and James Bradbury
Author-email: jekbradbury@gmail.com
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, requests, torch, torch

In [None]:
!pip install torch==2.0.1 torchtext==0.15.2



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np

In [2]:
# Load datasets
train_df = pd.read_csv("train.csv")
dev_df = pd.read_csv("dev.csv")

print("Train data dimension ", train_df.shape)
print("Dev data dimension ", dev_df.shape)

Train data dimension  (92228, 3)
Dev data dimension  (4855, 2)


In [None]:
train_df.head()

Unnamed: 0,text_id,sentence,gold_label
0,r1-0051002,"Cheers,\n\nDennis Nguyen\n416-879-6431",0
1,r1-0020356,May have to wait longer on holidays.,-1
2,r1-0058348,"I drove to vegas may 6th, to get my hair done.",0
3,r1-0080006,"In addition, I eat out often at various restau...",1
4,r1-0000827,Perhaps she was doing us a favor?,0


In [None]:
# Load the tokenizer and embeddings
tokenizer = get_tokenizer('spacy' language='en_core_web_sm')
glove = GloVe(name='6B', dim=300)

In [4]:
# Define the Dataset class
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, vocab, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sentence = self.df.iloc[idx]['sentence']
        tokens = self.tokenizer(sentence)
        token_ids = [self.vocab[token] for token in tokens if token in self.vocab]

        if self.is_train:
            label = self.df.iloc[idx]['gold_label']
            return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

        return torch.tensor(token_ids, dtype=torch.long), self.df.iloc[idx]['text_id']


# Custom collate function
def collate_fn(batch, is_train=True):
    token_ids = [item[0] for item in batch]
    padded_sequences = pad_sequence(token_ids, batch_first=True, padding_value=0)

    if is_train:
        labels = torch.stack([item[1] for item in batch])
        return padded_sequences, labels

    ids = [item[1] for item in batch]
    return padded_sequences, ids

In [5]:
# Define the Improved RNN Model
class SentimentBiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings):
        super(SentimentBiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(pretrained_embeddings)
        self.embedding.weight.requires_grad = False  # Freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.5)  # Dropout for regularization
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Bidirectional LSTM output

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out)
        hidden_state = torch.cat((lstm_out[:, -1, :self.lstm.hidden_size],
                                  lstm_out[:, 0, self.lstm.hidden_size:]), dim=1)
        output = self.fc(hidden_state)
        return output

In [6]:
# Prepare datasets and dataloaders
vocab = {word: idx for idx, word in enumerate(glove.itos)}
train_dataset = SentimentDataset(train_df, tokenizer, vocab, is_train=True)
dev_dataset = SentimentDataset(dev_df, tokenizer, vocab, is_train=False)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    collate_fn=lambda batch: collate_fn(batch, is_train=True),
    shuffle=True,
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=64,
    collate_fn=lambda batch: collate_fn(batch, is_train=False),
)

In [7]:
# Initialize the model
model = SentimentBiLSTM(len(vocab), 300, 128, 3, torch.tensor(glove.vectors))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

  model = SentimentBiLSTM(len(vocab), 300, 128, 3, torch.tensor(glove.vectors))


In [8]:
# Train the model
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels + 1)  # Shift labels to 0, 1, 2
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == (labels + 1)).sum().item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100*correct/total:.2f}%")

Epoch 1/10, Loss: 0.8415, Accuracy: 62.74%
Epoch 2/10, Loss: 0.7583, Accuracy: 67.65%
Epoch 3/10, Loss: 0.7201, Accuracy: 69.38%
Epoch 4/10, Loss: 0.6894, Accuracy: 71.12%
Epoch 5/10, Loss: 0.6586, Accuracy: 72.57%
Epoch 6/10, Loss: 0.6306, Accuracy: 74.04%
Epoch 7/10, Loss: 0.6029, Accuracy: 75.40%
Epoch 8/10, Loss: 0.5713, Accuracy: 76.78%
Epoch 9/10, Loss: 0.5409, Accuracy: 78.14%
Epoch 10/10, Loss: 0.5096, Accuracy: 79.57%


In [9]:
# Predict on dev set
model.eval()
predictions = []

with torch.no_grad():
    for inputs, _ in dev_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy() - 1)  # Shift back to -1, 0, 1

# Save only the predicted labels to 'answer_2.txt'
with open("answer_2.txt", "w") as f:
    for label in predictions:
        f.write(f"{label}\n")

print("Predicted labels saved to 'answer_2.txt'.")

Predicted labels saved to 'answer_2.txt'.
