# Pytorch

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import json
from tqdm import tqdm  # Import tqdm for progress bars
import os

# Hyperparameters
BATCH_SIZE = 128
EPOCHS = 3
SEQ_LENGTH = 10
EMBEDDING_DIM = 50  # Dimension of the embedding layer
HIDDEN_SIZE = 100  # RNN hidden state size
LEARNING_RATE = 0.001

# Folder paths (use the 100000_examples_10349_vocab_90_percent folder)
# FOLDER_NAME = '50000_examples_10160_vocab_90_percent'
FOLDER_NAME = '5000_examples_8093_vocab_90_percent'
base_dir = f'../datasets/wiki-103-text/preprocessed/{FOLDER_NAME}'
train_path = f'{base_dir}/wiki-103_train.csv'
test_path = f'{base_dir}/wiki-103_test.csv'
word_to_index_path = f'{base_dir}/word_to_index.json'
index_to_word_path = f'{base_dir}/index_to_word.json'

# Load the preprocessed dataset
print(f"Loading train dataset at {FOLDER_NAME}")
df_train = pd.read_csv(train_path)


# Load the word-to-index and index-to-word mappings
print("Loading word to index mapping...")
with open(word_to_index_path, 'r') as f:
    word_to_index = json.load(f)

with open(index_to_word_path, 'r') as f:
    index_to_word = json.load(f)

vocab_size = len(word_to_index)
output_size = vocab_size  # Output size should be the size of the vocabulary

# Dataset class
class WikiDataset(Dataset):
    def __init__(self, data, word_to_index):
        self.data = data['clean_text']
        self.word_to_index = word_to_index

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]

        # Skip empty or non-string entries
        if not isinstance(text, str) or text.strip() == '':
            text = ''

        # Split tokenized sentence
        text = text.split()
        
        # Convert tokens to integers using word_to_index mapping
        text = [self.word_to_index.get(word, 0) for word in text]  # Use 0 (or another index) for unknown words

        input_sequence = torch.tensor(text[:-1], dtype=torch.long)  # All except the last word
        target = torch.tensor(text[1:], dtype=torch.long)  # Shifted by one word

        return input_sequence, target

# Custom collate function to pad sequences dynamically within batches
def collate_fn(batch):
    inputs, targets = zip(*batch)
    
    # Pad sequences in the batch to the maximum length within the batch
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)

    return inputs_padded, targets_padded

# Load the dataset
print("Preparing dataset...")
train_dataset = WikiDataset(df_train, word_to_index)

# DataLoader with the custom collate function
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Define the RNN Model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Pass through embedding layer
        embedded = self.embedding(x)
        
        # Pass through RNN
        rnn_out, hidden = self.rnn(embedded)
        
        # Apply the fully connected layer to each time step output
        out = self.fc(rnn_out)
        
        return out

# Initialize model, loss function, and optimizer
print("Initializing model...")
model = RNNModel(vocab_size, EMBEDDING_DIM, HIDDEN_SIZE, output_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Check if CUDA is available and use the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Model running on {'GPU' if torch.cuda.is_available() else 'CPU'}.")

# Training and evaluation loop
def evaluate_model(model, dataloader, loss_fn, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    correct_preds = 0
    total_words = 0
    
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            outputs = outputs.view(-1, vocab_size)
            targets = targets.view(-1)

            # Calculate loss
            loss = loss_fn(outputs, targets)
            total_loss += loss.item()

            # Calculate accuracy
            _, predictions = torch.max(outputs, 1)
            correct_preds += (predictions == targets).sum().item()
            total_words += targets.size(0)
    
    avg_loss = total_loss / len(dataloader)
    accuracy = correct_preds / total_words
    return avg_loss, accuracy

# Training loop
print("Starting training...")
for epoch in range(EPOCHS):
    total_loss = 0.0
    model.train()  # Set the model to training mode
    print(f"Epoch {epoch+1}/{EPOCHS}:")

    # Use tqdm to create a progress bar for the DataLoader
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}", unit="batch")

    for batch_idx, (inputs, targets) in enumerate(progress_bar):
        # Move data to GPU if available
        inputs, targets = inputs.to(device), targets.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        
        # Reshape outputs and targets for loss calculation
        outputs = outputs.view(-1, vocab_size)  # Reshape to (batch_size * seq_length, vocab_size)
        targets = targets.view(-1)  # Flatten targets to match output shape
        
        # Compute loss
        loss = loss_fn(outputs, targets)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update the tqdm progress bar with the current loss
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

    # Evaluate on the training set
    print("Evaluating on the train set...")
    train_loss, train_accuracy = evaluate_model(model, train_dataloader, loss_fn, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy * 100:.2f}%\n")


Loading train dataset at 5000_examples_8093_vocab_90_percent
Loading word to index mapping...
Preparing dataset...
Initializing model...
Model running on GPU.
Starting training...
Epoch 1/3:


Epoch 1/3: 100%|██████████| 36/36 [00:21<00:00,  1.71batch/s, loss=1.91]


Epoch 1 completed. Average Loss: 4.2579
Evaluating on the train set...
Train Loss: 1.2897, Train Accuracy: 88.29%

Epoch 2/3:


Epoch 2/3: 100%|██████████| 36/36 [00:19<00:00,  1.86batch/s, loss=1.64] 


Epoch 2 completed. Average Loss: 1.1821
Evaluating on the train set...
Train Loss: 1.1431, Train Accuracy: 88.18%

Epoch 3/3:


Epoch 3/3: 100%|██████████| 36/36 [00:21<00:00,  1.68batch/s, loss=0.801]


Epoch 3 completed. Average Loss: 1.0665
Evaluating on the train set...
Train Loss: 1.0663, Train Accuracy: 87.84%

