In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# modules imported by this notebook are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [21]:
import time
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from model import ArabicPoetryLSTM
from dataset import ArabicPoetryDataset

In [3]:
# Path (relative to this notebook) of the text file handed to the dataset class.
DATA_PATH = '../data/all_data.txt'
dataset = ArabicPoetryDataset(DATA_PATH)

In [4]:
# Report how many samples the dataset holds and how large its vocabulary is.
print(
    f"Dataset size: {len(dataset)}",
    f"Vocabulary size: {dataset.vocab_size}",
    sep="\n",
)


Dataset size: 212494
Vocabulary size: 82


In [5]:
# Take the first sample and map every index back to its character via the
# dataset's idx_to_char table to check that it reads as text.
sample_input = dataset[0]
print("Sample decoded:")
print(''.join(dataset.idx_to_char[token.item()] for token in sample_input))


Sample decoded:
Input: بدت تختال في حُلل الجمالِ


In [6]:
# Small, unshuffled loader (two samples per batch) used in the following cells
# to inspect what the dataset's collate_fn produces.
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=dataset.collate_fn,
)

In [7]:
# Create an explicit iterator so a single batch can be pulled with next().
dataloader_iter = iter(dataloader)

In [8]:
# Fetch the first batch produced by the inspection loader.
first = next(dataloader_iter)

In [9]:
# The batch unpacks into three objects; judging by the names these are the
# padded inputs, the targets, and a padding mask (TODO confirm against
# dataset.collate_fn).
padded_sequences, targets, mask = first

In [10]:
# Display the shape of each of the three batch tensors side by side.
tuple(batch_tensor.shape for batch_tensor in (padded_sequences, targets, mask))

(torch.Size([2, 24]), torch.Size([2, 24]), torch.Size([2, 24]))

In [11]:
# Decode every row of the input batch back to characters, one line per sequence.
for row in padded_sequences:
    print(''.join(dataset.idx_to_char[token.item()] for token in row))

بدت تختال في حُلل الجمال
وجادت بالزيارة والوصال<pad><pad>


In [12]:
# Decode every row of the target batch back to characters, one line per sequence.
for row in targets:
    print(''.join(dataset.idx_to_char[token.item()] for token in row))

دت تختال في حُلل الجمالِ
جادت بالزيارة والوصال<pad><pad><pad>


In [18]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model import ArabicPoetryLSTM
import time
import os

# Reproducibility: seed torch's global RNG before any model is built or any
# loader is shuffled, so that repeated runs start from the same random state.
SEED = 42
torch.manual_seed(SEED)

# Hyperparameters
BATCH_SIZE = 64
# NOTE(review): EMBEDDING_DIM is not referenced by any cell visible in this
# notebook (the model constructor call does not pass it) - confirm it is needed.
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.5
LEARNING_RATE = 0.001
NUM_EPOCHS = 1  # quick run; the original inline comment records 50
# NOTE(review): CLIP is not passed to train() in the visible training cell -
# confirm whether gradient clipping is applied inside train.py.
CLIP = 5  # Gradient clipping

# Device configuration: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# Hold out 20% of the samples for validation; the remainder is used for training.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Use a dedicated, seeded generator so the train/validation membership is the
# same on every run. Without it the split is redrawn each time the cell runs,
# which makes validation losses from different runs incomparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Training batches are shuffled; validation batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=dataset.collate_fn)


In [23]:
# Number of batches per epoch in the training and validation loaders.
tuple(len(loader) for loader in (train_loader, val_loader))

(2657, 665)

In [19]:
# Build the network from the configured sizes, then move it to the selected device.
# NOTE(review): EMBEDDING_DIM is defined in the hyperparameter cell but is not
# passed here - confirm whether ArabicPoetryLSTM expects an embedding size.
model = ArabicPoetryLSTM(
    vocab_size=dataset.vocab_size,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
)
model = model.to(device)


In [20]:
# Optimizer: Adam over all model parameters at the configured learning rate.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
)

# Loss: cross-entropy in which targets equal to the dataset's padding index
# are ignored.
criterion = nn.CrossEntropyLoss(
    ignore_index=dataset.pad_idx,
)

In [24]:
# Display which device was selected (the recorded output below shows CPU).
device

device(type='cpu')

In [26]:
from train import train, evaluate, generate_text

In [28]:
# Per-epoch loss history, consumed later when plotting.
train_losses, val_losses = [], []

for epoch in range(NUM_EPOCHS):
    # One pass over the training loader, then one pass over the validation loader.
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Text is sampled only on every 10th epoch; all other epochs stop here.
    if (epoch + 1) % 10 != 0:
        continue

    # Prompt generation with the first 10 characters of the first sequence.
    start_sequence = dataset.sequences[0][:10]
    generated_text = generate_text(model, dataset, start_sequence, max_length=200, device=device)
    print(f'Generated text:\n{generated_text}\n')


KeyboardInterrupt: 

In [None]:
    # matplotlib is imported here because no cell visible in this notebook
    # brings `plt` into scope; without this import the cell raises NameError.
    import matplotlib.pyplot as plt

    # Plot the per-epoch training and validation losses on a single set of axes.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(train_losses, label='Training Loss')
    ax.plot(val_losses, label='Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Losses')
    ax.legend()
    plt.show()

    # Save the model's state_dict (its parameters) to a file in the working directory.
    torch.save(model.state_dict(), 'arabic_poetry_generator.pth')