# Deep Learning NLP Tutorial

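This notebook walks through a minimal sentiment-classification pipeline in PyTorch: preprocessing text and building a vocabulary, wrapping the data in a dataset and data loaders, training and evaluating a `TextClassifier`, and predicting the sentiment of new sentences.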

## 1. Setup and Imports

In [None]:
import sys
sys.path.append('../src')

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from deep_learning_nlp.models import TextClassifier
from deep_learning_nlp.data_processing import TextPreprocessor, TextDataset
from deep_learning_nlp.data_processing import DataLoader as CustomDataLoader
from deep_learning_nlp.utils import train_model, evaluate_model

# Environment check: report the installed PyTorch version and whether a CUDA
# device is visible to this kernel, so later results can be tied to the setup
# they were produced on.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 2. Text Preprocessing

In [None]:
# Toy sentiment corpus. Each entry pairs a short review with its label
# (1: positive, 0: negative); the parallel `texts` / `labels` lists used by
# the rest of the notebook are derived from it.
labeled_reviews = [
    ("This is a great movie!", 1),
    ("I loved this film.", 1),
    ("Terrible waste of time.", 0),
    ("Not worth watching.", 0),
    ("Amazing storyline and acting!", 1),
    ("Boring and predictable.", 0),
    ("Fantastic performances all around.", 1),
    ("Very disappointing.", 0),
]
texts = [review for review, _ in labeled_reviews]
labels = [polarity for _, polarity in labeled_reviews]

# Show the first three labelled examples as a quick sanity check.
print("Sample texts:")
for text, label in list(zip(texts, labels))[:3]:
    print(f"  [{label}] {text}")

In [None]:
# Create the preprocessor with lowercase=True and remove_punctuation=False.
preprocessor = TextPreprocessor(remove_punctuation=False, lowercase=True)

# Build the two vocabulary mappings from the full sample corpus.
# min_freq=1 presumably keeps every token seen at least once; confirm in
# TextPreprocessor.build_vocab.
vocab_mappings = preprocessor.build_vocab(texts, min_freq=1)
word2idx, idx2word = vocab_mappings

# Report the vocabulary size and peek at the first ten (word, index) entries.
vocab_preview = list(word2idx.items())[:10]
print(f"Vocabulary size: {len(word2idx)}")
print(f"\nFirst 10 words in vocabulary:")
print(vocab_preview)

## 3. Create Dataset and DataLoader

In [None]:
# Train/test split: the first six samples train the model, the rest are held out.
N_TRAIN = 6
train_texts, test_texts = texts[:N_TRAIN], texts[N_TRAIN:]
train_labels, test_labels = labels[:N_TRAIN], labels[N_TRAIN:]

# Wrap each split in a TextDataset that shares the vocabulary and max_length.
MAX_LENGTH = 20
train_dataset = TextDataset(train_texts, train_labels, word2idx, max_length=MAX_LENGTH)
test_dataset = TextDataset(test_texts, test_labels, word2idx, max_length=MAX_LENGTH)

# Both loaders use the same batch size and the project's collate function;
# only the training loader shuffles.
loader_options = dict(batch_size=2, collate_fn=CustomDataLoader.collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

## 4. Define Model

In [None]:
# Seed PyTorch's global RNG before any weights are created so that model
# initialisation (and later draws from the global RNG) are repeatable after
# Restart Kernel -> Run All. Some CUDA kernels may still be nondeterministic.
SEED = 42
torch.manual_seed(SEED)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize model and move it to the selected device right away. The
# optimizer created in the training cell is built from model.parameters(),
# and PyTorch recommends moving a model to its device before constructing
# optimizers for it. `.to(device)` returns the module itself, so `model`
# is still the TextClassifier instance.
model = TextClassifier(
    vocab_size=len(word2idx),
    embedding_dim=50,
    hidden_dim=128,
    output_dim=2,  # two classes: 0 = negative, 1 = positive
    n_layers=2,
    bidirectional=True,
    dropout=0.5
).to(device)

print(f"\nModel architecture:")
print(model)

# Count parameters: all of them, and the subset that receives gradients.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

## 5. Train Model

In [None]:
# Training hyperparameters, named so they are easy to spot and tune.
LEARNING_RATE = 0.001
NUM_EPOCHS = 10

# Cross-entropy loss and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Run the project's training helper. It hands back the model together with
# the loss history that the visualisation section plots against epochs.
training_result = train_model(
    model, train_loader, criterion, optimizer, device, num_epochs=NUM_EPOCHS
)
model, train_losses = training_result

## 6. Evaluate Model

In [None]:
# Score the trained model on the held-out test loader, using the same loss
# function as in training. The result is unpacked into (test_loss, accuracy);
# presumably the mean test loss and the classification accuracy. Confirm the
# exact definitions and scale against deep_learning_nlp.utils.evaluate_model.
# NOTE(review): neither value is displayed in this cell; whether
# evaluate_model prints them itself is not visible from this notebook.
test_loss, accuracy = evaluate_model(model, test_loader, criterion, device)

## 7. Visualize Training

In [None]:
import matplotlib.pyplot as plt

# Draw the recorded training losses as a single line, one marker per entry,
# using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, marker='o')
ax.set_title('Training Loss Over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
plt.show()

## 8. Test Predictions

In [None]:
def predict_sentiment(text, model, word2idx, preprocessor, device):
    """Predict sentiment for a single text.

    Args:
        text: Raw input string to classify.
        model: Trained classifier, called as ``model(token_ids, lengths)``.
            Its output is treated as per-class scores with the classes on
            dimension 1. The model is switched to eval mode and left there.
        word2idx: Vocabulary mapping passed through to
            ``preprocessor.encode_text``.
        preprocessor: Object exposing ``encode_text(text, word2idx)`` that
            returns a sequence of integer token indices.
        device: ``torch.device`` on which the input tensors are created; the
            model is expected to already be on this device.

    Returns:
        tuple: ``(sentiment, confidence)`` where ``sentiment`` is
        ``"Positive"`` if class 1 has the highest probability and
        ``"Negative"`` otherwise, and ``confidence`` is the softmax
        probability of the predicted class as a Python float.

    Raises:
        ValueError: If ``text`` encodes to zero tokens, since there is no
            sequence to feed to the model.
    """
    model.eval()

    # Encode text
    encoded = preprocessor.encode_text(text, word2idx)
    if len(encoded) == 0:
        # Fail early with a clear message rather than passing a zero-length
        # sequence to the model.
        raise ValueError("Cannot predict sentiment: text encodes to zero tokens.")

    # Build a batch of one directly on the target device (shape: 1 x seq_len).
    text_tensor = torch.tensor(encoded, dtype=torch.long, device=device).unsqueeze(0)
    # NOTE(review): lengths are created on `device`, as before. If the model
    # packs sequences with pack_padded_sequence, that function requires a CPU
    # lengths tensor; confirm the model handles this when running on CUDA.
    text_length = torch.tensor([len(encoded)], dtype=torch.long, device=device)

    # Predict without tracking gradients.
    with torch.no_grad():
        output = model(text_tensor, text_length)
        probabilities = torch.softmax(output, dim=1)
        prediction = torch.argmax(probabilities, dim=1).item()

    sentiment = "Positive" if prediction == 1 else "Negative"
    confidence = probabilities[0][prediction].item()

    return sentiment, confidence

# Sentences that are not part of the sample corpus, used to try the model.
test_examples = [
    "This movie is absolutely wonderful!",
    "I really disliked this film.",
    "It was okay, not great but not terrible."
]

# Everything predict_sentiment needs besides the text is the same for each
# example, so bundle it once.
shared_args = (model, word2idx, preprocessor, device)

print("Testing predictions on new examples:\n")
for text in test_examples:
    sentiment, confidence = predict_sentiment(text, *shared_args)
    print(f"Text: {text}")
    print(f"Prediction: {sentiment} (confidence: {confidence:.2%})\n")