## Project Introduction: Document Classification with Machine Learning

In this personal project, I explore how machine learning can be used to automatically classify news articles into relevant categories. This kind of document classification is a key component of intelligent content search engines, helping users find information quickly and accurately.

To implement this, I’ll use **PyTorch** and the **torchtext** library to preprocess and prepare raw text data for model training. The pipeline includes converting text into tensors, organizing it into batches, and building a classifier capable of predicting the topic of a given article.

This project demonstrates the practical application of natural language processing (NLP) and deep learning in organizing large volumes of unstructured text—a valuable capability for search, recommendation, and information retrieval systems.


In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os

In [None]:
# Seed PyTorch's and NumPy's global random number generators with a fixed
# value so that repeated runs start from the same random state.
torch.manual_seed(42)
np.random.seed(42)

class NewsDataset(Dataset):
    """Map-style dataset that serves (token-index tensor, label tensor) pairs.

    Each article is lower-cased, tokenized, mapped through ``vocab`` and then
    padded or truncated so that every sample has exactly ``max_length`` ids.
    ``vocab`` must contain the special entries ``'<PAD>'`` and ``'<UNK>'``.
    """

    def __init__(self, texts, labels, vocab, tokenizer, max_length=512):
        # Indexable collections of raw articles and their integer labels.
        self.texts, self.labels = texts, labels
        # Token -> index mapping and the callable that splits text into tokens.
        self.vocab, self.tokenizer = vocab, tokenizer
        # Fixed length of every encoded sample.
        self.max_length = max_length

    def __len__(self):
        """Return the number of articles in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded article and its label at position ``idx``."""
        article = str(self.texts[idx])
        target = self.labels[idx]

        # Unknown tokens fall back to the '<UNK>' index.
        token_ids = []
        for word in self.tokenizer(article.lower()):
            token_ids.append(self.vocab.get(word, self.vocab['<UNK>']))

        # Positive overflow -> too long, cut; otherwise right-pad with '<PAD>'.
        overflow = len(token_ids) - self.max_length
        if overflow > 0:
            token_ids = token_ids[:self.max_length]
        else:
            token_ids += [self.vocab['<PAD>']] * -overflow

        features = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(target, dtype=torch.long)
        return features, label_tensor

In [None]:
class TextClassifier(nn.Module):
    """Embedding -> bidirectional LSTM -> dropout -> linear classifier.

    The input is a batch of token-index sequences of shape
    (batch_size, seq_length); the output is one row of ``output_dim``
    unnormalised class scores per sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=2, dropout=0.3):
        super().__init__()

        # Dropout between stacked LSTM layers is only requested when there
        # actually is more than one layer.
        inter_layer_dropout = dropout if n_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)

    def forward(self, x):
        """Return class scores of shape (batch_size, output_dim) for ``x``."""
        token_vectors = self.embedding(x)  # (batch_size, seq_length, embed_dim)

        # Only the final hidden states are needed; per-step outputs and the
        # cell state are discarded.
        _, (h_n, _) = self.lstm(token_vectors)

        # h_n is (n_layers * 2, batch_size, hidden_dim); its last two entries
        # are the top layer's forward and backward states respectively.
        sentence_vector = torch.cat([h_n[-2], h_n[-1]], dim=1)

        return self.fc(self.dropout(sentence_vector))

In [None]:
class NewsClassifier:
    """End-to-end helper for training and using a news-topic classifier.

    The object owns the tokenizer, the vocabulary, the label mappings and the
    ``TextClassifier`` network, and exposes each step of the workflow
    (vocabulary building, label encoding, data loading, training, evaluation,
    prediction and checkpointing) as a method.
    """

    def __init__(self, embed_dim=128, hidden_dim=256, max_length=512):
        """Store hyper-parameters and pick the compute device.

        Args:
            embed_dim: Size of the token embeddings.
            hidden_dim: Hidden size of each LSTM direction.
            max_length: Number of token ids every text is padded/truncated to.
        """
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.max_length = max_length
        self.tokenizer = get_tokenizer('basic_english')
        # Populated later by build_vocab / prepare_labels / build_model
        # (or all at once by load_model).
        self.vocab = None
        self.label_to_idx = None
        self.idx_to_label = None
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def build_vocab(self, texts, min_freq=2):
        """Build the token -> index vocabulary from training texts.

        Args:
            texts: Iterable of raw documents.
            min_freq: Minimum number of occurrences for a token to be kept.

        Returns:
            The vocabulary dict (also stored on ``self.vocab``). Index 0 is
            ``'<PAD>'`` and index 1 is ``'<UNK>'``.
        """
        print("Building vocabulary...")

        # Tokenize all texts and count word frequencies
        counter = Counter()
        for text in texts:
            counter.update(self.tokenizer(str(text).lower()))

        # Create vocabulary with special tokens
        vocab = {'<PAD>': 0, '<UNK>': 1}
        for word, freq in counter.items():
            if freq >= min_freq:
                # setdefault keeps the reserved indices intact even if a
                # tokenizer ever emits a token equal to a special-token name.
                vocab.setdefault(word, len(vocab))

        print(f"Vocabulary size: {len(vocab)}")
        self.vocab = vocab
        return vocab

    def prepare_labels(self, labels):
        """Fit the label <-> index mappings and encode ``labels``.

        Indices are assigned in sorted label order.

        Returns:
            A list with the integer index of every entry in ``labels``.
        """
        unique_labels = sorted(set(labels))
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}

        return [self.label_to_idx[label] for label in labels]

    def create_data_loaders(self, train_texts, train_labels, test_texts, test_labels, batch_size=32):
        """Wrap the train and test splits in PyTorch data loaders.

        Returns:
            ``(train_loader, test_loader)``; only the training loader shuffles.
        """
        train_dataset = NewsDataset(train_texts, train_labels, self.vocab,
                                    self.tokenizer, self.max_length)
        test_dataset = NewsDataset(test_texts, test_labels, self.vocab,
                                   self.tokenizer, self.max_length)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        return train_loader, test_loader

    def build_model(self, num_classes):
        """Create the network on ``self.device`` and return it.

        Requires ``self.vocab`` to be set, since it determines the embedding
        table size.
        """
        self.model = TextClassifier(
            vocab_size=len(self.vocab),
            embed_dim=self.embed_dim,
            hidden_dim=self.hidden_dim,
            output_dim=num_classes
        ).to(self.device)

        print(f"Model built with {sum(p.numel() for p in self.model.parameters())} parameters")
        return self.model

    def train(self, train_loader, test_loader, num_epochs=10, learning_rate=0.001):
        """Train the model with Adam and cross-entropy loss.

        After every epoch the average training loss, the training accuracy and
        the accuracy on ``test_loader`` are printed.
        """
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

        print(f"Training on {self.device}")
        print("-" * 50)

        for epoch in range(num_epochs):
            # Training phase
            self.model.train()
            total_loss = 0.0
            correct_predictions = 0
            total_samples = 0
            num_batches = 0

            for data, target in train_loader:
                data, target = data.to(self.device), target.to(self.device)

                optimizer.zero_grad()
                output = self.model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                pred = output.argmax(dim=1)
                correct_predictions += (pred == target).sum().item()
                total_samples += target.size(0)
                num_batches += 1

            # An empty loader yields no batches; report zeros instead of
            # failing with a ZeroDivisionError.
            train_accuracy = correct_predictions / total_samples if total_samples else 0.0
            avg_loss = total_loss / num_batches if num_batches else 0.0

            # Validation phase
            test_accuracy = self.evaluate(test_loader)

            print(f'Epoch {epoch+1}/{num_epochs}:')
            print(f'  Train Loss: {avg_loss:.4f} | Train Acc: {train_accuracy:.4f}')
            print(f'  Test Acc: {test_accuracy:.4f}')
            print("-" * 50)

    def evaluate(self, test_loader):
        """Return the model's accuracy on ``test_loader``.

        Returns 0.0 when the loader yields no samples.
        """
        self.model.eval()
        correct_predictions = 0
        total_samples = 0

        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(self.device), target.to(self.device)
                pred = self.model(data).argmax(dim=1)
                correct_predictions += (pred == target).sum().item()
                total_samples += target.size(0)

        if total_samples == 0:
            return 0.0
        return correct_predictions / total_samples

    def _encode(self, text):
        """Turn one text into a list of exactly ``max_length`` vocabulary indices."""
        unk_idx = self.vocab['<UNK>']
        indices = [self.vocab.get(token, unk_idx)
                   for token in self.tokenizer(str(text).lower())]

        # Truncate first, then right-pad whatever is still missing.
        indices = indices[:self.max_length]
        indices.extend([self.vocab['<PAD>']] * (self.max_length - len(indices)))
        return indices

    def predict(self, texts):
        """Predict a label for every text in ``texts``.

        Args:
            texts: Iterable of raw documents.

        Returns:
            A list of predicted labels, in the same order as ``texts``.
        """
        self.model.eval()
        predictions = []

        with torch.no_grad():
            for text in texts:
                # The outer list adds the batch dimension: shape (1, max_length).
                input_tensor = torch.tensor([self._encode(text)], dtype=torch.long).to(self.device)

                pred_idx = self.model(input_tensor).argmax(dim=1).item()
                predictions.append(self.idx_to_label[pred_idx])

        return predictions

    def save_model(self, filepath):
        """Save the model weights, vocabulary, label maps and hyper-parameters."""
        checkpoint = {
            'model_state_dict': self.model.state_dict(),
            'vocab': self.vocab,
            'label_to_idx': self.label_to_idx,
            'idx_to_label': self.idx_to_label,
            'embed_dim': self.embed_dim,
            'hidden_dim': self.hidden_dim,
            'max_length': self.max_length
        }
        torch.save(checkpoint, filepath)
        print(f"Model saved to {filepath}")

    def load_model(self, filepath, num_classes=None):
        """Restore a classifier previously written by ``save_model``.

        Args:
            filepath: Path of the checkpoint file.
            num_classes: Number of output classes. When omitted it is taken
                from the label mapping stored in the checkpoint.

        Note:
            ``torch.load`` is pickle-based; only load checkpoints from
            sources you trust.
        """
        checkpoint = torch.load(filepath, map_location=self.device)

        self.vocab = checkpoint['vocab']
        self.label_to_idx = checkpoint['label_to_idx']
        self.idx_to_label = checkpoint['idx_to_label']
        self.embed_dim = checkpoint['embed_dim']
        self.hidden_dim = checkpoint['hidden_dim']
        self.max_length = checkpoint['max_length']

        if num_classes is None:
            num_classes = len(self.label_to_idx)

        self.build_model(num_classes)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        print(f"Model loaded from {filepath}")
    

In [None]:
# Sample usage
def create_sample_data():
    """Return a tiny hand-written corpus as ``(texts, labels)`` lists.

    The corpus holds five headlines for each of four categories, listed
    category by category.
    """
    headlines_by_category = {
        "Technology": (
            "Apple releases new iPhone with advanced AI capabilities and improved camera system",
            "Google announces breakthrough in quantum computing research",
            "Microsoft launches new cloud computing platform for enterprises",
            "Tesla unveils latest electric vehicle model with autonomous driving features",
            "Meta develops new virtual reality headset for gaming and productivity",
        ),
        "Sports": (
            "Basketball championship finals draw record-breaking television audience",
            "Olympic swimming records broken in international competition",
            "Football team wins championship after dramatic overtime victory",
            "Tennis tournament features intense matches between top-ranked players",
            "Soccer world cup preparations underway in host country",
        ),
        "Politics": (
            "Congress passes new legislation on healthcare reform",
            "Presidential election campaign enters final phase with debates",
            "Supreme Court ruling affects voting rights across the nation",
            "International summit focuses on climate change policies",
            "Senate committee investigates government spending on infrastructure",
        ),
        "Business": (
            "Stock market reaches new highs amid economic recovery",
            "Major corporation announces merger with international partner",
            "Cryptocurrency prices fluctuate following regulatory announcements",
            "Retail sales increase during holiday shopping season",
            "Oil prices rise due to supply chain disruptions",
        ),
    }

    # Flatten the mapping into two parallel lists, keeping insertion order.
    texts, labels = [], []
    for category, headlines in headlines_by_category.items():
        texts.extend(headlines)
        labels.extend([category] * len(headlines))
    return texts, labels

In [None]:
def main():
    """Run the demo end to end: sample data, training, predictions, saving."""
    print("News Article Classification with PyTorch")
    print("=" * 50)

    # Toy corpus
    texts, labels = create_sample_data()
    print(f"Created {len(texts)} sample articles across {len(set(labels))} categories")

    classifier = NewsClassifier(embed_dim=64, hidden_dim=128, max_length=256)

    # Stratified 70/30 split so every category appears on both sides.
    split = train_test_split(
        texts, labels, test_size=0.3, random_state=42, stratify=labels
    )
    train_texts, test_texts, train_labels, test_labels = split

    # The vocabulary and the label mapping are fitted on the training split
    # only; test labels are merely looked up in the fitted mapping.
    classifier.build_vocab(train_texts, min_freq=1)
    encoded_train_labels = classifier.prepare_labels(train_labels)
    encoded_test_labels = []
    for label in test_labels:
        encoded_test_labels.append(classifier.label_to_idx[label])

    loaders = classifier.create_data_loaders(
        train_texts, encoded_train_labels, test_texts, encoded_test_labels, batch_size=4
    )
    train_loader, test_loader = loaders

    # Network construction and training
    classifier.build_model(len(classifier.label_to_idx))
    classifier.train(train_loader, test_loader, num_epochs=15, learning_rate=0.001)

    # Unseen headlines to classify
    new_articles = [
        "New smartphone features artificial intelligence and machine learning capabilities",
        "Football team scores winning touchdown in championship game",
        "Stock market experiences volatility due to economic uncertainty"
    ]

    print("\nPredictions on new articles:")
    print("-" * 30)
    predicted = classifier.predict(new_articles)
    for position, article in enumerate(new_articles):
        print(f"Article: {article[:60]}...")
        print(f"Predicted Category: {predicted[position]}\n")

    # Persist weights, vocabulary and label maps
    classifier.save_model('news_classifier_model.pth')

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()