In [1]:
import math
from collections import Counter
import random

In [3]:
import math
import random
from collections import Counter

def load_and_split_data(file_path, train_ratio=0.7, val_ratio=0.15):
    """
    Loads text data from a file and splits it into training, validation, and test sets.
    Args:
        file_path (str): Path to the data file
        train_ratio (float): Proportion of data for training
        val_ratio (float): Proportion of data for validation
    Returns:
        tuple: Contains train, validation, and test data and labels
    """
    print(f"Reading data from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print(f"Total number of samples loaded: {len(lines)}")
    
    # Count label distribution
    label_counts = Counter(line.split('\t')[0] for line in lines)
    print("\nLabel Distribution:")
    for label, count in label_counts.items():
        print(f"{label}: {count} ({count/len(lines)*100:.2f}%)")
    
    # Set random seed for reproducibility
    random.seed(42)
    random.shuffle(lines)
    
    # Calculate split points
    train_end = int(len(lines) * train_ratio)
    val_end = int(len(lines) * (train_ratio + val_ratio))
    
    # Split data
    train_data = lines[:train_end]
    val_data = lines[train_end:val_end]
    test_data = lines[val_end:]
    
    def split_data(data):
        sentences = []
        labels = []
        for line in data:
            if '\t' in line:
                label, text = line.strip().split('\t', 1)
                # Convert 'spam' to 1 and 'ham' to 0
                label_int = 1 if label.lower() == 'spam' else 0
                labels.append(label_int)
                sentences.append(text)
        return sentences, labels
    
    return split_data(train_data) + split_data(val_data) + split_data(test_data)

def preprocess_text(text):
    """
    Cleans text by removing special characters and converting to lowercase.
    Args:
        text (str): Input text to clean
    Returns:
        str: Cleaned text
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = ' '.join(word for word in text.split() 
                   if not word.startswith(('http:', 'https:', 'www.')))
    
    # Keep only letters and spaces
    cleaned_text = ""
    for char in text:
        if char.isalpha() or char.isspace():
            cleaned_text += char
    
    return cleaned_text

def tokenize(text):
    """
    Converts text into tokens, removes stopwords, and applies stemming.
    Args:
        text (str): Input text to tokenize
    Returns:
        list: List of processed tokens
    """
    stopwords = {
        "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", 
        "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", 
        "to", "was", "were", "will", "with", "the", "this", "but", "they",
        "have", "had", "what", "when", "where", "who", "which", "why", "how"
    }
    
    def simple_stem(word):
        """Simple word stemming rules"""
        if len(word) < 4:
            return word
        if word.endswith('ing'):
            return word[:-3]
        elif word.endswith('ed'):
            return word[:-2]
        elif word.endswith('s'):
            return word[:-1]
        return word
    
    words = text.split()
    return [simple_stem(word) for word in words 
            if word not in stopwords and len(word) > 1]

class TfidfVectorizer:
    """
    Converts text documents to TF-IDF feature vectors.
    """
    def __init__(self):
        self.vocabulary = set()
        self.idf = {}
        self.vocab_index = {}
    
    def fit(self, documents):
        """
        Builds vocabulary and computes IDF scores from documents
        Args:
            documents (list): List of tokenized documents
        """
        # Build vocabulary
        for doc in documents:
            self.vocabulary.update(doc)
        
        self.vocab_index = {word: idx for idx, word in enumerate(sorted(self.vocabulary))}
        
        # Compute document frequencies
        doc_freq = Counter()
        for doc in documents:
            doc_words = set(doc)
            for word in doc_words:
                doc_freq[word] += 1
        
        # Calculate IDF scores
        num_docs = len(documents)
        self.idf = {word: math.log((num_docs + 1)/(doc_freq[word] + 1)) + 1 
                   for word in self.vocabulary}
    
    def transform(self, documents):
        """
        Transforms documents into TF-IDF feature vectors
        Args:
            documents (list): List of tokenized documents
        Returns:
            list: List of TF-IDF feature vectors
        """
        X = []
        for doc in documents:
            tf = Counter(doc)
            doc_len = len(doc) if len(doc) > 0 else 1
            
            features = [0.0] * len(self.vocabulary)
            for word in set(doc):
                if word in self.vocab_index:
                    idx = self.vocab_index[word]
                    tf_val = tf[word] / doc_len
                    features[idx] = tf_val * self.idf.get(word, 0)
            
            X.append(features)
        return X

class LogisticRegression:
    def __init__(self, learning_rate=0.01, lambda_reg=0.1, num_epochs=100, 
                 early_stop_threshold=1e-4, batch_size=32):
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.num_epochs = num_epochs
        self.early_stop_threshold = early_stop_threshold
        self.batch_size = batch_size
        self.weights = None
        self.bias = 0
        self.training_history = []
    
    def sigmoid(self, z):
        """Compute sigmoid function"""
        # Clip z to prevent overflow
        z = min(max(z, -100), 100)
        return 1 / (1 + math.exp(-z))
    
    def compute_loss(self, X, y, indices):
        """Compute binary cross-entropy loss with L2 regularization"""
        loss = 0
        for i in indices:
            z = sum(X[i][j] * self.weights[j] for j in range(len(self.weights))) + self.bias
            pred = self.sigmoid(z)
            # Add small epsilon to avoid log(0)
            loss -= (y[i] * math.log(pred + 1e-15) + (1 - y[i]) * math.log(1 - pred + 1e-15))
        
        # Add L2 regularization term
        l2_term = self.lambda_reg * sum(w * w for w in self.weights)
        return (loss / len(indices)) + l2_term
    
    def compute_class_weights(self, y):
        """
        Compute class weights to handle class imbalance
        Args:
            y (list): Labels
        Returns:
            dict: Class weights
        """
        counts = Counter(y)
        total = len(y)
        weights = {
            0: total / (2 * counts[0]) if counts[0] > 0 else 1,
            1: total / (2 * counts[1]) if counts[1] > 0 else 1
        }
        print(f"Class weights: {weights}")
        return weights
    
    def train(self, X, y, method='sgd'):
        """
        Train the model using either SGD or mini-batch gradient descent
        Args:
            X (list): Feature vectors
            y (list): Labels
            method (str): 'sgd' or 'minibatch'
        """
        if not self.weights:
            # Initialize weights with small random values
            self.weights = [random.uniform(-0.1, 0.1) for _ in range(len(X[0]))]
        
        # Compute class weights for balanced training
        class_weights = self.compute_class_weights(y)
        
        print(f"\nTraining with {method.upper()}:")
        print("Epoch\tLoss\t\tΔLoss")
        
        n_samples = len(y)
        prev_loss = float('inf')
        no_improvement_count = 0
        
        for epoch in range(self.num_epochs):
            indices = list(range(n_samples))
            random.shuffle(indices)
            
            if method == 'sgd':
                batch_indices = [[i] for i in indices]
            else:
                batch_indices = [indices[i:i + self.batch_size] 
                               for i in range(0, len(indices), self.batch_size)]
            
            epoch_loss = 0
            for batch in batch_indices:
                weight_gradients = [0] * len(self.weights)
                bias_gradient = 0
                
                for i in batch:
                    # Forward pass
                    z = sum(X[i][j] * self.weights[j] for j in range(len(self.weights))) + self.bias
                    pred = self.sigmoid(z)
                    
                    # Apply class weights to error
                    sample_weight = class_weights[y[i]]
                    error = sample_weight * (pred - y[i])
                    
                    # Accumulate gradients
                    for j in range(len(self.weights)):
                        weight_gradients[j] += error * X[i][j]
                    bias_gradient += error
                
                # Apply updates with regularization
                batch_size = len(batch)
                for j in range(len(self.weights)):
                    reg_gradient = 2 * self.lambda_reg * self.weights[j]
                    self.weights[j] -= self.learning_rate * (weight_gradients[j]/batch_size + reg_gradient)
                self.bias -= self.learning_rate * (bias_gradient/batch_size)
                
                epoch_loss += self.compute_loss(X, y, batch)
            
            avg_loss = epoch_loss / len(batch_indices)
            loss_change = prev_loss - avg_loss
            self.training_history.append(avg_loss)
            
            if epoch % 5 == 0:
                print(f"{epoch}\t{avg_loss:.6f}\t{loss_change:.6f}")
            
            # Early stopping with patience
            if abs(loss_change) < self.early_stop_threshold:
                no_improvement_count += 1
                if no_improvement_count >= 3:  # Wait for 3 epochs of no improvement
                    print(f"\nEarly stopping at epoch {epoch}")
                    break
            else:
                no_improvement_count = 0
            
            prev_loss = avg_loss
    
    def predict(self, X):
        """Make predictions on new data"""
        predictions = []
        for x in X:
            z = sum(x[j] * self.weights[j] for j in range(len(self.weights))) + self.bias
            prob = self.sigmoid(z)
            predictions.append(1 if prob >= 0.5 else 0)
        return predictions

def evaluate(y_true, y_pred):
    """
    Compute evaluation metrics
    Returns:
        tuple: (accuracy, precision, recall, f1)
    """
    tp = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 1 and yp == 1)
    tn = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 0 and yp == 0)
    fp = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 0 and yp == 1)
    fn = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 1 and yp == 0)
    
    accuracy = (tp + tn) / len(y_true)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return accuracy, precision, recall, f1

def perform_cross_validation(X, y, lambda_values, k=5):
    """
    Perform k-fold cross-validation to find the best regularization parameter
    Args:
        X (list): Feature vectors
        y (list): Labels
        lambda_values (list): List of lambda values to try
        k (int): Number of folds
    Returns:
        float: Best lambda value
    """
    print("\nPerforming cross-validation...")
    n_samples = len(y)
    fold_size = n_samples // k
    best_lambda = None
    best_score = -float('inf')
    
    for lambda_val in lambda_values:
        print(f"\nTrying lambda = {lambda_val}")
        scores = []
        
        for fold in range(k):
            # Split data into training and validation
            val_start = fold * fold_size
            val_end = val_start + fold_size
            
            X_val = X[val_start:val_end]
            y_val = y[val_start:val_end]
            X_train = X[:val_start] + X[val_end:]
            y_train = y[:val_start] + y[val_end:]
            
            # Train model
            model = LogisticRegression(lambda_reg=lambda_val, num_epochs=50)
            model.train(X_train, y_train, method='minibatch')
            
            # Evaluate
            y_pred = model.predict(X_val)
            accuracy, _, _, f1 = evaluate(y_val, y_pred)
            scores.append(f1)
        
        avg_score = sum(scores) / len(scores)
        print(f"Average F1-score: {avg_score:.4f}")
        
        if avg_score > best_score:
            best_score = avg_score
            best_lambda = lambda_val
    
    print(f"\nBest lambda value: {best_lambda} (F1-score: {best_score:.4f})")
    return best_lambda

def main():
    # Example usage with a sample input file
    # Assuming input file format: label\ttext
    file_path = "SMSSpamCollection"  # Replace with your data file path
    
    # Step 1: Load and split data
    train_texts, train_labels, val_texts, val_labels, test_texts, test_labels = load_and_split_data(file_path)
    
    # Step 2: Preprocess and tokenize texts
    print("\nPreprocessing and tokenizing texts...")
    train_tokens = [tokenize(preprocess_text(text)) for text in train_texts]
    val_tokens = [tokenize(preprocess_text(text)) for text in val_texts]
    test_tokens = [tokenize(preprocess_text(text)) for text in test_texts]
    
    # Step 3: Create TF-IDF features
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_tokens)
    
    X_train = vectorizer.transform(train_tokens)
    X_val = vectorizer.transform(val_tokens)
    X_test = vectorizer.transform(test_tokens)
    
    # Step 4: Train and evaluate with both SGD and mini-batch
    for method in ['sgd', 'minibatch']:
        print(f"\nStep 4: Training with {method.upper()}...")
        model = LogisticRegression(
            learning_rate=0.1,  # Increased learning rate
            lambda_reg=0.01,    # Reduced regularization
            num_epochs=35,
            early_stop_threshold=1e-4,
            batch_size=32
        )
        model.train(X_train, train_labels, method=method)
        
        # Evaluate on both validation and test sets
        print(f"\nStep 5: Evaluating {method.upper()} model...")
        
        # Validation set evaluation
        val_pred = model.predict(X_val)
        val_accuracy, val_precision, val_recall, val_f1 = evaluate(val_labels, val_pred)
        print(f"\n{method.upper()} Validation Results:")
        print(f"Accuracy:  {val_accuracy:.4f}")
        print(f"Precision: {val_precision:.4f}")
        print(f"Recall:    {val_recall:.4f}")
        print(f"F1-score:  {val_f1:.4f}")
        
        # Test set evaluation
        test_pred = model.predict(X_test)
        test_accuracy, test_precision, test_recall, test_f1 = evaluate(test_labels, test_pred)
        print(f"\n{method.upper()} Test Results:")
        print(f"Accuracy:  {test_accuracy:.4f}")
        print(f"Precision: {test_precision:.4f}")
        print(f"Recall:    {test_recall:.4f}")
        print(f"F1-score:  {test_f1:.4f}")

if __name__ == "__main__":
    main()

Reading data from SMSSpamCollection...
Total number of samples loaded: 5574

Label Distribution:
ham: 4827 (86.60%)
spam: 747 (13.40%)

Preprocessing and tokenizing texts...

Step 4: Training with SGD...
Class weights: {0: 0.5758783584292885, 1: 3.794747081712062}

Training with SGD:
Epoch	Loss		ΔLoss
0	0.565894	inf
5	0.548030	0.004362
10	0.551706	0.003164
15	0.549920	0.003600
20	0.551242	0.002120
25	0.552730	-0.001959
30	0.554134	-0.004919

Step 5: Evaluating SGD model...

SGD Validation Results:
Accuracy:  0.9187
Precision: 0.6524
Recall:    0.9068
F1-score:  0.7589

SGD Test Results:
Accuracy:  0.9283
Precision: 0.6821
Recall:    0.8957
F1-score:  0.7744

Step 4: Training with MINIBATCH...
Class weights: {0: 0.5758783584292885, 1: 3.794747081712062}

Training with MINIBATCH:
Epoch	Loss		ΔLoss
0	0.826326	inf
5	0.604667	0.011946
10	0.589657	0.001560
15	0.588511	-0.005011
20	0.580150	0.010970
25	0.592079	-0.009504
30	0.588796	-0.002602

Step 5: Evaluating MINIBATCH model...

MINIBATCH 

In [14]:
import random
import math
from collections import Counter

def load_and_split_data(file_path, is_sms=True, train_ratio=0.7, val_ratio=0.15):
    """
    Loads text data from a file and splits it into training, validation, and test sets.
    Works for both SMS and author classification tasks.
    """
    print(f"Reading data from {file_path}...")
    texts = []
    labels = []
    
    if is_sms:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if '\t' in line:
                    label, text = line.strip().split('\t', 1)
                    label_int = 1 if label.lower() == 'spam' else 0
                    labels.append(label_int)
                    texts.append(text)
    else:
        author_map = {"Arthur Conan Doyle": 0, "Fyodor Dostoyevsky": 1, "Jane Austen": 2}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if '\t' in line:
                    author, text = line.strip().split('\t', 1)
                    if author in author_map:
                        labels.append(author_map[author])
                        texts.append(text)
    
    if not texts or not labels:
        raise ValueError("No valid data found in the input file")
    
    # Print data distribution
    label_counts = Counter(labels)
    print("\nLabel Distribution:")
    for label, count in sorted(label_counts.items()):
        print(f"Label {label}: {count} ({count/len(labels)*100:.2f}%)")
    
    # Shuffle and split data
    combined = list(zip(texts, labels))
    random.seed(42)
    random.shuffle(combined)
    texts, labels = zip(*combined)
    
    train_end = int(len(texts) * train_ratio)
    val_end = int(len(texts) * (train_ratio + val_ratio))
    
    if train_end == 0 or val_end == train_end:
        raise ValueError("Dataset too small for the specified split ratios")
    
    return (
        (list(texts[:train_end]), list(labels[:train_end])),
        (list(texts[train_end:val_end]), list(labels[train_end:val_end])),
        (list(texts[val_end:]), list(labels[val_end:]))
    )

def preprocess_text(text):
    """Cleans text by removing special characters and converting to lowercase."""
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = ' '.join(word for word in text.split() 
                   if not word.startswith(('http:', 'https:', 'www.')))
    
    # Keep only letters and spaces
    cleaned_text = ""
    for char in text:
        if char.isalpha() or char.isspace():
            cleaned_text += char
    
    return ' '.join(cleaned_text.split())  # Normalize whitespace

def tokenize(text):
    """Converts text into tokens, removes stopwords, and applies stemming."""
    if not isinstance(text, str):
        return []
        
    stopwords = {
        "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", 
        "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", 
        "to", "was", "were", "will", "with", "the", "this", "but", "they",
        "have", "had", "what", "when", "where", "who", "which", "why", "how"
    }
    
    def simple_stem(word):
        """Simple word stemming rules"""
        if not word or len(word) < 4:
            return word
        if word.endswith('ing'):
            return word[:-3]
        elif word.endswith('ed'):
            return word[:-2]
        elif word.endswith('s'):
            return word[:-1]
        return word
    
    words = text.split()
    return [simple_stem(word) for word in words 
            if word and word not in stopwords and len(word) > 1]

class TfidfVectorizer:
    """Converts text documents to TF-IDF feature vectors."""
    def __init__(self):
        self.vocabulary = set()
        self.idf = {}
        self.vocab_index = {}
    
    def fit(self, documents):
        """Builds vocabulary and computes IDF scores from documents"""
        if not documents:
            raise ValueError("No documents provided for fitting")
            
        # Build vocabulary
        for doc in documents:
            if doc:  # Skip empty documents
                self.vocabulary.update(doc)
        
        if not self.vocabulary:
            raise ValueError("No valid terms found in documents")
            
        self.vocab_index = {word: idx for idx, word in enumerate(sorted(self.vocabulary))}
        
        # Compute document frequencies
        doc_freq = Counter()
        for doc in documents:
            doc_words = set(doc)
            for word in doc_words:
                doc_freq[word] += 1
        
        # Calculate IDF scores
        num_docs = len(documents)
        self.idf = {word: math.log((num_docs + 1)/(doc_freq[word] + 1)) + 1 
                   for word in self.vocabulary}
    
    def transform(self, documents):
        """Transforms documents into TF-IDF feature vectors"""
        if not documents:
            raise ValueError("No documents provided for transformation")
            
        if not self.vocabulary:
            raise ValueError("Vectorizer needs to be fitted before transform")
            
        X = []
        for doc in documents:
            tf = Counter(doc)
            doc_len = len(doc) if doc else 1
            
            features = [0.0] * len(self.vocabulary)
            for word in set(doc):
                if word in self.vocab_index:
                    idx = self.vocab_index[word]
                    tf_val = tf[word] / doc_len
                    features[idx] = tf_val * self.idf.get(word, 0)
            
            X.append(features)
        return X

class LogisticRegression:
    """Logistic Regression with support for both binary and multiclass classification."""
    def __init__(self, num_features, num_classes=2, learning_rate=0.1, lambda_reg=0.01, 
                 num_epochs=50, batch_size=64):
        if num_features <= 0:
            raise ValueError("Number of features must be positive")
        if num_classes < 2:
            raise ValueError("Number of classes must be at least 2")
        if learning_rate <= 0:
            raise ValueError("Learning rate must be positive")
        # if lambda_reg < 0:
        #     raise ValueError("Lambda regularization must be non-negative")
            
        self.num_classes = num_classes
        self.weights = [[random.uniform(-0.1, 0.1) for _ in range(num_features)] 
                       for _ in range(num_classes if num_classes > 2 else 1)]
        self.biases = [0] * (num_classes if num_classes > 2 else 1)
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.num_epochs = num_epochs
        self.batch_size = batch_size

    def sigmoid(self, z):
        """Compute sigmoid function with overflow protection"""
        if z < -709:  # np.log(np.finfo(np.float64).tiny)
            return 0.0
        elif z > 709:  # np.log(np.finfo(np.float64).max)
            return 1.0
        return 1 / (1 + math.exp(-z))

    def softmax(self, scores):
        """Compute softmax probabilities with overflow protection"""
        max_score = max(scores)
        exp_scores = [math.exp(score - max_score) for score in scores]
        total = sum(exp_scores)
        if total == 0:
            return [1.0/len(scores)] * len(scores)
        return [exp_score / total for exp_score in exp_scores]

    def train(self, X, y, early_stopping_patience=3, min_delta=0.001):
        """Train with early stopping"""
        if not X or not y:
            raise ValueError("Empty training data")
        if len(X) != len(y):
            raise ValueError("Features and labels must have the same length")
            
        n_samples = len(y)
        print("\nTraining model:")
        print("Epoch\tLoss")
        
        best_loss = float('inf')
        patience_counter = 0
        
        for epoch in range(self.num_epochs):
            indices = list(range(n_samples))
            random.shuffle(indices)
            
            # Create mini-batches
            batches = [indices[i:i + self.batch_size] 
                      for i in range(0, len(indices), self.batch_size)]
            
            for batch in batches:
                if self.num_classes == 2:
                    self._train_binary_batch(X, y, batch)
                else:
                    self._train_multiclass_batch(X, y, batch)
            
            # Calculate loss
            current_loss = self._calculate_loss(X, y)
            print(f"{epoch}\t{current_loss:.6f}")
            
            # Early stopping check
            if current_loss < best_loss - min_delta:
                best_loss = current_loss
                patience_counter = 0
            else:
                patience_counter += 1
                
            if patience_counter >= early_stopping_patience:
                print(f"Early stopping triggered at epoch {epoch}")
                break

    def _train_binary_batch(self, X, y, batch):
        """Train a batch for binary classification"""
        grad_w = [0] * len(self.weights[0])
        grad_b = 0
        
        for idx in batch:
            # Forward pass
            z = sum(X[idx][j] * self.weights[0][j] for j in range(len(X[idx]))) + self.biases[0]
            y_pred = self.sigmoid(z)
            
            # Gradient computation
            error = y_pred - y[idx]
            for j in range(len(X[idx])):
                grad_w[j] += error * X[idx][j]
            grad_b += error
        
        # Update weights and bias
        batch_size = len(batch)
        for j in range(len(self.weights[0])):
            reg_term = 2 * self.lambda_reg * self.weights[0][j]
            self.weights[0][j] -= self.learning_rate * (grad_w[j]/batch_size + reg_term)
        self.biases[0] -= self.learning_rate * (grad_b/batch_size)

    def _train_multiclass_batch(self, X, y, batch):
        """Train a batch for multiclass classification with class weights"""
        # Calculate class weights
        class_counts = Counter(y)
        total_samples = len(y)
        class_weights = {c: total_samples / (len(class_counts) * count) 
                        for c, count in class_counts.items()}
        
        gradients = [[0] * len(self.weights[0]) for _ in range(self.num_classes)]
        bias_grads = [0] * self.num_classes
        
        for idx in batch:
            # Forward pass
            scores = [sum(X[idx][j] * self.weights[c][j] for j in range(len(X[idx]))) + self.biases[c]
                     for c in range(self.num_classes)]
            probs = self.softmax(scores)
            
            # Gradient computation with class weights
            weight = class_weights[y[idx]]
            for c in range(self.num_classes):
                error = probs[c]
                if c == y[idx]:
                    error -= 1
                error *= weight
                
                for j in range(len(X[idx])):
                    gradients[c][j] += error * X[idx][j]
                bias_grads[c] += error
        
        # Update weights and biases
        batch_size = len(batch)
        for c in range(self.num_classes):
            for j in range(len(self.weights[c])):
                reg_term = 2 * self.lambda_reg * self.weights[c][j]
                self.weights[c][j] -= self.learning_rate * (gradients[c][j]/batch_size + reg_term)
            self.biases[c] -= self.learning_rate * (bias_grads[c]/batch_size)

    def _calculate_loss(self, X, y):
        """Calculate the total loss"""
        loss = 0
        epsilon = 1e-15  # Small constant to prevent log(0)
        
        for i in range(len(X)):
            if self.num_classes == 2:
                z = sum(X[i][j] * self.weights[0][j] for j in range(len(X[i]))) + self.biases[0]
                y_pred = self.sigmoid(z)
                y_pred = max(min(y_pred, 1 - epsilon), epsilon)  # Clip predictions
                loss -= y[i] * math.log(y_pred) + (1 - y[i]) * math.log(1 - y_pred)
            else:
                scores = [sum(X[i][j] * self.weights[c][j] for j in range(len(X[i]))) + self.biases[c]
                         for c in range(self.num_classes)]
                probs = self.softmax(scores)
                loss -= math.log(max(probs[y[i]], epsilon))
        
        # Add regularization term
        reg_term = sum(sum(w * w for w in class_weights) for class_weights in self.weights)
        loss = loss/len(X) + self.lambda_reg * reg_term
        return loss

    def predict(self, X):
        """Make predictions for input data"""
        if not X:
            raise ValueError("Empty prediction data")
            
        predictions = []
        for x in X:
            if self.num_classes == 2:
                z = sum(x[j] * self.weights[0][j] for j in range(len(x))) + self.biases[0]
                predictions.append(1 if self.sigmoid(z) >= 0.5 else 0)
            else:
                scores = [sum(x[j] * self.weights[c][j] for j in range(len(x))) + self.biases[c]
                         for c in range(self.num_classes)]
                predictions.append(scores.index(max(scores)))
        return predictions

def evaluate(y_true, y_pred, num_classes=2):
    """Compute evaluation metrics for both binary and multiclass classification"""
    if not y_true or not y_pred:
        raise ValueError("Empty evaluation data")
    if len(y_true) != len(y_pred):
        raise ValueError("Predictions and true labels must have the same length")
        
    if num_classes == 2:
        tp = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 1 and yp == 1)
        tn = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 0 and yp == 0)
        fp = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 0 and yp == 1)
        fn = sum(1 for yt, yp in zip(y_true, y_pred) if yt == 1 and yp == 0)
        
        accuracy = (tp + tn) / len(y_true)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        return accuracy, precision, recall, f1
    else:
        confusion_matrix = [[0] * num_classes for _ in range(num_classes)]
        for yt, yp in zip(y_true, y_pred):
            confusion_matrix[yt][yp] += 1
        
        accuracy = sum(confusion_matrix[i][i] for i in range(num_classes)) / len(y_true)
        
        precision = []
        recall = []
        for c in range(num_classes):
            true_pos = confusion_matrix[c][c]
            false_pos = sum(confusion_matrix[i][c] for i in range(num_classes)) - true_pos
            false_neg = sum(confusion_matrix[c][i] for i in range(num_classes)) - true_pos
            
            p = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
            r = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
            precision.append(p)
            recall.append(r)
        
        macro_precision = sum(precision) / num_classes
        macro_recall = sum(recall) / num_classes
        macro_f1 = 2 * (macro_precision * macro_recall) / (macro_precision + macro_recall) \
                   if (macro_precision + macro_recall) > 0 else 0
        
        return accuracy, macro_precision, macro_recall, macro_f1

def run_with_cross_validation(X, y, num_features, num_classes, lambda_values, max_train_time=300):
    """Run cross-validation with time limit and convergence detection"""
    import time
    
    k_folds = 5
    best_lambda = None
    best_val_f1 = -1
    convergence_threshold = 0.001  # Consider converged if improvement less than this
    
    start_time = time.time()
    
    for lambda_reg in lambda_values:
        fold_f1s = []
        
        # K-fold cross validation
        fold_size = len(X) // k_folds
        for k in range(k_folds):
            if time.time() - start_time > max_train_time:
                print("\nReached maximum training time limit")
                return best_lambda
                
            start_idx = k * fold_size
            end_idx = start_idx + fold_size
            
            X_train = X[:start_idx] + X[end_idx:]
            y_train = y[:start_idx] + y[end_idx:]
            X_val = X[start_idx:end_idx]
            y_val = y[start_idx:end_idx]
            
            model = LogisticRegression(
                num_features=num_features,
                num_classes=num_classes,
                lambda_reg=lambda_reg,
                num_epochs=20,
                batch_size=32  # Smaller batch size for faster iterations
            )
            
            model.train(X_train, y_train)
            predictions = model.predict(X_val)
            _, _, _, f1 = evaluate(y_val, predictions, num_classes)
            fold_f1s.append(f1)
        
        avg_f1 = sum(fold_f1s) / len(fold_f1s)
        print(f"\nλ={lambda_reg}: Average F1={avg_f1:.4f}")
        
        # Check for improvement
        if avg_f1 > best_val_f1 + convergence_threshold:
            best_val_f1 = avg_f1
            best_lambda = lambda_reg
        elif avg_f1 >= best_val_f1 - convergence_threshold:
            # If we're within threshold of best, prefer smaller lambda
            if best_lambda is None or lambda_reg < best_lambda:
                best_lambda = lambda_reg
                best_val_f1 = avg_f1
    
    print(f"\nBest lambda value from {k_folds}-fold cross-validation: {best_lambda}")
    return best_lambda

def process_dataset(texts, vectorizer=None):
    """Process text data through preprocessing and feature extraction"""
    # Preprocess texts
    processed_texts = [preprocess_text(text) for text in texts]
    tokenized_texts = [tokenize(text) for text in processed_texts]
    
    # Create or use existing vectorizer
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        vectorizer.fit(tokenized_texts)
    
    # Transform texts to feature vectors
    X = vectorizer.transform(tokenized_texts)
    
    return X, vectorizer

def main():
    # Part A: Binary Classification (SMS Spam)
    print("\n=== Part A: SMS Spam Classification ===")
    
    try:
        # Load SMS data
        (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels) = \
            load_and_split_data("SMSSpamCollection", is_sms=True)
        
        # Process training data
        X_train, vectorizer = process_dataset(train_texts)
        
        # Process validation and test data using the same vectorizer
        X_val, _ = process_dataset(val_texts, vectorizer)
        X_test, _ = process_dataset(test_texts, vectorizer)
        
        # Cross-validation for lambda selection with 5-minute time limit
        lambda_values = [0.001, 0.01, 0.1, 1.0]
        best_lambda = run_with_cross_validation(
            X_train, train_labels, 
            num_features=len(vectorizer.vocabulary),
            num_classes=2,
            lambda_values=lambda_values,
            max_train_time=300  # 5 minutes maximum
        )
        
        # Train final model with best lambda
        final_model = LogisticRegression(
            num_features=len(vectorizer.vocabulary),
            num_classes=2,
            lambda_reg=best_lambda,
            num_epochs=20,  # Reduced epochs
            batch_size=32   # Smaller batch size
        )
        
        final_model.train(X_train, train_labels)
        
        # Evaluate on test set
        test_predictions = final_model.predict(X_test)
        accuracy, precision, recall, f1 = evaluate(test_labels, test_predictions, num_classes=2)
        
        print("\nFinal SMS Classification Results:")
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test Precision: {precision:.4f}")
        print(f"Test Recall: {recall:.4f}")
        print(f"Test F1: {f1:.4f}")
        
        # Part B: Multi-class Classification (Author Attribution)
        print("\n=== Part B: Author Attribution ===")
        
        # Load author data
        (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels) = \
            load_and_split_data("books.txt", is_sms=False)
        
        # Process training data
        X_train, vectorizer = process_dataset(train_texts)
        
        # Process validation and test data using the same vectorizer
        X_val, _ = process_dataset(val_texts, vectorizer)
        X_test, _ = process_dataset(test_texts, vectorizer)
        
        # Cross-validation for lambda selection with 5-minute time limit
        best_lambda = run_with_cross_validation(
            X_train, train_labels,
            num_features=len(vectorizer.vocabulary),
            num_classes=3,
            lambda_values=lambda_values,
            max_train_time=300
        )
        
        # Train final model with best lambda
        final_model = LogisticRegression(
            num_features=len(vectorizer.vocabulary),
            num_classes=3,
            learning_rate=0.2,
            lambda_reg=best_lambda,
            num_epochs=20,  # Reduced epochs
            batch_size=32   # Smaller batch size
        )
        
        final_model.train(X_train, train_labels)
        
        # Evaluate on test set
        test_predictions = final_model.predict(X_test)
        accuracy, macro_precision, macro_recall, macro_f1 = evaluate(
            test_labels, test_predictions, num_classes=3
        )
        
        print("\nFinal Author Attribution Results:")
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test Macro-Precision: {macro_precision:.4f}")
        print(f"Test Macro-Recall: {macro_recall:.4f}")
        print(f"Test Macro-F1: {macro_f1:.4f}")
        
    except Exception as e:
        print(f"\nError occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()


=== Part A: SMS Spam Classification ===
Reading data from SMSSpamCollection...

Label Distribution:
Label 0: 4827 (86.60%)
Label 1: 747 (13.40%)

Training model:
Epoch	Loss
0	0.400881
1	0.381376
2	0.370342
3	0.360767
4	0.351853
5	0.343650
6	0.336129
7	0.329191
8	0.322815
9	0.316977
10	0.311522
11	0.306603
12	0.301992
13	0.297788
14	0.293920
15	0.290357
16	0.287081
17	0.284031
18	0.281237
19	0.278658

Training model:
Epoch	Loss
0	0.410443
1	0.391494
2	0.379805
3	0.369559
4	0.360140
5	0.351446
6	0.343471
7	0.336136
8	0.329373
9	0.323154
10	0.317463
11	0.312205
12	0.307399
13	0.302967
14	0.298888
15	0.295157
16	0.291667
17	0.288487
18	0.285563
19	0.282833

Training model:
Epoch	Loss
0	0.409711
1	0.389248
2	0.377944
3	0.368101
4	0.359042
5	0.350722
6	0.342988
7	0.335897
8	0.329375
9	0.323353
10	0.317803
11	0.312721
12	0.308035
13	0.303728
14	0.299751
15	0.296055
16	0.292677
17	0.289539
18	0.286658
19	0.284013

Training model:
Epoch	Loss
0	0.408124
1	0.387195
2	0.375940
3	0.365847
4	0.3566

TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'