In [10]:
import math
from collections import Counter
import random

In [13]:
import math
import random
from collections import Counter

def load_and_split_data(file_path, train_ratio=0.7, val_ratio=0.15):
    """
    Loads labelled text from a tab-separated file and splits it into
    training, validation, and test sets.

    Each usable line has the form "label<TAB>text". Blank lines and lines
    without a tab (after surrounding whitespace is stripped) are discarded
    before counting, shuffling and splitting, so they can neither break the
    parser nor skew the split sizes.

    Args:
        file_path (str): Path to the data file
        train_ratio (float): Proportion of data for training
        val_ratio (float): Proportion of data for validation
    Returns:
        tuple: (train_texts, train_labels, val_texts, val_labels,
            test_texts, test_labels). Labels are 1 for 'spam'
            (case-insensitive) and 0 for every other label.
    """
    print(f"Reading data from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as f:
        # Strip BEFORE testing for the tab: a line made of a label, a tab and
        # nothing else loses its tab when stripped and could not be unpacked
        # into (label, text) further down.
        stripped_lines = (line.strip() for line in f)
        lines = [line for line in stripped_lines if '\t' in line]
    print(f"Total number of samples loaded: {len(lines)}")

    # Count label distribution (labels appear in the order first seen)
    label_counts = Counter(line.split('\t', 1)[0] for line in lines)
    print("\nLabel Distribution:")
    for label, count in label_counts.items():
        print(f"{label}: {count} ({count/len(lines)*100:.2f}%)")

    # Fixed seed for a reproducible shuffle. This reseeds the module-level
    # generator, so later uses of `random` in the same process are affected.
    random.seed(42)
    random.shuffle(lines)

    # Calculate split points
    train_end = int(len(lines) * train_ratio)
    val_end = int(len(lines) * (train_ratio + val_ratio))

    def split_data(data):
        """Separate "label<TAB>text" lines into parallel text and label lists."""
        sentences = []
        labels = []
        for line in data:
            label, text = line.split('\t', 1)
            # Convert 'spam' to 1 and anything else (e.g. 'ham') to 0
            labels.append(1 if label.lower() == 'spam' else 0)
            sentences.append(text)
        return sentences, labels

    return (split_data(lines[:train_end])
            + split_data(lines[train_end:val_end])
            + split_data(lines[val_end:]))

def preprocess_text(text):
    """
    Normalises raw text before tokenization.

    The text is lowercased, whitespace-separated words that start with
    'http:', 'https:' or 'www.' are dropped, and every character that is
    neither a letter nor whitespace is then deleted.

    Args:
        text (str): Input text to clean
    Returns:
        str: Cleaned text
    """
    url_prefixes = ('http:', 'https:', 'www.')
    kept_words = [
        token for token in text.lower().split()
        if not token.startswith(url_prefixes)
    ]
    without_urls = ' '.join(kept_words)

    # Deleted characters are not replaced by a space, so "a,b" becomes "ab".
    return ''.join(
        symbol for symbol in without_urls
        if symbol.isalpha() or symbol.isspace()
    )

def tokenize(text):
    """
    Splits cleaned text into tokens, filters them, and stems what is left.

    Stopwords and single-character words are discarded first; stemming is
    applied afterwards, so a resulting stem may itself be very short.

    Args:
        text (str): Input text to tokenize
    Returns:
        list: List of processed tokens
    """
    ignored = frozenset((
        "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
        "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
        "to", "was", "were", "will", "with", "this", "but", "they",
        "have", "had", "what", "when", "where", "who", "which", "why", "how",
    ))
    # Suffixes are tried in this order and only the first match is removed.
    suffixes = ('ing', 'ed', 's')

    def strip_suffix(token):
        """Remove one known suffix from tokens of four or more characters"""
        if len(token) >= 4:
            for suffix in suffixes:
                if token.endswith(suffix):
                    return token[:-len(suffix)]
        return token

    tokens = []
    for candidate in text.split():
        if len(candidate) <= 1 or candidate in ignored:
            continue
        tokens.append(strip_suffix(candidate))
    return tokens

class TfidfVectorizer:
    """
    Turns tokenized documents into TF-IDF feature vectors.

    fit() fills in `vocabulary` (set of known tokens), `vocab_index`
    (token -> column, columns follow sorted token order) and `idf`
    (token -> smoothed inverse document frequency).
    """
    def __init__(self):
        self.vocabulary = set()
        self.idf = {}
        self.vocab_index = {}

    def fit(self, documents):
        """
        Learns the vocabulary and the IDF score of every token
        Args:
            documents (list): List of tokenized documents
        """
        # Tokens are added to whatever vocabulary is already stored
        self.vocabulary.update(token for doc in documents for token in doc)

        self.vocab_index = {
            token: column
            for column, token in enumerate(sorted(self.vocabulary))
        }

        # Number of documents each token occurs in (counted once per document)
        containing_docs = Counter()
        for doc in documents:
            containing_docs.update(set(doc))

        # Smoothed IDF: log((N + 1) / (df + 1)) + 1
        total_docs = len(documents)
        self.idf = {
            token: math.log((total_docs + 1)/(containing_docs[token] + 1)) + 1
            for token in self.vocabulary
        }

    def transform(self, documents):
        """
        Maps each document to a TF-IDF vector over the fitted vocabulary
        Args:
            documents (list): List of tokenized documents
        Returns:
            list: List of TF-IDF feature vectors
        """
        rows = []
        width = len(self.vocabulary)
        for doc in documents:
            counts = Counter(doc)
            # An empty document uses a length of 1 so the division is defined
            length = len(doc) or 1

            row = [0.0] * width
            for token, occurrences in counts.items():
                column = self.vocab_index.get(token)
                if column is None:
                    # Token was not seen during fit()
                    continue
                term_frequency = occurrences / length
                row[column] = term_frequency * self.idf.get(token, 0)

            rows.append(row)
        return rows

class LogisticRegression:
    def __init__(self, learning_rate=0.01, lambda_reg=0.1, num_epochs=100, 
                 early_stop_threshold=1e-4, batch_size=32):
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.num_epochs = num_epochs
        self.early_stop_threshold = early_stop_threshold
        self.batch_size = batch_size
        self.weights = None
        self.bias = 0
        self.training_history = []
    
    def sigmoid(self, z):
        """Compute sigmoid function"""
        # Clip z to prevent overflow
        z = min(max(z, -100), 100)
        return 1 / (1 + math.exp(-z))
    
    def compute_loss(self, X, y, indices):
        """Compute binary cross-entropy loss with L2 regularization"""
        loss = 0
        for i in indices:
            z = sum(X[i][j] * self.weights[j] for j in range(len(self.weights))) + self.bias
            pred = self.sigmoid(z)
            # Add small epsilon to avoid log(0)
            loss -= (y[i] * math.log(pred + 1e-15) + (1 - y[i]) * math.log(1 - pred + 1e-15))
        
        # Add L2 regularization term
        l2_term = self.lambda_reg * sum(w * w for w in self.weights)
        return (loss / len(indices)) + l2_term
    
    def compute_class_weights(self, y):
        """
        Compute class weights to handle class imbalance
        Args:
            y (list): Labels
        Returns:
            dict: Class weights
        """
        counts = Counter(y)
        total = len(y)
        weights = {
            0: total / (2 * counts[0]) if counts[0] > 0 else 1,
            1: total / (2 * counts[1]) if counts[1] > 0 else 1
        }
        print(f"Class weights: {weights}")
        return weights
    
    def train(self, X, y, method='sgd'):
        """
        Train the model using either SGD or mini-batch gradient descent
        Args:
            X (list): Feature vectors
            y (list): Labels
            method (str): 'sgd' or 'minibatch'
        """
        if not self.weights:
            # Initialize weights with small random values
            self.weights = [random.uniform(-0.1, 0.1) for _ in range(len(X[0]))]
        
        # Compute class weights for balanced training
        class_weights = self.compute_class_weights(y)
        
        print(f"\nTraining with {method.upper()}:")
        print("Epoch\tLoss\t\tΔLoss")
        
        n_samples = len(y)
        prev_loss = float('inf')
        no_improvement_count = 0
        
        for epoch in range(self.num_epochs):
            indices = list(range(n_samples))
            random.shuffle(indices)
            
            if method == 'sgd':
                batch_indices = [[i] for i in indices]
            else:
                batch_indices = [indices[i:i + self.batch_size] 
                               for i in range(0, len(indices), self.batch_size)]
            
            epoch_loss = 0
            for batch in batch_indices:
                weight_gradients = [0] * len(self.weights)
                bias_gradient = 0
                
                for i in batch:
                    # Forward pass
                    z = sum(X[i][j] * self.weights[j] for j in range(len(self.weights))) + self.bias
                    pred = self.sigmoid(z)
                    
                    # Apply class weights to error
                    sample_weight = class_weights[y[i]]
                    error = sample_weight * (pred - y[i])
                    
                    # Accumulate gradients
                    for j in range(len(self.weights)):
                        weight_gradients[j] += error * X[i][j]
                    bias_gradient += error
                
                # Apply updates with regularization
                batch_size = len(batch)
                for j in range(len(self.weights)):
                    reg_gradient = 2 * self.lambda_reg * self.weights[j]
                    self.weights[j] -= self.learning_rate * (weight_gradients[j]/batch_size + reg_gradient)
                self.bias -= self.learning_rate * (bias_gradient/batch_size)
                
                epoch_loss += self.compute_loss(X, y, batch)
            
            avg_loss = epoch_loss / len(batch_indices)
            loss_change = prev_loss - avg_loss
            self.training_history.append(avg_loss)
            
            if epoch % 5 == 0:
                print(f"{epoch}\t{avg_loss:.6f}\t{loss_change:.6f}")
            
            # Early stopping with patience
            if abs(loss_change) < self.early_stop_threshold:
                no_improvement_count += 1
                if no_improvement_count >= 3:  # Wait for 3 epochs of no improvement
                    print(f"\nEarly stopping at epoch {epoch}")
                    break
            else:
                no_improvement_count = 0
            
            prev_loss = avg_loss
    
    def predict(self, X):
        """Make predictions on new data"""
        predictions = []
        for x in X:
            z = sum(x[j] * self.weights[j] for j in range(len(self.weights))) + self.bias
            prob = self.sigmoid(z)
            predictions.append(1 if prob >= 0.5 else 0)
        return predictions

def evaluate(y_true, y_pred):
    """
    Compute binary classification metrics, treating label 1 as positive.

    Args:
        y_true (list): Ground-truth labels (0 or 1)
        y_pred (list): Predicted labels (0 or 1)
    Returns:
        tuple: (accuracy, precision, recall, f1). A metric whose denominator
            is zero is reported as 0; for empty input that is all four.
    Raises:
        ValueError: If y_true and y_pred differ in length
    """
    if len(y_true) != len(y_pred):
        # zip() would silently drop the surplus while accuracy still divided
        # by len(y_true), producing a wrong score instead of an error.
        raise ValueError(
            f"Length mismatch: y_true ({len(y_true)}) != y_pred ({len(y_pred)})")

    # Fill the confusion matrix in a single pass over the pairs
    tp = tn = fp = fn = 0
    for truth, guess in zip(y_true, y_pred):
        if truth == 1 and guess == 1:
            tp += 1
        elif truth == 0 and guess == 0:
            tn += 1
        elif truth == 0 and guess == 1:
            fp += 1
        elif truth == 1 and guess == 0:
            fn += 1

    total = len(y_true)
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1

def perform_cross_validation(X, y, lambda_values, k=5):
    """
    Perform k-fold cross-validation to find the best regularization parameter.

    The samples are cut into k contiguous folds in the order given (no
    shuffling). Fold sizes differ by at most one, so every sample is used for
    validation exactly once. For each lambda a fresh model is trained on the
    other k-1 folds and scored by F1 on the held-out fold; the lambda with the
    highest mean F1 wins, the earliest one on ties.

    Args:
        X (list): Feature vectors
        y (list): Labels
        lambda_values (list): List of lambda values to try
        k (int): Number of folds
    Returns:
        float: Best lambda value, or None if lambda_values is empty
    Raises:
        ValueError: If X and y differ in length, k is smaller than 2, or
            there are fewer samples than folds
    """
    n_samples = len(y)
    if len(X) != n_samples:
        raise ValueError(f"Length mismatch: X ({len(X)}) != y ({n_samples})")
    if k < 2:
        raise ValueError(f"Need at least 2 folds, got {k}")
    if n_samples < k:
        # A fold size of zero would leave nothing to validate on
        raise ValueError(f"Need at least {k} samples for {k} folds, got {n_samples}")

    print("\nPerforming cross-validation...")

    # Fold boundaries are the same for every lambda, so compute them once.
    # The first `remainder` folds take one extra sample each.
    base_size, remainder = divmod(n_samples, k)
    fold_bounds = []
    start = 0
    for fold in range(k):
        stop = start + base_size + (1 if fold < remainder else 0)
        fold_bounds.append((start, stop))
        start = stop

    best_lambda = None
    best_score = -float('inf')

    for lambda_val in lambda_values:
        print(f"\nTrying lambda = {lambda_val}")
        scores = []

        for val_start, val_end in fold_bounds:
            # Hold out one fold, train on everything else
            X_val = X[val_start:val_end]
            y_val = y[val_start:val_end]
            X_train = X[:val_start] + X[val_end:]
            y_train = y[:val_start] + y[val_end:]

            # Train model
            model = LogisticRegression(lambda_reg=lambda_val, num_epochs=50)
            model.train(X_train, y_train, method='minibatch')

            # Evaluate: only the F1-score is used for model selection
            _, _, _, f1 = evaluate(y_val, model.predict(X_val))
            scores.append(f1)

        avg_score = sum(scores) / len(scores)
        print(f"Average F1-score: {avg_score:.4f}")

        if avg_score > best_score:
            best_score = avg_score
            best_lambda = lambda_val

    print(f"\nBest lambda value: {best_lambda} (F1-score: {best_score:.4f})")
    return best_lambda

def main():
    """Run the spam pipeline: load, vectorize, then train and score both optimizers."""
    # Input file format: one "label<TAB>text" sample per line
    file_path = "SMSSpamCollection"  # Replace with your data file path

    # Step 1: Load and split data
    (train_texts, train_labels,
     val_texts, val_labels,
     test_texts, test_labels) = load_and_split_data(file_path)

    # Step 2: Preprocess and tokenize texts
    print("\nPreprocessing and tokenizing texts...")

    def to_tokens(texts):
        """Clean and tokenize every text of one split"""
        return [tokenize(preprocess_text(text)) for text in texts]

    train_tokens = to_tokens(train_texts)
    val_tokens = to_tokens(val_texts)
    test_tokens = to_tokens(test_texts)

    # Step 3: Create TF-IDF features (vocabulary and IDF come from training data only)
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_tokens)

    X_train = vectorizer.transform(train_tokens)
    X_val = vectorizer.transform(val_tokens)
    X_test = vectorizer.transform(test_tokens)

    # Both optimizers are trained with the same hyperparameters
    hyperparameters = dict(
        learning_rate=0.1,
        lambda_reg=0.01,
        num_epochs=35,
        early_stop_threshold=1e-4,
        batch_size=32,
    )

    # Step 4: Train and evaluate with both SGD and mini-batch
    for method in ['sgd', 'minibatch']:
        print(f"\nStep 4: Training with {method.upper()}...")
        model = LogisticRegression(**hyperparameters)
        model.train(X_train, train_labels, method=method)

        # Step 5: Score the model on the validation set, then the test set
        print(f"\nStep 5: Evaluating {method.upper()} model...")
        reports = (
            (f"\n{method.upper()} Validation Results:", X_val, val_labels),
            (f"\n{method.upper()} Test Results:", X_test, test_labels),
        )
        for heading, features, labels in reports:
            predictions = model.predict(features)
            accuracy, precision, recall, f1 = evaluate(labels, predictions)
            print(heading)
            print(f"Accuracy:  {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall:    {recall:.4f}")
            print(f"F1-score:  {f1:.4f}")

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 

In [None]:
import math
import random
from collections import Counter

def load_and_split_data(file_path, train_ratio=0.7, val_ratio=0.15, task='binary',
                        authors=None):
    """Load labelled text from a file and split it into train/validation/test sets.

    Two line formats are supported, selected by `task`:

    * 'binary': "label<TAB>text". The label becomes 1 when it equals 'spam'
      (case-insensitive) and 0 otherwise.
    * any other value: a line that starts with one of the known author names,
      followed by the text. The label is the author name string.

    Blank lines and lines that do not match the selected format are skipped
    before counting, shuffling and splitting.

    Args:
        file_path (str): Path to the data file
        train_ratio (float): Proportion of data for training
        val_ratio (float): Proportion of data for validation
        task (str): 'binary' for tab-separated data; any other value selects
            the author-prefixed format
        authors (iterable of str, optional): Author names to look for in the
            author-prefixed format. Defaults to the three built-in authors.
            Names are tried in order and the first matching prefix wins.
    Returns:
        tuple: (train_texts, train_labels, val_texts, val_labels,
            test_texts, test_labels)
    Raises:
        ValueError: If the file contains no usable lines
    """
    if authors is None:
        authors = ('Jane Austen', 'Arthur Conan Doyle', 'Fyodor Dostoyevsky')
    else:
        authors = tuple(authors)

    print(f"Reading data from {file_path}...")
    samples = []              # (text, label) pairs in file order
    label_counts = Counter()  # labels as written in the file, for the report
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:  # Skip empty lines
                continue

            if task == 'binary':
                # Look for the tab AFTER stripping: a label followed only by a
                # tab has no text and must not be counted as a sample.
                if '\t' not in line:
                    continue
                raw_label, text = line.split('\t', 1)
                label = 1 if raw_label.lower() == 'spam' else 0
            else:
                author = next((name for name in authors if line.startswith(name)), None)
                if author is None:
                    continue
                text = line[len(author):].strip()
                if not text:  # Ensure we have text after the author name
                    continue
                raw_label = label = author

            label_counts[raw_label] += 1
            samples.append((text, label))

    if not samples:
        raise ValueError(f"No valid data found in {file_path}")

    print(f"Total number of samples loaded: {len(samples)}")

    print("\nLabel Distribution:")
    for label, count in label_counts.items():
        print(f"{label}: {count} ({count/len(samples)*100:.2f}%)")

    # Fixed seed for a reproducible shuffle. This reseeds the module-level
    # generator, so later uses of `random` in the same process are affected.
    random.seed(42)
    random.shuffle(samples)

    train_end = int(len(samples) * train_ratio)
    val_end = int(len(samples) * (train_ratio + val_ratio))

    def split_data(chunk):
        """Turn (text, label) pairs into parallel text and label lists."""
        return [text for text, _ in chunk], [label for _, label in chunk]

    return (split_data(samples[:train_end])
            + split_data(samples[train_end:val_end])
            + split_data(samples[val_end:]))
def preprocess_text(text):
    """Lowercase the text, drop URL-like words and keep only letters.

    Words starting with 'http:', 'https:' or 'www.' are removed. Every other
    word is reduced to its alphabetic characters (non-letters are deleted,
    not replaced by a space), words left empty are dropped, and the remaining
    words are joined with single spaces.
    """
    url_prefixes = ('http:', 'https:', 'www.')
    cleaned_words = []
    for token in text.lower().split():
        if token.startswith(url_prefixes):
            continue
        letters = ''.join(filter(str.isalpha, token))
        if letters:
            cleaned_words.append(letters)
    return ' '.join(cleaned_words)

def tokenize(text):
    """Split cleaned text into tokens, drop stopwords, then stem the rest.

    Stopwords and single-character words are discarded before stemming, so a
    resulting stem may itself be very short.
    """
    ignored = frozenset((
        "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
        "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
        "to", "was", "were", "will", "with", "this", "but", "they",
        "have", "had", "what", "when", "where", "who", "which", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other", "some",
        "such", "than", "too", "very", "can", "into", "if", "or", "i", "my",
    ))
    # Suffixes are tried in this order and only the first match is removed.
    suffixes = ('ing', 'ed', 'ly', 's')

    def strip_suffix(token):
        """Remove one known suffix from tokens of four or more characters"""
        if len(token) >= 4:
            for suffix in suffixes:
                if token.endswith(suffix):
                    return token[:-len(suffix)]
        return token

    tokens = []
    for candidate in text.split():
        if len(candidate) <= 1 or candidate in ignored:
            continue
        tokens.append(strip_suffix(candidate))
    return tokens

class TfidfVectorizer:
    """TF-IDF vectorizer with a minimum document frequency and L2-normalised rows"""
    def __init__(self):
        self.vocabulary = set()
        self.idf = {}
        self.vocab_index = {}
        self.min_df = 2  # Tokens seen in fewer documents are left out of the vocabulary

    def fit(self, documents):
        """Select the vocabulary and compute a smoothed IDF score per token.

        Args:
            documents (list): List of tokenized documents
        """
        # Number of documents each token occurs in (counted once per document)
        containing_docs = Counter()
        for doc in documents:
            containing_docs.update(set(doc))

        # Keep only tokens that are frequent enough
        self.vocabulary = {
            token for token, seen_in in containing_docs.items()
            if seen_in >= self.min_df
        }

        # Columns follow the sorted token order
        self.vocab_index = {
            token: column
            for column, token in enumerate(sorted(self.vocabulary))
        }

        # Smoothed IDF: log((N + 1) / (df + 1)) + 1
        total_docs = len(documents)
        self.idf = {
            token: math.log((total_docs + 1)/(containing_docs[token] + 1)) + 1
            for token in self.vocabulary
        }

    def transform(self, documents):
        """Map each document to a TF-IDF vector scaled to (almost) unit length.

        Args:
            documents (list): List of tokenized documents
        Returns:
            list: One feature vector per document
        """
        rows = []
        width = len(self.vocabulary)
        for doc in documents:
            counts = Counter(doc)
            # An empty document uses a length of 1 so the division is defined
            length = len(doc) or 1

            row = [0.0] * width
            for token, occurrences in counts.items():
                column = self.vocab_index.get(token)
                if column is not None:
                    term_frequency = occurrences / length
                    row[column] = term_frequency * self.idf.get(token, 0)

            # The epsilon under the root keeps an all-zero row from dividing by zero
            magnitude = math.sqrt(sum(value * value for value in row) + 1e-10)
            rows.append([value / magnitude for value in row])
        return rows
class LogisticRegression:
    """Binary logistic regression trained by gradient descent with momentum.

    Training uses class-balanced sample weights, L2 regularization, gradient
    clipping and patience-based early stopping on the mean epoch loss.
    """
    def __init__(self, learning_rate=0.001, lambda_reg=0.001, num_epochs=100,
                 early_stop_threshold=1e-5, batch_size=64, clip_value=5.0):
        # Optimisation settings
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.num_epochs = num_epochs
        self.early_stop_threshold = early_stop_threshold  # stored; train() does not read it
        self.batch_size = batch_size
        self.clip_value = clip_value
        # Model state; weights are created on the first call to train()
        self.weights = None
        self.bias = 0
        # Mean loss of every completed epoch
        self.training_history = []
        # Momentum coefficient and running update velocities
        self.momentum = 0.9
        self.velocity_w = None
        self.velocity_b = 0

    def _score(self, row):
        """Linear score of one feature vector: dot(row, weights) + bias"""
        return sum(row[j] * w for j, w in enumerate(self.weights)) + self.bias

    def sigmoid(self, z):
        """Logistic function of z, with z clamped to [-100, 100] to avoid overflow"""
        z = min(max(z, -100), 100)
        return 1 / (1 + math.exp(-z))

    def clip_gradient(self, grad):
        """Rescale `grad` so its Euclidean norm does not exceed self.clip_value"""
        norm = math.sqrt(sum(g * g for g in grad))
        if norm > self.clip_value:
            grad = [g * self.clip_value / norm for g in grad]
        return grad

    def compute_loss(self, X, y, indices):
        """Mean binary cross-entropy over `indices` plus the L2 penalty"""
        loss = 0
        for i in indices:
            pred = self.sigmoid(self._score(X[i]))
            # Small epsilon keeps log() away from zero
            loss -= (y[i] * math.log(pred + 1e-15) +
                     (1 - y[i]) * math.log(1 - pred + 1e-15))

        # L2 regularization (not scaled by the number of samples)
        l2_term = self.lambda_reg * sum(w * w for w in self.weights)
        return (loss / len(indices)) + l2_term

    def compute_class_weights(self, y):
        """Compute balanced class weights: total / (n_classes * class_count).

        Returns an empty dict for an empty label list.
        """
        counts = Counter(y)
        total = len(y)
        weights = {
            label: total / (len(counts) * count)
            for label, count in counts.items()
        }
        print(f"Class weights: {weights}")
        return weights

    def train(self, X, y, method='minibatch'):
        """Fit weights and bias by gradient descent with momentum.

        Training stops early after 5 consecutive epochs without a new best
        mean loss, and the best weights seen are then restored. When all
        epochs run, the weights of the last epoch are kept.

        Args:
            X (list): Feature vectors
            y (list): Labels (0 or 1), parallel to X
            method (str): 'sgd' updates after every sample; any other value
                uses mini-batches of self.batch_size
        """
        if not self.weights:
            n_features = len(X[0])
            # Uniform initialization scaled by the number of inputs (Xavier-style)
            scale = math.sqrt(2.0 / (n_features + 1))
            self.weights = [random.uniform(-scale, scale) for _ in range(n_features)]
            self.velocity_w = [0] * n_features
        elif self.velocity_w is None:
            # Weights were supplied from outside: start from zero momentum
            self.velocity_w = [0] * len(self.weights)

        class_weights = self.compute_class_weights(y)
        print(f"\nTraining with {method.upper()}:")
        print("Epoch\tLoss\t\tΔLoss")

        n_samples = len(y)
        n_weights = len(self.weights)
        step = 1 if method == 'sgd' else self.batch_size
        prev_loss = float('inf')
        best_loss = float('inf')
        best_weights = None
        best_bias = None
        no_improvement_count = 0

        for epoch in range(self.num_epochs):
            indices = list(range(n_samples))
            random.shuffle(indices)
            batches = [indices[start:start + step]
                       for start in range(0, n_samples, step)]

            if not batches:  # Nothing to train on
                continue

            epoch_loss = 0
            for batch in batches:
                weight_gradients = [0] * n_weights
                bias_gradient = 0

                for i in batch:
                    row = X[i]
                    pred = self.sigmoid(self._score(row))
                    # Class-weighted prediction error
                    error = class_weights[y[i]] * (pred - y[i])

                    for j in range(n_weights):
                        weight_gradients[j] += error * row[j]
                    bias_gradient += error

                # Clip the summed gradients before averaging over the batch
                weight_gradients = self.clip_gradient(weight_gradients)
                bias_gradient = min(max(bias_gradient, -self.clip_value), self.clip_value)

                # Momentum update; only the weights are regularized
                batch_size = len(batch)
                for j in range(n_weights):
                    reg_gradient = 2 * self.lambda_reg * self.weights[j]
                    grad = weight_gradients[j]/batch_size + reg_gradient
                    self.velocity_w[j] = (self.momentum * self.velocity_w[j] -
                                          self.learning_rate * grad)
                    self.weights[j] += self.velocity_w[j]

                self.velocity_b = (self.momentum * self.velocity_b -
                                   self.learning_rate * (bias_gradient/batch_size))
                self.bias += self.velocity_b

                # Loss of this batch measured after the update
                epoch_loss += self.compute_loss(X, y, batch)

            avg_loss = epoch_loss / len(batches)
            loss_change = prev_loss - avg_loss
            self.training_history.append(avg_loss)

            if epoch % 5 == 0:
                print(f"{epoch}\t{avg_loss:.6f}\t{loss_change:.6f}")

            # Remember the best model seen so far
            if avg_loss < best_loss:
                best_loss = avg_loss
                best_weights = self.weights.copy()
                best_bias = self.bias
                no_improvement_count = 0
            else:
                no_improvement_count += 1

            # Early stopping with patience
            if no_improvement_count >= 5:
                print(f"\nEarly stopping at epoch {epoch}")
                # best_weights stays None when no epoch produced a finite,
                # improving loss; keep the current weights in that case.
                if best_weights is not None:
                    self.weights = best_weights
                    self.bias = best_bias
                break

            prev_loss = avg_loss

    def predict(self, X):
        """Return 1 where the predicted probability is at least 0.5, else 0"""
        return [1 if self.sigmoid(self._score(row)) >= 0.5 else 0 for row in X]
    
class MultiClassLogisticRegression:
    """Multinomial (softmax) logistic regression trained with momentum.

    Labels must be integer class indices in range(n_classes): they are used
    directly to index the list of class probabilities.
    """
    def __init__(self, n_classes, n_features, learning_rate=0.001, lambda_reg=0.001,
                 num_epochs=100, early_stop_threshold=1e-5, batch_size=64):
        self.n_classes = n_classes
        # Optimisation settings
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.num_epochs = num_epochs
        self.early_stop_threshold = early_stop_threshold  # stored; train() does not read it
        self.batch_size = batch_size
        # One weight vector and one bias per class
        self.weights = [[random.uniform(-0.1, 0.1) for _ in range(n_features)]
                        for _ in range(n_classes)]
        self.biases = [0] * n_classes
        # Mean loss of every completed epoch
        self.training_history = []
        # Momentum coefficient and running update velocities
        self.momentum = 0.9
        self.velocity_w = [[0] * n_features for _ in range(n_classes)]
        self.velocity_b = [0] * n_classes

    def _class_scores(self, row):
        """Linear score of one feature vector for every class"""
        return [sum(row[j] * self.weights[c][j] for j in range(len(row))) + self.biases[c]
                for c in range(self.n_classes)]

    def softmax(self, scores):
        """Numerically stable softmax: scores are shifted by their maximum first"""
        peak = max(scores)  # computed once, not once per element
        exp_scores = [math.exp(s - peak) for s in scores]
        total = sum(exp_scores) + 1e-15  # Small epsilon for stability
        return [e / total for e in exp_scores]

    def compute_loss(self, X, y, indices):
        """Mean cross-entropy over `indices` plus the L2 penalty"""
        loss = 0
        for i in indices:
            probs = self.softmax(self._class_scores(X[i]))
            # Negative log-probability of the true class
            loss -= math.log(probs[y[i]] + 1e-15)

        # L2 regularization over all class weight vectors
        l2_term = self.lambda_reg * sum(sum(w * w for w in class_weights)
                                        for class_weights in self.weights)
        return (loss / len(indices)) + l2_term

    def train(self, X, y, method='minibatch'):
        """Fit the model by gradient descent with momentum.

        Training stops early after 5 consecutive epochs without a new best
        mean loss, and the best parameters seen are then restored. When all
        epochs run, the parameters of the last epoch are kept.

        Args:
            X (list): Feature vectors
            y (list): Integer class indices, parallel to X
            method (str): 'sgd' updates after every sample; any other value
                uses mini-batches of self.batch_size
        """
        n_samples = len(y)
        print(f"\nTraining with {method.upper()}:")
        print("Epoch\tLoss\t\tΔLoss")

        if n_samples == 0:  # Nothing to train on
            return

        n_features = len(self.weights[0])
        # Honour the requested method instead of always using mini-batches
        step = 1 if method == 'sgd' else self.batch_size
        prev_loss = float('inf')
        best_loss = float('inf')
        best_weights = None
        best_biases = None
        no_improvement_count = 0

        for epoch in range(self.num_epochs):
            indices = list(range(n_samples))
            random.shuffle(indices)
            batches = [indices[start:start + step]
                       for start in range(0, n_samples, step)]

            epoch_loss = 0
            for batch in batches:
                # Gradients are summed over the batch and averaged on update
                weight_gradients = [[0] * n_features for _ in range(self.n_classes)]
                bias_gradients = [0] * self.n_classes

                for i in batch:
                    row = X[i]
                    probs = self.softmax(self._class_scores(row))

                    true_class = y[i]
                    for c in range(self.n_classes):
                        # Softmax gradient: probability minus the one-hot target
                        error = probs[c]
                        if c == true_class:
                            error -= 1

                        for j in range(len(row)):
                            weight_gradients[c][j] += error * row[j]
                        bias_gradients[c] += error

                # Momentum update; only the weights are regularized
                batch_size = len(batch)
                for c in range(self.n_classes):
                    for j in range(len(self.weights[c])):
                        reg_gradient = 2 * self.lambda_reg * self.weights[c][j]
                        grad = weight_gradients[c][j]/batch_size + reg_gradient
                        self.velocity_w[c][j] = (self.momentum * self.velocity_w[c][j] -
                                                 self.learning_rate * grad)
                        self.weights[c][j] += self.velocity_w[c][j]

                    self.velocity_b[c] = (self.momentum * self.velocity_b[c] -
                                          self.learning_rate * (bias_gradients[c]/batch_size))
                    self.biases[c] += self.velocity_b[c]

                batch_loss = self.compute_loss(X, y, batch)
                if not math.isnan(batch_loss):  # Skip invalid losses
                    epoch_loss += batch_loss

            avg_loss = epoch_loss / len(batches)
            loss_change = prev_loss - avg_loss
            self.training_history.append(avg_loss)

            if epoch % 5 == 0:
                print(f"{epoch}\t{avg_loss:.6f}\t{loss_change:.6f}")

            # Remember the best model seen so far
            if avg_loss < best_loss:
                best_loss = avg_loss
                best_weights = [w[:] for w in self.weights]
                best_biases = self.biases[:]
                no_improvement_count = 0
            else:
                no_improvement_count += 1

            # Early stopping with patience
            if no_improvement_count >= 5:
                print(f"\nEarly stopping at epoch {epoch}")
                # best_weights stays None when no epoch produced a finite,
                # improving loss; keep the current parameters in that case.
                if best_weights is not None:
                    self.weights = best_weights
                    self.biases = best_biases
                break

            prev_loss = avg_loss

    def predict(self, X):
        """Return the index of the highest-scoring class for each row (first one on ties)"""
        predictions = []
        for x in X:
            scores = self._class_scores(x)
            predictions.append(scores.index(max(scores)))
        return predictions

def evaluate_multiclass(y_true, y_pred, classes):
    """Compute accuracy and per-class precision, recall and F1.

    Args:
        y_true: Sequence of true class indices, each in ``range(len(classes))``.
        y_pred: Sequence of predicted class indices, same length as ``y_true``.
        classes: Sequence of class names; the name at position ``i`` labels
            class index ``i`` in the returned per-class metrics.

    Returns:
        dict: ``{'accuracy': float, 'per_class': {class_name: {'precision':
        float, 'recall': float, 'f1': float}}}``. A metric whose denominator
        is zero (class never predicted, or never present) is reported as 0.0.

    Raises:
        ValueError: If either label sequence is empty, the lengths differ,
            fewer than two classes are given, or a label falls outside
            ``range(len(classes))``.
    """
    if not y_true or not y_pred:
        raise ValueError("Empty prediction or ground truth arrays")

    if len(y_true) != len(y_pred):
        raise ValueError(f"Length mismatch: y_true ({len(y_true)}) != y_pred ({len(y_pred)})")

    n_classes = len(classes)
    if n_classes < 2:
        raise ValueError(f"Need at least 2 classes, got {n_classes}")

    # confusion[t][p] counts samples with true class t and predicted class p.
    confusion = [[0] * n_classes for _ in range(n_classes)]
    for yt, yp in zip(y_true, y_pred):
        if not (0 <= yt < n_classes) or not (0 <= yp < n_classes):
            raise ValueError(f"Invalid class indices: true={yt}, pred={yp}")
        confusion[yt][yp] += 1

    # The emptiness check above guarantees at least one sample, so the
    # division below cannot hit zero.
    accuracy = sum(confusion[i][i] for i in range(n_classes)) / len(y_true)
    metrics = {'accuracy': accuracy, 'per_class': {}}

    # Row sums are the per-class ground-truth counts (tp + fn); column sums
    # are the per-class prediction counts (tp + fp). Computed once up front.
    actual_totals = [sum(row) for row in confusion]
    predicted_totals = [sum(col) for col in zip(*confusion)]

    for i, class_name in enumerate(classes):
        tp = confusion[i][i]
        predicted = predicted_totals[i]
        actual = actual_totals[i]

        # Guard zero denominators explicitly instead of adding an epsilon,
        # so exact results (e.g. a perfect 1.0) are not skewed.
        precision = tp / predicted if predicted else 0.0
        recall = tp / actual if actual else 0.0
        pr_sum = precision + recall
        f1 = 2 * precision * recall / pr_sum if pr_sum else 0.0

        metrics['per_class'][class_name] = {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    return metrics

def main():
    """Run the binary SMS-spam experiment once per training method.

    Loads and splits "SMSSpamCollection", tokenizes every split, fits the
    TF-IDF vectorizer on the training tokens only, and then, for each of
    'sgd' and 'minibatch', trains a fresh LogisticRegression and prints
    accuracy, precision, recall and F1 on the validation and test splits.
    """
    splits = load_and_split_data("SMSSpamCollection")
    train_texts, train_labels, val_texts, val_labels, test_texts, test_labels = splits

    print("\nPreprocessing and tokenizing texts...")
    train_tokens, val_tokens, test_tokens = [
        [tokenize(preprocess_text(doc)) for doc in docs]
        for docs in (train_texts, val_texts, test_texts)
    ]

    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_tokens)
    X_train, X_val, X_test = [
        vectorizer.transform(tokens)
        for tokens in (train_tokens, val_tokens, test_tokens)
    ]

    def report(header, y_true, y_pred):
        # Score one split and print the four metrics under the given header.
        accuracy, precision, recall, f1 = evaluate(y_true, y_pred)
        print(header)
        print(f"Accuracy:  {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1-score:  {f1:.4f}")

    for method in ('sgd', 'minibatch'):
        print(f"\nTraining with {method.upper()}...")
        hyperparams = {
            'learning_rate': 0.001,
            'lambda_reg': 0.001,
            'num_epochs': 100,
            'early_stop_threshold': 1e-5,
            'batch_size': 64,
        }
        model = LogisticRegression(**hyperparams)
        model.train(X_train, train_labels, method=method)

        # Validation split first, then the held-out test split.
        val_pred = model.predict(X_val)
        report(f"\n{method.upper()} Validation Results:", val_labels, val_pred)

        test_pred = model.predict(X_test)
        report(f"\n{method.upper()} Test Results:", test_labels, test_pred)

def main_multiclass():
    """Run the multi-class author-classification experiment on "books.txt".

    Steps: load and split the data, map the training labels to integer
    indices (sorted label order), tokenize, build TF-IDF features fitted on
    the training tokens, train MultiClassLogisticRegression with the
    'minibatch' method, then print accuracy and per-class metrics for the
    validation and test splits.
    """
    # Step 1: load and split
    splits = load_and_split_data("books.txt", task='multiclass')
    train_texts, train_labels, val_texts, val_labels, test_texts, test_labels = splits

    # Class index = position of the label in the sorted set of training labels.
    classes = sorted(set(train_labels))
    label_to_idx = dict(zip(classes, range(len(classes))))

    train_labels, val_labels, test_labels = [
        [label_to_idx[name] for name in names]
        for names in (train_labels, val_labels, test_labels)
    ]

    # Step 2: preprocess and tokenize
    print("\nPreprocessing and tokenizing texts...")
    train_tokens, val_tokens, test_tokens = [
        [tokenize(preprocess_text(doc)) for doc in docs]
        for docs in (train_texts, val_texts, test_texts)
    ]

    # Step 3: TF-IDF features (vocabulary comes from the training split)
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_tokens)
    X_train, X_val, X_test = [
        vectorizer.transform(tokens)
        for tokens in (train_tokens, val_tokens, test_tokens)
    ]

    # Step 4: train
    print("\nTraining multi-class logistic regression...")
    model = MultiClassLogisticRegression(
        n_classes=len(classes),
        n_features=len(vectorizer.vocabulary),
        learning_rate=0.001,
        lambda_reg=0.001,
        num_epochs=100,
        batch_size=64,
    )
    model.train(X_train, train_labels, method='minibatch')

    def show(title, scores):
        # Print overall accuracy followed by one metrics section per class.
        print(title)
        print(f"Accuracy: {scores['accuracy']:.4f}")
        for class_name, per_class in scores['per_class'].items():
            print(f"\n{class_name}:")
            print(f"Precision: {per_class['precision']:.4f}")
            print(f"Recall:    {per_class['recall']:.4f}")
            print(f"F1-score:  {per_class['f1']:.4f}")

    # Step 5: evaluate on validation, then test
    print("\nEvaluating model...")

    val_pred = model.predict(X_val)
    val_metrics = evaluate_multiclass(val_labels, val_pred, classes)
    show("\nValidation Results:", val_metrics)

    test_pred = model.predict(X_test)
    test_metrics = evaluate_multiclass(test_labels, test_pred, classes)
    show("\nTest Results:", test_metrics)

if __name__ == "__main__":
    # Script entry point: the two experiments run back to back, binary first.
    # Binary classification: main() reads "SMSSpamCollection".
    main()
    # Multi-class classification: main_multiclass() reads "books.txt".
    main_multiclass()

Reading data from SMSSpamCollection...
Total number of samples loaded: 5574

Label Distribution:
ham: 4827 (86.60%)
spam: 747 (13.40%)

Preprocessing and tokenizing texts...

Training with SGD...
Class weights: {0: 0.5758783584292885, 1: 3.794747081712062}

Training with SGD:
Epoch	Loss		ΔLoss
0	0.658678	inf
5	0.485584	0.010013
10	0.454077	-0.003496
15	0.445167	0.002118
20	0.443342	0.000229
25	0.441141	-0.000934
30	0.442649	-0.001832

Early stopping at epoch 31

SGD Validation Results:
Accuracy:  0.9510
Precision: 0.7939
Recall:    0.8814
F1-score:  0.8353

SGD Test Results:
Accuracy:  0.9630
Precision: 0.8333
Recall:    0.9130
F1-score:  0.8714

Training with MINIBATCH...
Class weights: {0: 0.5758783584292885, 1: 3.794747081712062}

Training with MINIBATCH:
Epoch	Loss		ΔLoss
0	0.693433	inf
5	0.687035	0.001509
10	0.682705	0.001094
15	0.678016	0.001670
20	0.671605	0.001584
25	0.667312	0.001235
30	0.663232	0.000150
35	0.658947	0.000840
40	0.653376	0.001105
45	0.649089	-0.000153
50	0.6422