In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

# Load dataset
file_path = r'C:\\Users\\anand\\Downloads\\minorprjct\\Data_set\\mail_data.csv'
mail_data = pd.read_csv(file_path)

# Check for missing values and convert labels to binary
mail_data['Category'] = mail_data['Category'].map({'ham': 0, 'spam': 1})

# Split data into features and target
X = mail_data['Message']
y = mail_data['Category']

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Function to preprocess sparse data using StandardScaler (for Logistic Regression)
def preprocess_data(data):
    scaler = StandardScaler(with_mean=False)  # Avoid dense conversion for sparse data
    return scaler.fit_transform(data)

# Scaling the TF-IDF data (for Logistic Regression)
X_train_scaled = preprocess_data(X_train_tfidf)
X_test_scaled = preprocess_data(X_test_tfidf)

# --- Custom Logistic Regression using PyTorch ---
class LogisticRegressionModel:
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def fit(self, X, y):
        X = torch.tensor(X.toarray(), dtype=torch.float32).to(self.device)
        y = torch.tensor(y.values, dtype=torch.float32).to(self.device)
        n_samples, n_features = X.shape
        
        # Initialize weights and bias on the correct device
        self.weights = torch.zeros(n_features, dtype=torch.float32, device=self.device, requires_grad=True)
        self.bias = torch.zeros(1, dtype=torch.float32, device=self.device, requires_grad=True)

        # Optimizer and loss function
        optimizer = optim.SGD([self.weights, self.bias], lr=self.learning_rate)
        criterion = nn.BCELoss()  # Binary Cross-Entropy loss

        # Training loop
        for _ in range(self.n_iters):
            linear_model = torch.matmul(X, self.weights) + self.bias
            predictions = torch.sigmoid(linear_model)
            loss = criterion(predictions, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    def predict(self, X):
        X = torch.tensor(X.toarray(), dtype=torch.float32).to(self.device)
        linear_model = torch.matmul(X, self.weights) + self.bias
        predictions = torch.sigmoid(linear_model)
        return [1 if i > 0.5 else 0 for i in predictions.cpu().detach().numpy()]

# --- Custom Naive Bayes ---
class NaiveBayesModel:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_log_prior_ = None
        self.feature_log_prob_ = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        
        # Initialize log-prior and log-likelihood probabilities
        self.class_log_prior_ = np.zeros(n_classes)
        self.feature_log_prob_ = np.zeros((n_classes, n_features))
        
        # Loop over each class and compute the likelihood and prior
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.class_log_prior_[idx] = np.log(X_c.shape[0] / n_samples)
            total_word_count = X_c.sum(axis=0) + self.alpha
            total_class_word_count = total_word_count.sum()
            self.feature_log_prob_[idx, :] = np.log(total_word_count / total_class_word_count)

    def predict(self, X):
        log_probs = (X @ self.feature_log_prob_.T) + self.class_log_prior_
        return np.argmax(log_probs, axis=1)

# --- Hyperparameter Tuning for Custom Logistic Regression ---
log_reg_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'n_iters': [1000, 2000, 5000]
}

def tune_logistic_regression(X_train, y_train, X_test, y_test):
    best_acc = 0
    best_params = {}
    for lr in log_reg_grid['learning_rate']:
        for n_iter in log_reg_grid['n_iters']:
            model = LogisticRegressionModel(learning_rate=lr, n_iters=n_iter)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            print(f"Logistic Regression - Learning Rate: {lr}, Iterations: {n_iter}, Accuracy: {accuracy * 100:.2f}%")
            if accuracy > best_acc:
                best_acc = accuracy
                best_params = {'learning_rate': lr, 'n_iters': n_iter}
    print(f"Best Logistic Regression Parameters: {best_params}, Accuracy: {best_acc * 100:.2f}%")
    return best_params

# --- Hyperparameter Tuning for Custom Naive Bayes ---
nb_grid = {'alpha': [0.01, 0.1, 0.5, 1, 5]}

def tune_naive_bayes(X_train, y_train, X_test, y_test):
    best_acc = 0
    best_params = {}
    for alpha in nb_grid['alpha']:
        model = NaiveBayesModel(alpha=alpha)
        model.fit(X_train.toarray(), y_train)
        y_pred = model.predict(X_test.toarray())
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Naive Bayes - Alpha: {alpha}, Accuracy: {accuracy * 100:.2f}%")
        if accuracy > best_acc:
            best_acc = accuracy
            best_params = {'alpha': alpha}
    print(f"Best Naive Bayes Parameters: {best_params}, Accuracy: {best_acc * 100:.2f}%")
    return best_params

# --- Main Execution ---
# Tune Logistic Regression
print("Tuning Custom Logistic Regression (GPU enabled if available)...")
best_log_reg_params = tune_logistic_regression(X_train_scaled, y_train, X_test_scaled, y_test)

# Tune Naive Bayes
print("\nTuning Custom Naive Bayes...")
best_nb_params = tune_naive_bayes(X_train_tfidf, y_train, X_test_tfidf, y_test)


Tuning Custom Logistic Regression (GPU enabled if available)...
Logistic Regression - Learning Rate: 0.001, Iterations: 1000, Accuracy: 97.58%
Logistic Regression - Learning Rate: 0.001, Iterations: 2000, Accuracy: 98.30%
Logistic Regression - Learning Rate: 0.001, Iterations: 5000, Accuracy: 98.74%
Logistic Regression - Learning Rate: 0.01, Iterations: 1000, Accuracy: 98.83%
Logistic Regression - Learning Rate: 0.01, Iterations: 2000, Accuracy: 98.74%
Logistic Regression - Learning Rate: 0.01, Iterations: 5000, Accuracy: 98.57%
Logistic Regression - Learning Rate: 0.1, Iterations: 1000, Accuracy: 98.65%
Logistic Regression - Learning Rate: 0.1, Iterations: 2000, Accuracy: 98.74%
Logistic Regression - Learning Rate: 0.1, Iterations: 5000, Accuracy: 98.48%
Best Logistic Regression Parameters: {'learning_rate': 0.01, 'n_iters': 1000}, Accuracy: 98.83%

Tuning Custom Naive Bayes...
Naive Bayes - Alpha: 0.01, Accuracy: 98.57%
Naive Bayes - Alpha: 0.1, Accuracy: 98.65%
Naive Bayes - Alpha: 