In [18]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
import matplotlib.pyplot as plt
from scipy import sparse

In [1]:
def calculate_metrics(y_true, y_pred, dataset_name=""):
    """Compute and print binary spam/ham classification metrics.

    'spam' is treated as the positive class and 'ham' as the negative class.

    Args:
        y_true: Array-like of ground-truth labels ('spam' / 'ham').
        y_pred: Array-like of predicted labels, same length as ``y_true``.
        dataset_name (str): Name shown in the header of the printed report.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``f1_score`` and a
        nested ``confusion_matrix`` dict with the ``TP``/``FP``/``TN``/``FN``
        counts. A ratio whose denominator is zero is reported as 0.0
        instead of nan.
    """
    # Work on plain arrays so the comparison is positional; two pandas Series
    # with different indexes would otherwise be aligned (or rejected) by label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Confusion matrix components
    TP = np.sum((y_pred == 'spam') & (y_true == 'spam'))
    FP = np.sum((y_pred == 'spam') & (y_true == 'ham'))
    TN = np.sum((y_pred == 'ham') & (y_true == 'ham'))
    FN = np.sum((y_pred == 'ham') & (y_true == 'spam'))

    # Metrics, each guarded against an empty denominator (e.g. the model never
    # predicts 'spam', or the split contains no 'spam' at all).
    accuracy = np.mean(y_pred == y_true) if y_true.size else 0.0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    # Print results
    print(f"Metrics for {dataset_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    print("\n")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'confusion_matrix': {
            'TP': TP,
            'FP': FP,
            'TN': TN,
            'FN': FN
        }
    }

# Tập dữ liệu Enron-Spam

In [None]:
# Load the training and validation splits and discard the bookkeeping columns
# that the cells below do not use.
df_train = pd.read_csv('Data/train.csv').drop(columns=['Message ID', 'Unnamed: 0', 'split'])
df_val = pd.read_csv('Data/val.csv').drop(columns=['Message ID', 'Unnamed: 0', 'split'])

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Preview the first rows of the training frame.
df_train.head()

# Tiền xử lý dữ liệu

In [None]:
# Class balance of the training labels (count per distinct 'Spam/Ham' value).
print(df_train['Spam/Ham'].value_counts())

In [None]:
# Report the size of each split; the labels say which split a count belongs to.
print(f"Number of training rows: {len(df_train)}")
print(f"Number of validation rows: {len(df_val)}")

1. Phân tích dữ liệu thiếu

In [None]:
# Per-column count of null entries in each split.
print("\nMissing values in training set:")
print(df_train.isnull().sum())
print("\nMissing values in validation set:")
print(df_val.isnull().sum())

2. Xử lý dữ liệu thiếu

In [9]:
# Replace missing subjects / bodies with empty strings in both splits so the
# two text columns can be concatenated later.
df_train[['Subject', 'Message']] = df_train[['Subject', 'Message']].fillna('')
df_val[['Subject', 'Message']] = df_val[['Subject', 'Message']].fillna('')

3. Kết hợp Subject và Message

In [10]:
# Build a single 'text' column per email: subject, one space, then the body.
df_train['text'] = df_train['Subject'] + ' ' + df_train['Message']
df_val['text'] = df_val['Subject'] + ' ' + df_val['Message']

4. Chuyển dữ liệu về dạng vector BoW

In [49]:
# Bag-of-words vectorizer; stop_words='english' selects scikit-learn's built-in
# English stop-word list.
bow = CountVectorizer(stop_words='english')

In [50]:
# Learn the vocabulary from the training text only, then reuse that same
# vocabulary to encode the validation text.
X_train = bow.fit_transform(df_train['text'])
X_val = bow.transform(df_val['text'])

# Labels are the raw 'Spam/Ham' column values.
y_train = df_train['Spam/Ham']
y_val = df_val['Spam/Ham']

# Naive Bayes Classifier

## Hiện thực mô hình

In [14]:
class MultinomialNB:
    """Multinomial Naive Bayes classifier with additive smoothing.

    Works on dense arrays and on SciPy sparse matrices without densifying them.

    Attributes:
        alpha: Smoothing constant added to every per-class feature count.
        classes: Sorted array of the distinct labels seen in ``fit``.
        class_priors: Array of shape (n_classes,), fraction of training
            samples belonging to each class.
        feature_probs: Array of shape (n_classes, n_features); row ``i`` is
            the smoothed feature distribution of ``classes[i]``.
    """

    # Finite stand-in for log(0), which only occurs when alpha == 0. Keeping it
    # finite makes "absent feature * log(0)" evaluate to 0 instead of nan in
    # the matrix product, while a present zero-probability feature still
    # dominates the score.
    _LOG_ZERO = -1e30

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_priors = None
        self.feature_probs = None
        self.classes = None

    @staticmethod
    def _as_2d(X):
        """Return X as CSR (sparse input) or as a 2-D ndarray (anything else)."""
        if sparse.issparse(X):
            return X.tocsr()
        X = np.asarray(X)
        if X.ndim == 1:
            # A single sample given as a flat vector.
            X = X.reshape(1, -1)
        return X

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature probabilities.

        Args:
            X: Count matrix of shape (n_samples, n_features), dense or sparse.
                A 1-D array is treated as a single sample.
            y: Array-like of n_samples class labels.

        Raises:
            ValueError: If X has no rows or columns, or if ``y`` does not have
                one label per row of X.
        """
        X = self._as_2d(X)
        n_samples, n_features = X.shape
        if n_samples == 0 or n_features == 0:
            raise ValueError("Input array X is empty")

        # Plain array so that boolean masks below are positional.
        y = np.asarray(y)
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self.classes, class_counts = np.unique(y, return_counts=True)
        self.class_priors = class_counts / n_samples

        self.feature_probs = np.zeros((len(self.classes), n_features))
        for i, c in enumerate(self.classes):
            rows = np.flatnonzero(y == c)
            # Summing a sparse selection yields a 1 x n_features matrix, hence
            # the asarray/ravel; no dense copy of the rows is ever created.
            counts = np.asarray(X[rows].sum(axis=0), dtype=float).ravel() + self.alpha
            self.feature_probs[i] = counts / counts.sum()

    def _joint_log_likelihood(self, X):
        """Return log P(class) + sum_j count_j * log P(feature_j | class).

        Args:
            X: 2-D dense array or CSR matrix of counts.

        Returns:
            ndarray of shape (n_samples, n_classes).
        """
        # Only strictly positive counts contribute to the score.
        if sparse.issparse(X):
            if X.nnz and not np.all(X.data > 0):
                X = X.copy()  # tocsr() may have returned the caller's object
                X.data = np.where(X.data > 0, X.data, 0)
        else:
            X = np.where(X > 0, X, 0)

        with np.errstate(divide="ignore"):
            log_feature_probs = np.log(self.feature_probs)
        log_feature_probs = np.maximum(log_feature_probs, self._LOG_ZERO)

        scores = np.asarray(X.dot(log_feature_probs.T))
        return scores + np.log(self.class_priors)

    def predict(self, X):
        """Predict the most likely class label for every row of X.

        Args:
            X: Count matrix of shape (n_samples, n_features), dense or sparse.
                A 1-D array is treated as a single sample.

        Returns:
            ndarray of n_samples labels taken from ``self.classes``.
        """
        X = self._as_2d(X)
        scores = self._joint_log_likelihood(X)
        return self.classes[np.argmax(scores, axis=1)]

    def _predict_single(self, x):
        """Predict the label of one dense feature vector."""
        x = np.asarray(x).ravel()
        return self.predict(x.reshape(1, -1))[0]

## Huấn luyện mô hình

In [30]:
# Train the Naive Bayes classifier (default alpha) on the bag-of-words counts.
nb = MultinomialNB()
nb.fit(X_train, y_train)

## Đánh giá mô hình

In [None]:
# Report Naive Bayes metrics on the training split, then on the validation split.
calculate_metrics(y_train, nb.predict(X_train), "Training")
print("--------------------------------")
# The return value is not assigned, so as the last expression of the cell the
# metrics dict is also shown as the cell's output.
calculate_metrics(y_val, nb.predict(X_val), "Validation")

# Logistic Regression

## Hiện thực mô hình

In [21]:
class LogisticRegression:
    """Binary logistic regression trained with mini-batch gradient descent.

    Accepts dense arrays and SciPy sparse matrices. String labels are encoded
    as 1 for 'spam' and 0 for anything else; numeric labels are used as given.

    Attributes:
        learning_rate: Step size of each gradient update.
        max_iter: Maximum number of passes (epochs) over the training data.
        batch_size: Number of samples per gradient update.
        tol: Training stops once the epoch loss changes by less than this.
        random_state: Seed for the per-epoch shuffle; ``None`` uses NumPy's
            global random state.
        weights: Learned coefficient vector (``None`` before ``fit``).
        bias: Learned intercept (``None`` before ``fit``).
        loss_history: Mean training loss of every completed epoch.
        val_loss_history: Validation loss of every completed epoch (empty when
            no validation data was supplied).
    """

    def __init__(self, learning_rate=0.01, max_iter=1000, batch_size=1000, tol=1e-6,
                 random_state=None):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.batch_size = batch_size
        self.tol = tol
        self.random_state = random_state
        self.weights = None
        self.bias = None
        self.loss_history = []
        self.val_loss_history = []

    def _sigmoid(self, z):
        """Logistic function; z is clipped so that exp() cannot overflow."""
        z = np.clip(z, -250, 250)
        return 1 / (1 + np.exp(-z))

    def _get_batches(self, X, y, batch_size):
        """Yield consecutive ``(X_batch, y_batch)`` slices of at most batch_size rows."""
        n_samples = X.shape[0]
        for start in range(0, n_samples, batch_size):
            stop = min(start + batch_size, n_samples)
            yield X[start:stop], y[start:stop]

    @staticmethod
    def _prepare_features(X):
        """Return X in a row-sliceable form: CSR if sparse, ndarray otherwise."""
        if sparse.issparse(X):
            return X.tocsr()
        return np.asarray(X)

    @staticmethod
    def _encode_labels(y):
        """Return labels as a NumPy array, mapping strings to 1 ('spam') / 0."""
        y = np.asarray(y)
        if y.size and isinstance(y[0], str):
            return (y == 'spam').astype(int)
        return y

    def _decision_function(self, X):
        """Linear score ``X @ weights + bias`` for dense or sparse X."""
        if sparse.issparse(X):
            return X.dot(self.weights) + self.bias
        return np.dot(X, self.weights) + self.bias

    def fit(self, X, y, X_val=None, y_val=None):
        """Train the model, recording the loss after every epoch.

        Args:
            X: Feature matrix of shape (n_samples, n_features), dense or sparse.
            y: Array-like of n_samples labels ('spam'/'ham' strings or 0/1).
            X_val: Optional validation features.
            y_val: Optional validation labels. Validation loss is tracked only
                when both ``X_val`` and ``y_val`` are given.

        Raises:
            ValueError: If X has no rows, or ``y`` does not have one label per
                row of X.
        """
        X = self._prepare_features(X)
        y = self._encode_labels(y)

        has_validation = X_val is not None and y_val is not None
        if has_validation:
            y_val = self._encode_labels(y_val)

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0
        self.loss_history = []
        self.val_loss_history = []

        # getattr: instances unpickled from an older version of this class
        # have no random_state attribute.
        seed = getattr(self, 'random_state', None)
        rng = np.random if seed is None else np.random.RandomState(seed)

        epsilon = 1e-15
        prev_cost = float('inf')

        for iteration in range(self.max_iter):
            total_cost = 0.0

            # Visit the samples in a fresh random order every epoch.
            order = rng.permutation(n_samples)
            X_shuffled = X[order]
            y_shuffled = y[order]

            for X_batch, y_batch in self._get_batches(X_shuffled, y_shuffled, self.batch_size):
                n_batch = X_batch.shape[0]

                predictions = self._sigmoid(self._decision_function(X_batch))
                # Keep probabilities strictly inside (0, 1) so log() stays finite.
                predictions = np.clip(predictions, epsilon, 1 - epsilon)

                batch_cost = -np.mean(y_batch * np.log(predictions) +
                                      (1 - y_batch) * np.log(1 - predictions))
                # Weight by batch length: the last batch may be shorter.
                total_cost += batch_cost * n_batch

                dz = predictions - y_batch
                if sparse.issparse(X_batch):
                    dw = X_batch.T.dot(dz) / n_batch
                else:
                    dw = np.dot(X_batch.T, dz) / n_batch
                db = np.mean(dz)

                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db

            avg_cost = total_cost / n_samples
            self.loss_history.append(avg_cost)

            if has_validation:
                self.val_loss_history.append(self._calculate_loss(X_val, y_val))

            if abs(prev_cost - avg_cost) < self.tol:
                print(f"Converged at iteration {iteration + 1}")
                break

            prev_cost = avg_cost

            if (iteration + 1) % 100 == 0:
                # Only mention validation loss when one was actually recorded.
                val_info = f", Val Loss: {self.val_loss_history[-1]:.6f}" if has_validation else ""
                print(f"Iteration {iteration + 1}, Train Loss: {avg_cost:.6f}{val_info}")

    def _calculate_loss(self, X, y):
        """Mean binary cross-entropy of the current parameters on (X, y)."""
        predictions = self._sigmoid(self._decision_function(X))
        epsilon = 1e-15
        predictions = np.clip(predictions, epsilon, 1 - epsilon)

        return -np.mean(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of X.

        Rows are scored in chunks of ``self.batch_size``.

        Returns:
            ndarray of shape (n_samples,).
        """
        X = self._prepare_features(X)
        n_samples = X.shape[0]
        probabilities = np.zeros(n_samples)

        for start in range(0, n_samples, self.batch_size):
            stop = min(start + self.batch_size, n_samples)
            probabilities[start:stop] = self._sigmoid(self._decision_function(X[start:stop]))

        return probabilities

    def predict(self, X):
        """Return 0/1 predictions: 1 where the predicted probability is >= 0.5."""
        probabilities = self.predict_proba(X)
        return (probabilities >= 0.5).astype(int)

    def plot_loss(self, figsize=(12, 5)):
        """Plot the recorded loss curves on a linear and a log-scale axis.

        Validation loss is drawn as well when it was recorded. A short text
        summary of the training loss is printed after the figure.

        Args:
            figsize: Size of the two-panel figure passed to ``plt.subplots``.
        """
        if not self.loss_history:
            print("No loss history available. Train the model first.")
            return

        # getattr: instances unpickled from an older version of this class may
        # never have had val_loss_history set.
        val_history = getattr(self, 'val_loss_history', [])
        has_val = len(val_history) > 0

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
        panels = (
            (ax1, ax1.plot, 'Loss',
             'Training vs Validation Loss' if has_val else 'Training Loss'),
            (ax2, ax2.semilogy, 'Loss (log scale)',
             'Loss Convergence (Log Scale)' if has_val else 'Training Loss (Log Scale)'),
        )

        for ax, draw, ylabel, title in panels:
            if has_val:
                draw(self.loss_history, label='Training Loss', color='blue', linewidth=2)
                draw(val_history, label='Validation Loss', color='red', linewidth=2)
            else:
                draw(self.loss_history, color='blue', linewidth=2)
            ax.set_xlabel('Iteration')
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            if has_val:
                ax.legend()
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        print(f"Initial Loss: {self.loss_history[0]:.6f}")
        print(f"Final Loss: {self.loss_history[-1]:.6f}")
        print(f"Loss Reduction: {((self.loss_history[0] - self.loss_history[-1]) / self.loss_history[0] * 100):.2f}%")
        print(f"Total Iterations: {len(self.loss_history)}")

## Huấn luyện mô hình

In [None]:
# Train logistic regression; passing the validation split makes fit() record a
# validation loss after every epoch alongside the training loss.
model = LogisticRegression(learning_rate=0.1, max_iter=1500, batch_size=1000)
model.fit(X_train, y_train, X_val, y_val)

In [None]:
# Plot the training / validation loss curves recorded during fit().
model.plot_loss()

## Đánh giá mô hình

In [None]:
# model.predict returns 0/1, so map 1 -> 'spam' and 0 -> 'ham' before comparing
# with the string labels. Assigning to `_` keeps the returned dict out of the
# cell output.
_ = calculate_metrics(y_train, np.where(model.predict(X_train) == 1, 'spam', 'ham'), "Training")
print("--------------------------------")
_ = calculate_metrics(y_val, np.where(model.predict(X_val) == 1, 'spam', 'ham'), "Validation")

# Thử nghiệm thực tế

In [103]:
def preprocess_text(subject='', message=''):
    """Join an email subject and body into one string separated by a space.

    Args:
        subject (str): Email subject; a null value is treated as ''.
        message (str): Email body; a null value is treated as ''.

    Returns:
        str: ``subject + ' ' + message`` after null substitution.
    """
    subject_part = '' if pd.isnull(subject) else subject
    message_part = '' if pd.isnull(message) else message
    return subject_part + ' ' + message_part

def predict_email(subject, message, vectorizer, model):
    """Predict spam/ham for a single email.

    Args:
        subject (str): Email subject
        message (str): Email body
        vectorizer (CountVectorizer): Fitted vectorizer
        model: Trained classifier exposing ``predict``; it may return either
            numeric 0/1 predictions or label strings.

    Returns:
        str: "spam" or "ham" when the model predicts integers (1 means spam);
        otherwise the model's own prediction, returned unchanged.
    """
    text = preprocess_text(subject, message)
    X = vectorizer.transform([text])
    pred = model.predict(X)[0]
    # Accept any integer width: astype(int) yields int32 on some platforms and
    # int64 on others, so comparing against np.int32 alone is not portable.
    if isinstance(pred, (int, np.integer)):
        return "spam" if pred == 1 else "ham"
    return pred

def predict_csv(csv_path, vectorizer, model, label_col='Spam/Ham'):
    """Predict and evaluate on a CSV file with columns: Subject, Message, Spam/Ham.

    Args:
        csv_path (str): Path to CSV file
        vectorizer (CountVectorizer): Fitted vectorizer
        model: Trained classifier exposing ``predict``; it may return either
            numeric 0/1 predictions (1 means spam) or 'spam'/'ham' labels.
        label_col (str): Name of label column

    Returns:
        dict: Metrics (accuracy, precision, recall, f1_score, confusion_matrix).
        A ratio whose denominator is zero is reported as 0.0.
    """
    df = pd.read_csv(csv_path)
    df['Subject'] = df['Subject'].fillna('')
    df['Message'] = df['Message'].fillna('')
    df['text'] = df['Subject'] + ' ' + df['Message']
    X = vectorizer.transform(df['text'])
    y_true = np.asarray(df[label_col].values)
    y_pred = np.asarray(model.predict(X))

    # Convert numeric predictions to labels. The check covers every integer
    # width (and booleans): astype(int) is int32 on some platforms and int64
    # on others, and checking the array dtype also works for empty output.
    if y_pred.dtype.kind in 'iub':
        y_pred_label = np.where(y_pred == 1, 'spam', 'ham')
    else:
        y_pred_label = y_pred

    TP = np.sum((y_pred_label == 'spam') & (y_true == 'spam'))
    FP = np.sum((y_pred_label == 'spam') & (y_true == 'ham'))
    TN = np.sum((y_pred_label == 'ham') & (y_true == 'ham'))
    FN = np.sum((y_pred_label == 'ham') & (y_true == 'spam'))

    # Every ratio is guarded so an absent class gives 0.0 rather than nan.
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total > 0 else 0.0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'confusion_matrix': {
            'TP': TP,
            'FP': FP,
            'TN': TN,
            'FN': FN
        }
    }


## Đánh giá mô hình

## Thử nghiệm với 1 email

In [53]:
import pickle

# Load the saved models and vectorizer. The context managers close each file
# handle once the object has been read.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by this project. Presumably the pickles hold
# instances of the classes defined above, in which case those cells must have
# been run first -- TODO confirm.
with open('Models/lr_model.pkl', 'rb') as fh:
    lr_model = pickle.load(fh)
with open('Models/nb_model.pkl', 'rb') as fh:
    nb_model = pickle.load(fh)
with open('Models/BoW.pkl', 'rb') as fh:
    bow = pickle.load(fh)

In [None]:
# Set SUBJECT / MESSAGE to the email to classify; leave both as None to use
# the built-in sample email below.
SUBJECT = None
MESSAGE = None

# Fall back to the sample only when neither field was provided.
if not SUBJECT and not MESSAGE:
    SUBJECT = "Hello"
    MESSAGE = "This is a test message"

print("Predicting email with Naive Bayes Classifier: ", end="")
print(predict_email(SUBJECT, MESSAGE, bow, nb_model))
print("--------------------------------")
print("Predicting email with Logistic Regression Classifier: ", end="")
print(predict_email(SUBJECT, MESSAGE, bow, lr_model))
print("--------------------------------")

## Thử nghiệm với 1 file csv

In [71]:
import pickle

# Reload the saved models and vectorizer so this section can be run on its
# own. The context managers close each file handle once the object is read.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by this project.
with open('Models/lr_model.pkl', 'rb') as fh:
    lr_model = pickle.load(fh)
with open('Models/nb_model.pkl', 'rb') as fh:
    nb_model = pickle.load(fh)
with open('Models/BoW.pkl', 'rb') as fh:
    bow = pickle.load(fh)

In [None]:
# Set CSV_PATH to the file to evaluate; leave it as None to use the
# validation split.
CSV_PATH = None

if not CSV_PATH:
    CSV_PATH = "Data/val.csv"

# Evaluate both loaded models on the same file.
print("Predicting email with Logistic Regression Classifier")
predict_csv(CSV_PATH, bow, lr_model)
print("--------------------------------")
print("Predicting email with Naive Bayes Classifier")
predict_csv(CSV_PATH, bow, nb_model)
print("--------------------------------")