<a href="https://colab.research.google.com/github/Amogh-S-Acharya/Fifth_Sem_ML_Lab/blob/main/spambase_decision_theory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Column names derived from spambase.names and spambase.DOCUMENTATION
column_names = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_semicolon', 'char_freq_parenthesis', 'char_freq_bracket', 'char_freq_exclamation',
    'char_freq_dollar', 'char_freq_hash', 'capital_run_length_average',
    'capital_run_length_longest', 'capital_run_length_total', 'spam'
]

# Read the data into a pandas DataFrame
df = pd.read_csv('/content/spambase.data', header=None, names=column_names)

# Display the first few rows
display(df.head())

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_semicolon,char_freq_parenthesis,char_freq_bracket,char_freq_exclamation,char_freq_dollar,char_freq_hash,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df.drop('spam', axis=1)
y = df['spam']

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (3680, 57) (3680,)
Testing set shape: (921, 57) (921,)


In [4]:
import numpy as np
from scipy.stats import norm

class BayesianClassifier:
    def __init__(self):
        self.priors = {}
        self.feature_params = {} # Stores mean and std for each feature for each class

    def fit(self, X_train, y_train):
        """
        Trains the Bayesian classifier by calculating priors and feature parameters.

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series): Training target variable.
        """
        classes = y_train.unique()
        for c in classes:
            # Calculate prior probability for each class
            self.priors[c] = np.mean(y_train == c)

            # Calculate mean and standard deviation for each feature for the current class
            X_c = X_train[y_train == c]
            self.feature_params[c] = {
                col: {'mean': np.mean(X_c[col]), 'std': np.std(X_c[col])}
                for col in X_train.columns
            }

    def calculate_likelihood(self, x, feature_name, class_label):
        """
        Calculates the likelihood of a feature value given a class using a Gaussian distribution.

        Args:
            x (float): The feature value.
            feature_name (str): The name of the feature.
            class_label (int): The class label (0 for non-spam, 1 for spam).

        Returns:
            float: The likelihood probability.
        """
        params = self.feature_params[class_label][feature_name]
        mean = params['mean']
        std = params['std']

        # Avoid division by zero for standard deviation
        if std == 0:
            return 1e-9 # Return a small probability if std is zero
        return norm.pdf(x, mean, std)


    def calculate_posterior(self, x_row):
        """
        Calculates the posterior probability for each class for a given data point.

        Args:
            x_row (pd.Series): A single data point (row of features).

        Returns:
            dict: A dictionary with class labels as keys and posterior probabilities as values.
        """
        posteriors = {}
        for c, prior in self.priors.items():
            # Start with the log of the prior to avoid underflow
            log_posterior = np.log(prior)
            for feature_name, value in x_row.items():
                likelihood = self.calculate_likelihood(value, feature_name, c)
                # Add the log of the likelihood
                log_posterior += np.log(likelihood)
            posteriors[c] = log_posterior

        # Convert log posteriors back to probabilities and normalize
        # Using log-sum-exp trick for numerical stability
        max_log_posterior = max(posteriors.values())
        exp_posteriors = {c: np.exp(lp - max_log_posterior) for c, lp in posteriors.items()}
        sum_exp_posteriors = sum(exp_posteriors.values())

        return {c: exp_posteriors[c] / sum_exp_posteriors for c in exp_posteriors}

    def predict(self, X_test):
        """
        Predicts the class labels for the test data.

        Args:
            X_test (pd.DataFrame): Testing features.

        Returns:
            np.ndarray: Predicted class labels.
        """
        predictions = []
        for index, row in X_test.iterrows():
            posteriors = self.calculate_posterior(row)
            # Classify based on the highest posterior probability
            predicted_class = max(posteriors, key=posteriors.get)
            predictions.append(predicted_class)
        return np.array(predictions)

# Instantiate and train the classifier
bayesian_model = BayesianClassifier()
bayesian_model.fit(X_train, y_train)

print("Model training complete.")

Model training complete.


In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = bayesian_model.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate and print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Calculate and print classification report
class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

  log_posterior += np.log(likelihood)
  exp_posteriors = {c: np.exp(lp - max_log_posterior) for c, lp in posteriors.items()}


Accuracy: 0.8339

Confusion Matrix:
[[420 138]
 [ 15 348]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.75      0.85       558
           1       0.72      0.96      0.82       363

    accuracy                           0.83       921
   macro avg       0.84      0.86      0.83       921
weighted avg       0.87      0.83      0.84       921

