<a href="https://colab.research.google.com/github/Abhishek0716ss/MachineLearning-2025/blob/main/SpamBasedDecision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import urllib.request

# --- Step 1: Download the dataset automatically ---
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
urllib.request.urlretrieve(url, "spambase.data")

# --- Step 2: Define column names ---
column_names = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_semicolon', 'char_freq_parenthesis', 'char_freq_bracket', 'char_freq_exclamation',
    'char_freq_dollar', 'char_freq_hash', 'capital_run_length_average',
    'capital_run_length_longest', 'capital_run_length_total', 'spam'
]

# --- Step 3: Load the dataset ---
df = pd.read_csv("spambase.data", header=None, names=column_names)
print(" Dataset loaded successfully!")
print(df.head())

# --- Step 4: Split into features and labels ---
X = df.drop('spam', axis=1)
y = df['spam']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

# --- Step 5: Define Bayesian Classifier ---
class BayesianClassifier:
    def __init__(self):
        self.priors = {}
        self.feature_params = {}

    def fit(self, X_train, y_train):
        classes = y_train.unique()
        for c in classes:
            self.priors[c] = np.mean(y_train == c)
            X_c = X_train[y_train == c]
            self.feature_params[c] = {
                col: {'mean': np.mean(X_c[col]), 'std': np.std(X_c[col])}
                for col in X_train.columns
            }

    def calculate_likelihood(self, x, feature_name, class_label):
        params = self.feature_params[class_label][feature_name]
        mean = params['mean']
        std = params['std']
        if std == 0:
            return 1e-9
        return norm.pdf(x, mean, std)

    def calculate_posterior(self, x_row):
        posteriors = {}
        for c, prior in self.priors.items():
            log_posterior = np.log(prior)
            for feature_name, value in x_row.items():
                likelihood = self.calculate_likelihood(value, feature_name, c)
                log_posterior += np.log(likelihood)
            posteriors[c] = log_posterior
        max_log_posterior = max(posteriors.values())
        exp_posteriors = {c: np.exp(lp - max_log_posterior) for c, lp in posteriors.items()}
        sum_exp_posteriors = sum(exp_posteriors.values())
        return {c: exp_posteriors[c] / sum_exp_posteriors for c in exp_posteriors}

    def predict(self, X_test):
        predictions = []
        for _, row in X_test.iterrows():
            posteriors = self.calculate_posterior(row)
            predicted_class = max(posteriors, key=posteriors.get)
            predictions.append(predicted_class)
        return np.array(predictions)

# --- Step 6: Train and evaluate the model ---
bayesian_model = BayesianClassifier()
bayesian_model.fit(X_train, y_train)
print(" Model training complete.")

y_pred = bayesian_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Accuracy: {accuracy:.4f}")

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)


 Dataset loaded successfully!
   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_semicolon  \
0             0.00            0.00  ...       

  log_posterior += np.log(likelihood)
  exp_posteriors = {c: np.exp(lp - max_log_posterior) for c, lp in posteriors.items()}



 Accuracy: 0.8339

Confusion Matrix:
[[420 138]
 [ 15 348]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.75      0.85       558
           1       0.72      0.96      0.82       363

    accuracy                           0.83       921
   macro avg       0.84      0.86      0.83       921
weighted avg       0.87      0.83      0.84       921

