<a href="https://colab.research.google.com/github/AryanSathish3/Machinelearning25/blob/main/Spambdt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Spam classification on the Spambase dataset with a hand-written Gaussian Naive Bayes-like classifier (runs in Colab)

import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Spambase column names, assembled by group: 48 word-frequency columns,
# 6 character-frequency columns, 3 capital-run-length columns, and the
# trailing 'spam' label column.
column_names = (
    [
        f'word_freq_{token}'
        for token in (
            'make address all 3d our over remove internet '
            'order mail receive will people report addresses free '
            'business email you credit your font 000 money '
            'hp hpl george 650 lab labs telnet 857 '
            'data 415 85 technology 1999 parts pm direct '
            'cs meeting original project re edu table conference'
        ).split()
    ]
    + [
        f'char_freq_{symbol}'
        for symbol in (
            'semicolon', 'parenthesis', 'bracket',
            'exclamation', 'dollar', 'hash',
        )
    ]
    + [
        f'capital_run_length_{stat}'
        for stat in ('average', 'longest', 'total')
    ]
    + ['spam']
)

# Read the raw data file (change the path if it lives elsewhere). The file
# is read with header=None, so the column names are supplied explicitly.
df = pd.read_csv('/content/spambase.data', names=column_names, header=None)
print("Loaded dataset with shape:", df.shape)
print(df.head())

# Separate the 'spam' label from the feature columns
X, y = df.drop(columns='spam'), df['spam']

# Hold out 20% of the rows for testing, stratified on the label,
# with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# -------------------------
# Gaussian Naive Bayes-like Bayesian Classifier
# -------------------------
class BayesianClassifier:
    """Gaussian Naive Bayes-like classifier over DataFrame features.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and standard deviation are estimated in
    ``fit``. Scoring combines the class prior with the per-feature
    likelihoods in log space and normalises the result into probabilities.

    Args:
        eps_std: Floor applied to each estimated standard deviation, so a
            feature that is constant within a class still gets a usable
            (very narrow) distribution.
        eps_likelihood: Floor applied to each per-feature likelihood, so
            its logarithm stays finite.
    """

    def __init__(self, eps_std=1e-6, eps_likelihood=1e-12):
        self.priors = {}            # {class: prior probability}
        self.feature_params = {}    # {class: {feature: {'mean':..., 'std':...}}}
        self.eps_std = eps_std
        self.eps_likelihood = eps_likelihood

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Estimate class priors and per-class Gaussian parameters.

        Any state from a previous ``fit`` call is discarded, so the model
        only ever knows the classes present in the most recent training set.

        Args:
            X_train: Feature matrix, one row per sample.
            y_train: Class label per row of ``X_train``; a Series, array
                or list whose labels are convertible with ``int``.
        """
        # Build into fresh dicts and publish them at the end: refitting
        # must not keep classes that only existed in an earlier training set.
        priors = {}
        feature_params = {}

        labels = np.asarray(y_train)
        n_samples = X_train.shape[0]

        for c in np.unique(labels):
            # prior: proportion of samples in class c
            X_c = X_train.iloc[labels == c]
            priors[int(c)] = X_c.shape[0] / n_samples

            # Per-feature mean and population std (ddof=0). Every class
            # returned by np.unique has at least one row, so X_c is non-empty.
            params = {}
            for col in X_train.columns:
                col_vals = X_c[col].values
                mean = np.mean(col_vals)
                std = np.std(col_vals, ddof=0)
                if std <= self.eps_std:
                    std = self.eps_std
                params[col] = {'mean': float(mean), 'std': float(std)}
            feature_params[int(c)] = params

        self.priors = priors
        self.feature_params = feature_params

    def calculate_likelihood(self, x_value: float, feature_name: str, class_label: int) -> float:
        """Return the Gaussian density of ``x_value`` for one feature and class.

        The density is floored at ``eps_likelihood`` so callers can take
        its logarithm without hitting ``log(0)``.

        Raises:
            KeyError: If the class or feature was not seen during ``fit``.
        """
        p = self.feature_params[class_label][feature_name]
        likelihood = norm.pdf(x_value, loc=p['mean'], scale=p['std'])
        # clip to avoid exact zero
        return float(np.clip(likelihood, self.eps_likelihood, None))

    def calculate_posterior(self, x_row: pd.Series) -> dict:
        """Return normalised class probabilities for a single sample.

        Args:
            x_row: Feature values indexed by feature name.

        Returns:
            Mapping ``{class: probability}``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.priors:
            raise ValueError(
                "BayesianClassifier is not fitted yet; call fit() first."
            )

        log_post = {}
        for c, prior in self.priors.items():
            # if prior is 0 (shouldn't be after fit) the class cannot win
            if prior <= 0:
                log_post[c] = -np.inf
                continue
            lp = np.log(prior)
            for feature_name, value in x_row.items():
                lp += np.log(self.calculate_likelihood(value, feature_name, c))
            log_post[c] = lp

        # convert log-posteriors to normalized probabilities (stable softmax):
        # subtracting the maximum keeps the largest exponent at exp(0) == 1
        max_lp = max(log_post.values())
        exps = {c: np.exp(lp - max_lp) for c, lp in log_post.items()}
        total = sum(exps.values())
        return {c: e / total for c, e in exps.items()}

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """Return the most probable class for every row of ``X_test``.

        Rows are scored one at a time with ``calculate_posterior``.

        Returns:
            Integer array with one predicted label per row.
        """
        preds = []
        # iterate rows (acceptable for this dataset; vectorization is possible but more code)
        for _, row in X_test.iterrows():
            probs = self.calculate_posterior(row)
            preds.append(int(max(probs, key=probs.get)))
        return np.array(preds, dtype=int)


# Fit the classifier on the training split
model = BayesianClassifier()
model.fit(X_train, y_train)
print("Model trained.")

# Score the held-out split
y_pred = model.predict(X_test)

# Overall accuracy
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc:.4f}")

# True-vs-predicted label counts
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

# Per-class metrics report
cr = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", cr)

Loaded dataset with shape: (4601, 58)
   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_semicolon  \
0             0.00            0.00  ..