<a href="https://colab.research.google.com/github/Demon-Sheriff/Linear-Alg_ML_fs/blob/master/Naive_bayes_for_Blog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
class NaiveBayes():
  """Binary multinomial Naive Bayes classifier for count features.

  Labels follow the convention 0 -> ham, 1 -> spam. All parameters are kept
  in log space so that prediction is a sum of log-probabilities.

  Attributes set by ``fit``:
    class_priors: dict mapping label (0 or 1) to the log prior of that class.
    vocab_size: number of feature columns seen during training.
    log_spam_likelihoods: 1-D array, smoothed log P(feature | spam).
    log_ham_likelihoods: 1-D array, smoothed log P(feature | ham).
  """

  def __init__(self):

    self.class_priors = {} # log class priors keyed by label (0 -> ham, 1 -> spam)
    self.vocab_size = None # number of features OR number of unique words in the vocabulary
    self.log_spam_likelihoods = None # per-feature log-probabilities for the spam class (set by fit)
    self.log_ham_likelihoods = None # per-feature log-probabilities for the ham class (set by fit)

  def fit(self, X_train, y_train, alpha=1.0):

    """ Train Naive Bayes with Laplace Smoothing

    Args:
      X_train: array-like of shape (n_samples, n_features) holding
        non-negative feature counts.
      y_train: array-like of shape (n_samples,) containing only 0 (ham)
        and 1 (spam).
      alpha: additive smoothing constant added to every per-class feature
        count. With alpha == 0 a feature unseen in a class gets a
        log-likelihood of -inf.

    Raises:
      ValueError: if X_train is not 2-D, is empty, y_train does not have one
        label per row, or y_train contains labels other than 0 and 1.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    if X_train.ndim != 2:
      raise ValueError("X_train must be 2-D (n_samples, n_features)")
    n_samples, self.vocab_size = X_train.shape # number of training examples, number of unique words.
    if n_samples == 0:
      raise ValueError("X_train must contain at least one sample")
    if y_train.shape != (n_samples,):
      raise ValueError("y_train must be 1-D with one label per row of X_train")
    if not np.isin(y_train, (0, 1)).all():
      raise ValueError("y_train may only contain the labels 0 (ham) and 1 (spam)")

    # The same masks drive both the priors and the per-class counts so the two can never disagree.
    spam_mask = y_train == 1
    ham_mask = y_train == 0

    # compute the class priors
    spam_count = spam_mask.sum()
    ham_count = n_samples - spam_count
    # A class that is absent from the training data legitimately gets a prior of
    # log(0) = -inf (it is never predicted); silence only that expected warning.
    with np.errstate(divide='ignore'):
      self.class_priors[1] = np.log(spam_count / n_samples)  # log p(spam)
      self.class_priors[0] = np.log(ham_count / n_samples)  # log p(ham)

    # compute the feature probabilities with Laplace smoothing (alpha represents the smoothing parameter)
    spam_word_count = X_train[spam_mask].sum(axis=0) + alpha
    ham_word_count = X_train[ham_mask].sum(axis=0) + alpha

    # The smoothed counts already contain alpha once per feature, so their sum is
    # exactly (raw total + vocab_size * alpha). Adding vocab_size * alpha again
    # would double-count the smoothing mass and the likelihoods would not sum to 1.
    total_spam_words = spam_word_count.sum()
    total_ham_words = ham_word_count.sum()

    self.log_spam_likelihoods = np.log(spam_word_count / total_spam_words)
    self.log_ham_likelihoods = np.log(ham_word_count / total_ham_words)

  def predict(self, X_test):

    """ Predict class labels for input samples

    Args:
      X_test: array-like of shape (n_samples, n_features) with the same
        feature columns that were used in ``fit``.

    Returns:
      Integer array of shape (n_samples,): 1 where the spam log-posterior is
      strictly greater than the ham log-posterior, otherwise 0.

    Raises:
      AttributeError: if called before ``fit``.
    """
    if self.log_spam_likelihoods is None or self.log_ham_likelihoods is None:
      # AttributeError keeps the exception type an unfitted instance raised before.
      raise AttributeError("NaiveBayes instance is not fitted yet; call fit() before predict()")
    X_test = np.asarray(X_test)

    # Unnormalised log-posterior: log prior + sum over features of count * log-likelihood.
    log_prob_spam = self.class_priors[1] + X_test @ self.log_spam_likelihoods
    log_prob_ham = self.class_priors[0] + X_test @ self.log_ham_likelihoods

    # Predict spam if log_prob_spam > log_prob_ham
    return (log_prob_spam > log_prob_ham).astype(int)

In [None]:
# Toy 3x3 integer matrix for trying out the array expressions in the scratch cells that follow.
X = np.array([[1,2,3],[4,5,6],[7,8,9]])
X

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [None]:
# Toy labels for the three rows of X (first two rows labelled 1, last row labelled 0).
y = [1,1,0]
y = np.array(y)  # convert to ndarray so that `y == 1` yields an element-wise boolean mask

In [None]:
y == 1

array([ True,  True, False])

In [None]:
X[y == 1]

array([[1, 2, 3],
       [4, 5, 6]])

In [None]:
X[y == 1].sum(axis=0) + 1

array([ 6,  8, 10])

In [None]:
y.sum()

2

In [None]:
# Scratch expression. Operator precedence: the division by y.sum() happens first and 2 is
# then added to every element (e.g. (1 + 1) / 2 + 2 = 3.0).
# NOTE(review): if `/ (y.sum() + 2)` was intended, parentheses are missing - confirm.
(X[y == 1] + 1) / y.sum() + 2

array([[3. , 3.5, 4. ],
       [4.5, 5. , 5.5]])

In [None]:
import numpy as np

class NaiveBayes():
  """Binary multinomial Naive Bayes classifier for count features.

  Labels follow the convention 0 -> ham, 1 -> spam. All parameters are kept
  in log space so that prediction is a sum of log-probabilities.

  Attributes set by ``fit``:
    class_priors: dict mapping label (0 or 1) to the log prior of that class.
    vocab_size: number of feature columns seen during training.
    log_spam_likelihoods: 1-D array, smoothed log P(feature | spam).
    log_ham_likelihoods: 1-D array, smoothed log P(feature | ham).
  """

  def __init__(self):

    self.class_priors = {} # log class priors keyed by label (0 -> ham, 1 -> spam)
    self.vocab_size = None # number of features OR number of unique words in the vocabulary
    self.log_spam_likelihoods = None # per-feature log-probabilities for the spam class (set by fit)
    self.log_ham_likelihoods = None # per-feature log-probabilities for the ham class (set by fit)

  def fit(self, X_train, y_train, alpha=1.0):

    """ Train Naive Bayes with Laplace Smoothing

    Args:
      X_train: array-like of shape (n_samples, n_features) holding
        non-negative feature counts.
      y_train: array-like of shape (n_samples,) containing only 0 (ham)
        and 1 (spam).
      alpha: additive smoothing constant added to every per-class feature
        count. With alpha == 0 a feature unseen in a class gets a
        log-likelihood of -inf.

    Raises:
      ValueError: if X_train is not 2-D, is empty, y_train does not have one
        label per row, or y_train contains labels other than 0 and 1.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    if X_train.ndim != 2:
      raise ValueError("X_train must be 2-D (n_samples, n_features)")
    n_samples, self.vocab_size = X_train.shape # number of training examples, number of unique words.
    if n_samples == 0:
      raise ValueError("X_train must contain at least one sample")
    if y_train.shape != (n_samples,):
      raise ValueError("y_train must be 1-D with one label per row of X_train")
    if not np.isin(y_train, (0, 1)).all():
      raise ValueError("y_train may only contain the labels 0 (ham) and 1 (spam)")

    # The same masks drive both the priors and the per-class counts so the two can never disagree.
    spam_mask = y_train == 1
    ham_mask = y_train == 0

    # compute the class priors
    spam_count = spam_mask.sum()
    ham_count = n_samples - spam_count
    # A class that is absent from the training data legitimately gets a prior of
    # log(0) = -inf (it is never predicted); silence only that expected warning.
    with np.errstate(divide='ignore'):
      self.class_priors[1] = np.log(spam_count / n_samples) # log p(spam)
      self.class_priors[0] = np.log(ham_count / n_samples) # log p(ham)

    # compute the feature probabilities with Laplace smoothing (alpha represents the smoothing parameter)
    spam_word_count = X_train[spam_mask].sum(axis=0) + alpha
    ham_word_count = X_train[ham_mask].sum(axis=0) + alpha

    # The smoothed counts already contain alpha once per feature, so their sum is
    # exactly (raw total + vocab_size * alpha). Adding vocab_size * alpha again
    # would double-count the smoothing mass and the likelihoods would not sum to 1.
    total_spam_words = spam_word_count.sum()
    total_ham_words = ham_word_count.sum()

    self.log_spam_likelihoods = np.log(spam_word_count / total_spam_words)
    self.log_ham_likelihoods = np.log(ham_word_count / total_ham_words)


  def predict(self, X_test):

    """ Predict class labels for input samples

    Args:
      X_test: array-like of shape (n_samples, n_features) with the same
        feature columns that were used in ``fit``.

    Returns:
      Integer array of shape (n_samples,): 1 where the spam log-posterior is
      strictly greater than the ham log-posterior, otherwise 0.

    Raises:
      AttributeError: if called before ``fit``.
    """
    if self.log_spam_likelihoods is None or self.log_ham_likelihoods is None:
      # AttributeError keeps the exception type an unfitted instance raised before.
      raise AttributeError("NaiveBayes instance is not fitted yet; call fit() before predict()")
    X_test = np.asarray(X_test)

    # Unnormalised log-posterior: log prior + sum over features of count * log-likelihood.
    log_prob_spam = self.class_priors[1] + X_test @ self.log_spam_likelihoods
    log_prob_ham = self.class_priors[0] + X_test @ self.log_ham_likelihoods

    # Predict spam if log_prob_spam > log_prob_ham
    return (log_prob_spam > log_prob_ham).astype(int)

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
# Fetch version 1 of the 'spambase' dataset from OpenML (needs network access on first run).
# as_frame=True asks for pandas containers rather than raw arrays.
spambase = fetch_openml('spambase', version=1, as_frame=True)

In [None]:
print(spambase)

{'data':       word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0               0.00               0.64           0.64           0.0   
1               0.21               0.28           0.50           0.0   
2               0.06               0.00           0.71           0.0   
3               0.00               0.00           0.00           0.0   
4               0.00               0.00           0.00           0.0   
...              ...                ...            ...           ...   
4596            0.31               0.00           0.62           0.0   
4597            0.00               0.00           0.00           0.0   
4598            0.30               0.00           0.30           0.0   
4599            0.96               0.00           0.00           0.0   
4600            0.00               0.00           0.65           0.0   

      word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0              0.32            0.00              

In [None]:
# Rebinds X and y (previously the toy arrays) to the spambase features and target.
# NOTE(review): no later cell in this notebook view reads X or y again - the model is trained on
# the Kaggle e-mail dataset loaded further down - so this may be leftover exploration.
X, y = spambase.data, spambase.target

In [None]:
X

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.0,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.0,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78


In [None]:
import pandas as pd

In [None]:
import kagglehub

# Download the "venky73/spam-mails-dataset" Kaggle dataset and keep the local directory it was
# stored in; `path` is used by the next cell to locate the CSV file.
path = kagglehub.dataset_download("venky73/spam-mails-dataset")
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/venky73/spam-mails-dataset/versions/1


In [None]:
import os
# Load the raw e-mail table from the downloaded dataset directory into a DataFrame.
data = pd.read_csv(os.path.join(path, "spam_ham_dataset.csv"))

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
data[data['label_num'] == 0].shape[0], data[data['label_num'] == 1].shape[0]

(3672, 1499)

In [None]:
import re # Regular expressions for text cleaning (removing special characters, numbers, etc.)
import nltk # Natural Language Toolkit for text processing tasks
from nltk.corpus import stopwords # Stopwords list to remove common words like "the", "and", etc.
from nltk.tokenize import word_tokenize # Tokenization to split text into words
from nltk.stem import WordNetLemmatizer # Lemmatization to reduce words to their base form
from sklearn.feature_extraction.text import CountVectorizer  # Converts text into a bag-of-words representation

In [None]:
# One-time download of the NLTK resources the preprocessing cells rely on.
nltk.download('punkt_tab')  # tokenizer data for word_tokenize
nltk.download('wordnet')  # lexical database for WordNetLemmatizer
nltk.download('stopwords')  # word lists for stopwords.words('english')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Module-level helpers shared by preprocess_text; built once instead of once per document.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # a set gives constant-time membership tests

In [None]:
def preprocess_text(text):
    """Normalise one raw document into a space-separated string of cleaned tokens.

    Steps, in order: lower-case the text, delete every character that is not
    a-z or whitespace, tokenize with ``word_tokenize``, discard tokens found in
    the module-level ``stop_words`` set, and lemmatize the survivors with the
    module-level ``lemmatizer``.
    """
    lowered = text.lower()

    # Characters outside a-z / whitespace are deleted (not replaced by a space).
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue  # stopwords are dropped before lemmatization
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

In [None]:
# Run preprocess_text on every row of the 'text' column and store the result in a new
# 'clean_text' column (this adds the column to `data` in place).
data['clean_text'] = data['text'].apply(preprocess_text)

In [None]:
data.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num', 'clean_text'], dtype='object')

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,clean_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom january see attached file hpln...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonderful...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop window office cheap main tre...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject indian spring deal book teco pvr reven...


In [None]:
# Keep only what the model needs: dropping the raw text, the original row id and the string
# label leaves the numeric label ('label_num') and the cleaned text ('clean_text').
clean_data = data.drop(columns=['text', 'Unnamed: 0', 'label'])

In [None]:
clean_data.head()

Unnamed: 0,label_num,clean_text
0,0,subject enron methanol meter follow note gave ...
1,0,subject hpl nom january see attached file hpln...
2,0,subject neon retreat ho ho ho around wonderful...
3,1,subject photoshop window office cheap main tre...
4,0,subject indian spring deal book teco pvr reven...


In [None]:
clean_data['label_num'].value_counts()

Unnamed: 0_level_0,count
label_num,Unnamed: 1_level_1
0,3672
1,1499


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the documents for testing; random_state fixes the shuffle for reproducibility.
# NOTE(review): no stratify= argument is passed, so the class ratio of each split is left to the shuffle.
X_train_text, X_test_text, y_train, y_test = train_test_split(clean_data['clean_text'], clean_data['label_num'], test_size=0.2, random_state=42)

In [None]:
# Bag-of-words encoder with default settings; it is fitted on the training documents only so
# that no vocabulary information comes from the held-out test set.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)  # Learn vocabulary from training set

In [None]:
# Encode the test documents with the already-fitted vectorizer (transform only, no refit).
X_test = vectorizer.transform(X_test_text)

In [None]:
print(X_train.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Inspection cell: shows the learned feature names, the dense document-term matrix and the
# word -> column-index mapping.
# NOTE(review): .toarray() materialises the full dense matrix and printing vocabulary_ dumps the
# whole mapping; both can be large for a real corpus.
print("Feature Names:", vectorizer.get_feature_names_out())  # Check vocabulary
print("========================================================================================================================")
print("Vectorized Data:\n", X_train.toarray())  # See transformed data
print("========================================================================================================================")
print("Vocabulary:", vectorizer.vocabulary_)  # Shows learned words

Feature Names: ['aa' 'aaa' 'aabda' ... 'zzo' 'zzocb' 'zzsyt']
Vectorized Data:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
custom_clf = NaiveBayes()

In [None]:
y_train

Unnamed: 0,label_num
5132,0
2067,1
4716,0
4710,0
2268,1
...,...
4426,0
466,0
3092,1
3772,0


In [None]:
# Train the hand-written classifier on the dense count matrix; alpha is left at its default of 1.0.
custom_clf.fit(X_train.toarray(), y_train)

In [None]:
X_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Predicted labels (0 = ham, 1 = spam) for the held-out test documents.
y_pred = custom_clf.predict(X_test.toarray())

In [None]:
from sklearn.metrics import classification_report

In [None]:
y_test

Unnamed: 0,label_num
1566,0
1988,1
1235,0
2868,0
4903,0
...,...
1175,1
4476,0
4198,1
2689,0


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       742
           1       0.97      0.94      0.95       293

    accuracy                           0.97      1035
   macro avg       0.97      0.96      0.97      1035
weighted avg       0.97      0.97      0.97      1035



In [None]:
from collections import Counter
# Tally the training labels to show the class imbalance before any resampling is applied.
print("Class distribution before SMOTE:", Counter(y_train))

Class distribution before SMOTE: Counter({0: 2930, 1: 1206})


In [None]:
y_train.value_counts()

Unnamed: 0_level_0,count
label_num,Unnamed: 1_level_1
0,2930
1,1206


In [None]:
from imblearn.over_sampling import SMOTE


In [None]:
# apply smote resampling to balance the classes
# Only the training split is resampled; X_test / y_test are left untouched for evaluation.
# NOTE(review): SMOTE presumably synthesises minority rows by interpolating between existing ones,
# so the resampled matrix may contain non-integer "counts" - confirm this is acceptable for a
# multinomial model.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train.toarray(), y_train)

In [None]:
# check the class distribution
# Tally the resampled training labels to compare against the pre-SMOTE counts.
print("Class distribution after SMOTE:", Counter(y_train_resampled))

Class distribution after SMOTE: Counter({0: 2930, 1: 2930})


In [None]:
new_clf = NaiveBayes()

In [None]:
# Train a fresh instance of the hand-written classifier on the SMOTE-resampled training data.
new_clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# Evaluate on the original (non-resampled) test split.
y_pred_new = new_clf.predict(X_test.toarray())

In [None]:
print(classification_report(y_test, y_pred_new))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       742
           1       0.96      0.96      0.96       293

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
sk_clf = MultinomialNB()

In [None]:
# Reference model: scikit-learn's MultinomialNB trained on the same resampled data as new_clf.
sk_clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predictions of the scikit-learn reference model on the same test split, for comparison.
y_pred_sk = sk_clf.predict(X_test.toarray())

In [None]:
print(classification_report(y_test, y_pred_sk))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       742
           1       0.95      0.96      0.96       293

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035

