In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB

In [36]:
# --- Custom Naive Bayes ---
class NaiveBayesModel:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_log_prior_ = None
        self.feature_log_prob_ = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        
        # Initialize log-prior and log-likelihood probabilities
        self.class_log_prior_ = np.zeros(n_classes)
        self.feature_log_prob_ = np.zeros((n_classes, n_features))
        
        # Loop over each class and compute the likelihood and prior
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.class_log_prior_[idx] = np.log(X_c.shape[0] / n_samples)
            total_word_count = X_c.sum(axis=0) + self.alpha
            total_class_word_count = total_word_count.sum()
            self.feature_log_prob_[idx, :] = np.log(total_word_count / total_class_word_count)

    def predict(self, X):
        log_probs = (X @ self.feature_log_prob_.T) + self.class_log_prior_
        return np.argmax(log_probs, axis=1)

In [37]:
# Load dataset
file_path = r'C:\\Users\\anand\\Downloads\\minorprjct\\Data_set\\mail_data.csv'
mail_data = pd.read_csv(file_path)


In [38]:
# Check for missing values and convert labels to binary
mail_data['Category'] = mail_data['Category'].map({'ham': 0, 'spam': 1})

In [39]:
mail_data.head

<bound method NDFrame.head of       Category                                            Message
0            0  Go until jurong point, crazy.. Available only ...
1            0                      Ok lar... Joking wif u oni...
2            1  Free entry in 2 a wkly comp to win FA Cup fina...
3            0  U dun say so early hor... U c already then say...
4            0  Nah I don't think he goes to usf, he lives aro...
...        ...                                                ...
5567         1  This is the 2nd time we have tried 2 contact u...
5568         0               Will ü b going to esplanade fr home?
5569         0  Pity, * was in mood for that. So...any other s...
5570         0  The guy did some bitching but I acted like i'd...
5571         0                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [40]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5572, 2)

In [41]:
# Split data into features and target
X = mail_data['Message']
y = mail_data['Category']

In [42]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [43]:
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64


In [44]:
# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [45]:
print(f"Training data shape after TF-IDF: {X_train_tfidf.shape}")
print(f"Testing data shape after TF-IDF: {X_test_tfidf.shape}")

Training data shape after TF-IDF: (4457, 3000)
Testing data shape after TF-IDF: (1115, 3000)


In [46]:
print(X_train_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 29947 stored elements and shape (4457, 3000)>
  Coords	Values
  (0, 2084)	0.23837149815751218
  (0, 2899)	0.2728918500894947
  (0, 76)	0.2986941661700567
  (0, 2881)	0.3201420180066482
  (0, 2936)	0.30489377517045985
  (0, 761)	0.3809108381169735
  (0, 2224)	0.220420656710453
  (0, 2448)	0.23837149815751218
  (0, 273)	0.3980992162978338
  (0, 953)	0.3065688134464988
  (0, 2234)	0.28473322471630075
  (1, 1290)	0.2926668657093388
  (1, 2378)	0.35396518723942244
  (1, 2677)	0.3166917055325798
  (1, 886)	0.23624769509144955
  (1, 2152)	0.39380872453276744
  (1, 1311)	0.22578290176521376
  (1, 935)	0.37005777987647276
  (1, 1506)	0.2657720689420296
  (1, 1463)	0.20790448603148223
  (1, 984)	0.41962254886121714
  (2, 692)	0.5021760620862613
  (2, 2563)	0.5550623708560215
  (2, 2649)	0.6631176118361115
  (3, 1292)	0.19570264880521462
  :	:
  (4451, 622)	0.3289459744736164
  (4451, 1458)	0.3222833452878481
  (4452, 840)	0.3574895382

In [47]:
print(X_test_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6924 stored elements and shape (1115, 3000)>
  Coords	Values
  (0, 660)	0.38114337679240273
  (0, 770)	0.3944374447652898
  (0, 823)	0.3292593252139578
  (0, 1276)	0.4392579328937267
  (0, 1515)	0.41223620313833464
  (0, 1569)	0.37052826852389337
  (0, 1830)	0.3009314019684027
  (1, 525)	0.37034060973735533
  (1, 734)	0.3578586983359201
  (1, 1347)	0.3234324946551934
  (1, 1548)	0.3234324946551934
  (1, 2045)	0.387052012561607
  (1, 2579)	0.2204999931204713
  (1, 2626)	0.326271353777915
  (1, 2795)	0.2493471978387002
  (1, 2877)	0.3981347747267476
  (2, 502)	0.5530689808395817
  (2, 903)	0.37140936745963093
  (2, 1228)	0.19302212472396826
  (2, 1233)	0.19302212472396826
  (2, 1374)	0.35262312595844614
  (2, 1659)	0.37140936745963093
  (2, 2123)	0.38473841792677693
  (2, 2593)	0.2671012270734155
  (3, 886)	0.345541635127022
  :	:
  (1110, 823)	0.21105221364350785
  (1110, 906)	0.28156031431289125
  (1110, 932)	0.4092710836282

In [48]:
# Use the best parameters found from tuning
best_n_iters = 1000
best_alpha = 0.1

In [49]:
# --- Train and Evaluate Custom Naive Bayes with Best Params ---
best_custom_nb_model = NaiveBayesModel(alpha=best_alpha)
best_custom_nb_model.fit(X_train_tfidf, y_train)
y_pred_best_nb = best_custom_nb_model.predict(X_test_tfidf)

In [50]:
# Evaluate custom Naive Bayes model
accuracy_best_nb = accuracy_score(y_test, y_pred_best_nb)
print(f"\nCustom Naive Bayes Accuracy: {accuracy_best_nb * 100:.2f}%")
print("Custom Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_best_nb))



Custom Naive Bayes Accuracy: 98.65%
Custom Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.97      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [51]:
# --- Scikit-learn Naive Bayes Implementation ---
sklearn_nb_model = MultinomialNB()
sklearn_nb_model.fit(X_train_tfidf, y_train)
y_pred_sklearn_nb = sklearn_nb_model.predict(X_test_tfidf)

# Evaluate Scikit-learn Naive Bayes model
accuracy_sklearn_nb = accuracy_score(y_test, y_pred_sklearn_nb)
print(f"Scikit-learn Naive Bayes Accuracy: {accuracy_sklearn_nb * 100:.2f}%")
print("Scikit-learn Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_sklearn_nb))


Scikit-learn Naive Bayes Accuracy: 98.39%
Scikit-learn Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

