In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from NaiveBayes import MultinomialNB
import re

# 1. EDA

In [24]:
# Columns removed from every split right after loading.
DROPPED_COLUMNS = ['Message ID', 'Unnamed: 0', 'split']


def load_split(csv_path):
    """Read one dataset split from CSV and return it without DROPPED_COLUMNS.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of the file except those listed in
        ``DROPPED_COLUMNS``.

    Raises
    ------
    KeyError
        If any of ``DROPPED_COLUMNS`` is absent from the file (the default
        ``errors='raise'`` behaviour of ``DataFrame.drop``).
    """
    # drop() returns a new frame, so no in-place mutation is needed and the
    # load/clean steps are written once instead of being repeated per split.
    return pd.read_csv(csv_path).drop(columns=DROPPED_COLUMNS)


df_train = load_split('train.csv')
df_val = load_split('val.csv')

In [25]:
# Summarize column dtypes and non-null counts of the training frame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27284 entries, 0 to 27283
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Subject   27055 non-null  object
 1   Message   26932 non-null  object
 2   Spam/Ham  27284 non-null  object
dtypes: object(3)
memory usage: 639.6+ KB


In [26]:
# Class balance of the training labels (how many spam vs. ham rows)
label_counts = df_train['Spam/Ham'].value_counts()
print(label_counts)

Spam/Ham
spam    13858
ham     13426
Name: count, dtype: int64


In [27]:
# Preview the first five training rows
df_train.head()

Unnamed: 0,Subject,Message,Spam/Ham
0,christmas tree farm pictures,,ham
1,"vastar resources , inc .","gary , production from the high island larger ...",ham
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham
3,re : issue,fyi - see note below - already done .\nstella\...,ham
4,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham


In [28]:
# Row count of each split: the first line printed is the training set,
# the second is the validation set.
for split_frame in (df_train, df_val):
    print(f"Number of rows: {len(split_frame)}")

Number of rows: 27284
Number of rows: 3084


In [29]:
# Report the per-column NaN count of each split under its own heading
missing_reports = (
    ("\nMissing values in training set:", df_train),
    ("\nMissing values in validation set:", df_val),
)
for heading, split_frame in missing_reports:
    print(heading)
    print(split_frame.isnull().sum())


Missing values in training set:
Subject     229
Message     352
Spam/Ham      0
dtype: int64

Missing values in validation set:
Subject     29
Message     35
Spam/Ham     0
dtype: int64


# 2. Preprocessing

In [30]:
# Prepare both splits the same way: blank out missing text fields, then join
# subject and body into a single document per e-mail.
for split_frame in (df_train, df_val):
    for text_column in ('Subject', 'Message'):
        # NaN -> '' so every row holds a string before concatenation
        split_frame[text_column] = split_frame[text_column].fillna('')
    split_frame['text'] = split_frame['Subject'] + ' ' + split_frame['Message']

# Bag-of-words vectorizer with English stop words removed
bow = CountVectorizer(stop_words='english')

# The vocabulary is learned from the training text only; the validation text
# is mapped onto that same vocabulary.
X_train = bow.fit_transform(df_train['text'])
X_val = bow.transform(df_val['text'])

# Target labels ('spam' / 'ham')
y_train, y_val = df_train['Spam/Ham'], df_val['Spam/Ham']

In [31]:
def to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array.

    Objects exposing ``toarray()`` (such as the sparse matrices returned by
    ``CountVectorizer``) are densified through it; anything else goes through
    ``np.asarray``, so an input that is already an ndarray is returned as is.
    """
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


# This cell rebinds X_train / X_val to their dense form. Calling .toarray()
# unconditionally raised AttributeError when the cell was executed a second
# time (the names already held ndarrays); to_dense keeps the cell re-runnable.
# NOTE(review): a dense bag-of-words matrix can be very large — confirm that
# NaiveBayes.MultinomialNB really needs dense input.
X_train = to_dense(X_train)
y_train = np.array(y_train)
X_val = to_dense(X_val)
y_val = np.array(y_val)

# 3. Naive Bayes

In [34]:
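# # Multinomial Naive Bayes implemented from scratch.
# # NOTE(review): this commented-out class has the same name as the one imported
# # from `NaiveBayes` at the top of the notebook; the import is what actually
# # runs below. Presumably kept for reference - confirm it matches the module.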
# class MultinomialNB:
#     def __init__(self, alpha=1.0):
#         self.alpha = alpha  # Laplace smoothing parameter
#         self.class_priors = None
#         self.feature_probs = None
#         self.classes = None
        
#     def fit(self, X, y):
#         # Check if X is empty or 1-dimensional
#         if X.size == 0:
#             raise ValueError("Input array X is empty")
#         if len(X.shape) == 1:
#             X = X.reshape(1, -1)
            
#         # Get shape, handling both sparse and dense matrices
#         n_samples = X.shape[0]
#         n_features = X.shape[1]
            
#         self.classes = np.unique(y)
#         n_classes = len(self.classes)
        
#         # Calculate class priors P(y)
#         self.class_priors = np.zeros(n_classes)
#         for i, c in enumerate(self.classes):
#             self.class_priors[i] = np.sum(y == c) / n_samples
            
#         # Calculate feature probabilities P(x|y) with Laplace smoothing
#         # Process one class at a time to reduce memory usage
#         self.feature_probs = np.zeros((n_classes, n_features))
#         for i, c in enumerate(self.classes):
#             # Get indices of samples belonging to current class
#             class_indices = np.where(y == c)[0]
            
#             # Calculate feature counts in batches
#             feature_counts = np.zeros(n_features) + self.alpha
#             batch_size = 1000  # Adjust based on available memory
            
#             for start_idx in range(0, len(class_indices), batch_size):
#                 end_idx = min(start_idx + batch_size, len(class_indices))
#                 batch_indices = class_indices[start_idx:end_idx]
                
#                 # Sum features for current batch
#                 if isinstance(X, np.ndarray):
#                     batch_sum = X[batch_indices].sum(axis=0)
#                 else:
#                     batch_sum = X[batch_indices].toarray().sum(axis=0)
                    
#                 feature_counts += batch_sum
                
#             total_counts = feature_counts.sum()
#             self.feature_probs[i] = feature_counts / total_counts
            
#     def predict(self, X):
#         # Handle 1-dimensional input
#         if len(X.shape) == 1:
#             X = X.reshape(1, -1)
            
#         # Predict in batches to save memory
#         predictions = []
#         batch_size = 1000  # Adjust based on available memory
        
#         for start_idx in range(0, X.shape[0], batch_size):
#             end_idx = min(start_idx + batch_size, X.shape[0])
#             if isinstance(X, np.ndarray):
#                 batch = X[start_idx:end_idx]
#             else:
#                 batch = X[start_idx:end_idx].toarray()
                
#             batch_predictions = np.array([self._predict_single(x) for x in batch])
#             predictions.extend(batch_predictions)
            
#         return np.array(predictions)
    
#     def _predict_single(self, x):
#         # Calculate log probabilities to prevent numerical underflow
#         log_probs = np.log(self.class_priors)
        
#         # Add log of feature probabilities where feature is present (x > 0)
#         for i in range(len(self.classes)):
#             # Only consider non-zero features to save computation
#             present_features = x > 0
#             if np.any(present_features):
#                 # Multiply probability by feature count (for multinomial)
#                 log_probs[i] += np.sum(np.log(self.feature_probs[i][present_features]) * x[present_features])
        
#         # Return class with highest probability
#         return self.classes[np.argmax(log_probs)]

# Fit the classifier on the training features and labels
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict a label for every validation e-mail
y_pred = nb.predict(X_val)

# Accuracy = share of validation rows whose predicted label equals the true one
is_correct = y_pred == y_val
accuracy = np.mean(is_correct)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.9903
