In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import numpy as np

# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Load the preprocessed and tokenized email dataset into a DataFrame
file_path = 'lower_withoutpunctuation_tokenized.csv'  # Replace 'your_dataset.csv' with your dataset path
df = pd.read_csv(file_path)

# Check the unique values in the 'Category' column (0 for not spam, 1 for spam)
print(df['Category'].unique())

# Check the class distribution in 'Category' column
print(df['Category'].value_counts())

# Split the data into training and testing sets (80-20 ratio) with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(
    df['Processed_Message'], df['Category'], test_size=0.2, random_state=random_seed
)

# Create a pipeline with TF-IDF vectorizer and MultinomialNB classifier
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict on the test set
predictions = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy}")

# Save predicted categories (spam or not spam) to a CSV file
results_df = pd.DataFrame({'Processed_Message': X_test, 'Actual_Category': y_test, 'Predicted_Category': predictions})
results_df.to_csv('predicted_emails.csv', index=False)

# Display the DataFrame with actual and predicted categories
print(results_df)


[0 1]
0    4825
1     747
Name: Category, dtype: int64
Model Accuracy: 0.9542600896860987
                                      Processed_Message  Actual_Category  \
3245  ['squeeeeeze', 'this', 'is', 'christmas', 'hug...                0   
944   ['and', 'also', 'ive', 'sorta', 'blown', 'him'...                0   
1044  ['mmm', 'thats', 'better', 'now', 'i', 'got', ...                0   
2484  ['mm', 'have', 'some', 'kanji', 'dont', 'eat',...                0   
812   ['so', 'theres', 'a', 'ring', 'that', 'comes',...                0   
...                                                 ...              ...   
4264  ['den', 'only', 'weekdays', 'got', 'special', ...                0   
2439  ['i', 'not', 'busy', 'juz', 'dun', 'wan', '2',...                0   
5556  ['yes', 'i', 'have', 'so', 'thats', 'why', 'u'...                0   
4205  ['how', 'are', 'you', 'enjoying', 'this', 'sem...                0   
4293                                            ['gwr']                0  