In [None]:






















import pandas as pd

In [2]:
data = pd.read_csv('data.csv', encoding='latin-1')# handle any potential encoding issues.
data

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,spam
1,Subject: the stock trading gunslinger fanny i...,spam
2,Subject: unbelievable new homes made easy im ...,spam
3,Subject: 4 color printing special request add...,spam
4,"Subject: do not have money , get software cds ...",spam
...,...,...
495,Subject: re : invitation to speak at power 200...,not spam
496,"Subject: re : aram ' s visit jesus , i yalke...",not spam
497,Subject: - - - - - - - - - - - - - - - - - - -...,not spam
498,Subject: reschedule meeting for duffie report ...,not spam


In [3]:
import re

# Ensure all text values are strings
data['text'] = data['text'].fillna('').astype(str)

# Preprocess function: lowercase and remove non-alphabetic characters
def preprocess(text):
    return re.sub(r'\W+', ' ', text).lower().split()

# Apply preprocessing
data['text'] = data['text'].apply(preprocess)


In [4]:
data['Spam']=data['label'].apply(lambda x:1 if x=='spam' else 0)
data.head(5)

Unnamed: 0,text,label,Spam
0,"[subject, naturally, irresistible, your, corpo...",spam,1
1,"[subject, the, stock, trading, gunslinger, fan...",spam,1
2,"[subject, unbelievable, new, homes, made, easy...",spam,1
3,"[subject, 4, color, printing, special, request...",spam,1
4,"[subject, do, not, have, money, get, software,...",spam,1


In [5]:
from sklearn.model_selection import train_test_split

X = data['text']
y = data['Spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [6]:
def calculate_prior(y_train):
    total = len(y_train)
    p_spam = sum(y_train) / total
    p_ham = 1 - p_spam
    return p_spam, p_ham

p_spam, p_ham = calculate_prior(y_train)
calculate_prior(y_train)

(0.504, 0.496)

In [7]:
from collections import defaultdict

def calculate_likelihood(X_train, y_train):
    spam_words = defaultdict(int)
    ham_words = defaultdict(int)
    total_spam, total_ham = 0, 0

    for i, message in enumerate(X_train):
        for word in message:
            if y_train.iloc[i] == 1:  # Spam
                spam_words[word] += 1
                total_spam += 1
            else: 
                ham_words[word] += 1
                total_ham += 1

    return spam_words, ham_words, total_spam, total_ham

spam_words, ham_words, total_spam, total_ham = calculate_likelihood(X_train, y_train)
#calculate_likelihood(X_train, y_train)

In [8]:
def classify(message, spam_words, ham_words, total_spam, total_ham, p_spam, p_ham, alpha=1):
    words = preprocess(message)
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in words:
        p_spam_given_message *= (spam_words[word] + alpha) / (total_spam + alpha * len(spam_words))
        p_ham_given_message *= (ham_words[word] + alpha) / (total_ham + alpha * len(ham_words))

    return 1 if p_spam_given_message > p_ham_given_message else 0

In [9]:
y_pred = X_test.apply(lambda x: classify(' '.join(x), spam_words, ham_words, total_spam, total_ham, p_spam, p_ham))


# Calculate accuracy
accuracy = sum(y_pred == y_test) / len(y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 75.20%


In [10]:
def predict_and_evaluate(new_message):

    prediction = classify(new_message, spam_words, ham_words, total_spam, total_ham, p_spam, p_ham, alpha=1)
    # Output the prediction
    if prediction == 1:
        result = "The message is spam."
    else:
        result = "The message is not spam."


    return result

# Example usage
new_message = input("Enter the new mail: ")
result = predict_and_evaluate(new_message)
print(result)


Enter the new mail: hi
The message is not spam.


In [13]:
import pandas as pd
import joblib
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Load data
data = pd.read_csv('data.csv', encoding='latin-1')

# Ensure text column is string and preprocess
data['text'] = data['text'].fillna('').astype(str)

# Preprocess function: lowercase & remove non-alphabetic characters
def preprocess(text):
    return re.sub(r'\W+', ' ', text).lower()

data['text'] = data['text'].apply(preprocess)
data['Spam'] = data['label'].apply(lambda x: 1 if x == 'spam' else 0)

# Split data
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['Spam'], test_size=0.25, random_state=42)

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Naïve Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Train Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

# Evaluate models
nb_pred = nb_model.predict(X_test_tfidf)
lr_pred = lr_model.predict(X_test_tfidf)

nb_acc = accuracy_score(y_test, nb_pred)
lr_acc = accuracy_score(y_test, lr_pred)

nb_conf_matrix = confusion_matrix(y_test, nb_pred)
lr_conf_matrix = confusion_matrix(y_test, lr_pred)

print(f"Naïve Bayes Accuracy: {nb_acc * 100:.2f}%")
print(f"Logistic Regression Accuracy: {lr_acc * 100:.2f}%")

print("\nNaïve Bayes Confusion Matrix:\n", nb_conf_matrix)
print("\nLogistic Regression Confusion Matrix:\n", lr_conf_matrix)

# Save models and vectorizer
joblib.dump(nb_model, "naive_bayes_model.pkl")
joblib.dump(lr_model, "logistic_regression_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("Models and vectorizer saved successfully!")


Naïve Bayes Accuracy: 97.60%
Logistic Regression Accuracy: 98.40%

Naïve Bayes Confusion Matrix:
 [[64  1]
 [ 2 58]]

Logistic Regression Confusion Matrix:
 [[63  2]
 [ 0 60]]
Models and vectorizer saved successfully!
