<a href="https://colab.research.google.com/github/AmSharma05/NaiveBayesvsNN/blob/main/NaiveBayesAlgorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
from sklearn.model_selection import train_test_split

# Read the raw CSV, discard the three "Unnamed: N" columns and give the
# two remaining columns descriptive names.
data = (
    pd.read_csv("spam.csv", encoding='latin-1')
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)

# Encode the label column as integers: 1 where it equals "spam", 0 otherwise.
data["label"] = np.where(data["label"] == "spam", 1, 0)

# Text-normalisation resources: the English stop-word list (fetched through
# NLTK) and a Porter stemmer.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    The text is lowercased, split into runs of word characters, filtered
    against the module-level ``stop_words`` set, and every remaining token
    is passed through the module-level ``stemmer``.

    Args:
        text: The raw message as a ``str``.

    Returns:
        A ``str`` with the stemmed, stop-word-free tokens joined by single
        spaces (empty when no token survives).
    """
    # Raw string literal: '\w' inside a plain string is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    tokens = re.findall(r'\w+', text.lower())

    # Drop stop words and stem the remaining tokens in one pass.
    stemmed_tokens = [
        stemmer.stem(token) for token in tokens if token not in stop_words
    ]

    return ' '.join(stemmed_tokens)

# Normalise every message with preprocess().
data["text"] = data["text"].apply(preprocess)

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.25,
    random_state=42,
)

# Build the vocabulary from the training messages only, then encode both
# splits as token-count matrices.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit the multinomial Naive Bayes model on the training counts.
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Label the held-out messages.
y_pred = clf.predict(X_test_counts)

# Score the predictions against the held-out labels.
f1, precision, accuracy, recall = (
    scorer(y_test, y_pred)
    for scorer in (f1_score, precision_score, accuracy_score, recall_score)
)

print("F1 score:", f1)
print("Precision:", precision)
print("Accuracy:", accuracy)
print("Recall:", recall)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


F1 score: 0.9376693766937669
Precision: 0.9719101123595506
Accuracy: 0.9834888729361091
Recall: 0.9057591623036649
