In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,confusion_matrix

In [7]:
df = pd.read_csv("spam.csv", encoding="latin-1")[['v1','v2']]
df.columns = ["label","text"]
df["label"] = df["label"].map({'ham':0,'spam':1})

In [8]:
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df["text_length"]=df["text"].apply(len)
df["num_words"]=df["text"].apply(lambda x:len(x.split()))
df["num_digits"]=df["text"].apply(lambda x:sum(c.isdigit() for c in x))


In [10]:
x = df[["text_length","num_words","num_digits"]]
y = df["label"]

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [16]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train_scaled,y_train)
y_pred_gnb = gnb.predict(x_test_scaled)

In [19]:
print(accuracy_score(y_test,y_pred_gnb))

0.9802690582959641


In [24]:
print(confusion_matrix(y_test,y_pred_gnb))

[[953  12]
 [ 10 140]]


In [25]:
#MULTINOMIAL NAIVE BYES

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Convert the data into word count vectors
vectorizer = CountVectorizer(stop_words='english')
X_count = vectorizer.fit_transform(df['text'])

# Convert counts to TF-IDF representation
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_count)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, df['label'], test_size=0.2, random_state=42
)

# Train Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Make predictions
y_pred = mnb.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:", confusion_matrix(y_test, y_pred))
print("Classification report:", classification_report(y_test, y_pred))

Accuracy: 0.968609865470852
Confusion matrix: [[965   0]
 [ 35 115]]
Classification report:               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115



#BERNOULLI NAIVE BAYES

In [27]:
important_words = ["free","win","offer","money","urgent"]
for word in important_words:
    df[word] = df["text"].apply(lambda x : 1 if word in x.lower() else 0)
x=df[important_words]
y=df["label"]


In [28]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [31]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(x_train,y_train)
y_pred_bnb = bnb.predict(x_test)

In [32]:

print("Accuracy:", accuracy_score(y_test, y_pred_bnb))
print("Confusion matrix:", confusion_matrix(y_test, y_pred_bnb))
print("Classification report:", classification_report(y_test, y_pred_bnb))

Accuracy: 0.895067264573991
Confusion matrix: [[929  36]
 [ 81  69]]
Classification report:               precision    recall  f1-score   support

           0       0.92      0.96      0.94       965
           1       0.66      0.46      0.54       150

    accuracy                           0.90      1115
   macro avg       0.79      0.71      0.74      1115
weighted avg       0.88      0.90      0.89      1115



In [35]:
df = pd.read_csv("spam.csv", encoding="latin-1")[['v1','v2']]
df.columns = ["label","text"]
df["label"] = df["label"].map({'ham':0,'spam':1})

In [36]:


# Display first 5 rows
print(df.head())
df.describe()
print("\nMessage counts:")
print(df["label"].value_counts())

print("\n0 = Not suspicious (Ham)")
print("1 = Suspicious (Spam)")
X = df["text"]
y = df["label"]
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Split data
X = df["text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words="english")
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes
model = MultinomialNB()
model.fit(X_train_vec, y_train)

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...

Message counts:
label
0    4825
1     747
Name: count, dtype: int64

0 = Not suspicious (Ham)
1 = Suspicious (Spam)
Accuracy: 0.9668161434977578
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
 [[965   0]
 [ 37 113]]
