In [None]:
#naive bayes
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
#load xls
data = pd.read_excel('/content/spam.xls')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
#drop last 3 columns
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [None]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#convert upper case to lower case

data['v2'] = data['v2'].astype(str).apply(lambda x:x.lower())

In [None]:
#remove punctuation and stop words
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

data['v2'] = data['v2'].apply(remove_punctuation)

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

data['v2'] = data['v2'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#apply stemming or lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')


lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character used by WordNetLemmatizer"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(text)])

data['v2'] = data['v2'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
data.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think go usf life around though


In [None]:
#convert text into numeric form using tf-idf vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['v2'])

In [None]:
print('\nidf values:')
for ele1, ele2 in zip(vectorizer.get_feature_names_out(), vectorizer.idf_):
    print(ele1, ':', ele2)


idf values:
008704050406 : 8.527076498901426
0089my : 8.93254160700959
0121 : 8.93254160700959
01223585236 : 8.93254160700959
01223585334 : 8.527076498901426
0125698789 : 8.93254160700959
02 : 8.93254160700959
020603 : 8.016250875135436
0207 : 8.527076498901426
02070836089 : 8.93254160700959
02072069400 : 8.93254160700959
02073162414 : 8.527076498901426
02085076972 : 8.93254160700959
020903 : 8.527076498901426
021 : 8.527076498901426
050703 : 8.527076498901426
0578 : 8.527076498901426
06 : 8.93254160700959
060505 : 8.93254160700959
061104 : 8.93254160700959
07008009200 : 8.93254160700959
07046744435 : 8.93254160700959
07090201529 : 8.93254160700959
07090298926 : 8.93254160700959
07099833605 : 8.93254160700959
071104 : 8.527076498901426
07123456789 : 8.527076498901426
0721072 : 8.93254160700959
07732584351 : 8.93254160700959
07734396839 : 8.527076498901426
07742676969 : 8.527076498901426
07753741225 : 8.93254160700959
0776xxxxxxx : 8.527076498901426
07786200117 : 8.527076498901426
077x

In [None]:
print('\nWord indexes:')
print(vectorizer.vocabulary_)
print('\ntf-idf value:')
print(X)
print('\ntf-idf values in matrix form:')
print(X.toarray())


Word indexes:
{'go': 3394, 'jurong': 4194, 'point': 5720, 'crazy': 2292, 'available': 1350, 'bugis': 1766, 'great': 3480, 'world': 8060, 'la': 4335, 'buffet': 1764, 'cine': 2051, 'get': 3354, 'amore': 1143, 'wat': 7855, 'ok': 5335, 'lar': 4370, 'joking': 4164, 'wif': 7972, 'oni': 5366, 'free': 3210, 'entry': 2863, 'wkly': 8018, 'comp': 2154, 'win': 7984, 'fa': 2985, 'cup': 2339, 'final': 3084, 'tkts': 7362, '21st': 450, 'may': 4742, '2005': 437, 'text': 7218, '87121': 859, 'receive': 6068, 'questionstd': 5973, 'txt': 7544, 'ratetcs': 6020, 'apply': 1225, '08452810075over18s': 71, 'dun': 2737, 'say': 6357, 'early': 2755, 'hor': 3775, 'already': 1117, 'nah': 5042, 'dont': 2659, 'think': 7287, 'usf': 7681, 'life': 4446, 'around': 1267, 'though': 7304, 'freemsg': 3218, 'hey': 3681, 'darling': 2399, 'week': 7897, 'word': 8049, 'back': 1394, 'id': 3881, 'like': 4459, 'fun': 3274, 'still': 6914, 'tb': 7158, 'xxx': 8177, 'std': 6891, 'chgs': 2000, 'send': 6429, '150': 350, 'rcv': 6031, 'even'

In [None]:
data.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think go usf life around though


In [None]:
#train naive bayes

# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['v1'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)


In [None]:
# Make predictions on test data

y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
#accuracy precision recall f1score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")

recall= recall_score(y_test, y_pred)
print(f"Recall: {recall}")

f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

Accuracy: 0.9614349775784753
Precision: 1.0
Recall: 0.7133333333333334
F1 Score: 0.8326848249027238


In [None]:
#confusion matrix of correct vs incorrect redictions
from sklearn.metrics import confusion_matrix

confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)


Confusion Matrix:
[[965   0]
 [ 43 107]]
