In [5]:
import pandas as pd
import numpy as np
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Import and understand the dataset
df = pd.read_csv('spam_ham_dataset.csv')
# Replace every CRLF sequence in string cells with a single space.
# Reassignment instead of inplace=True keeps the step chain-friendly.
df = df.replace({r'\r\n': ' '}, regex=True)

# Implementing the BOW algorithm
ps = PorterStemmer()

all_stop_words = set(stopwords.words('english'))
# 'not' stays in the text; discard() does not raise if the word is absent from the list.
all_stop_words.discard('not')

# Translation table that deletes every character in string.punctuation.
# Built once here instead of once per row inside the loop.
punctuation_table = str.maketrans('', '', string.punctuation)


def preprocess_text(raw_text):
    """Normalise one message for the bag-of-words model.

    Lower-cases ``raw_text``, deletes punctuation, splits on whitespace,
    drops every token found in ``all_stop_words`` and Porter-stems the rest.

    Parameters
    ----------
    raw_text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    words = raw_text.lower().translate(punctuation_table).split()
    return ' '.join(ps.stem(word) for word in words if word not in all_stop_words)


# Iterate over the column values themselves rather than df['text'][i]:
# label-based lookup by position only works while the index is 0..n-1.
corpus = [preprocess_text(message) for message in df['text']]

# Creating BOW model
# NOTE(review): the vectoriser is fitted on the whole corpus before the
# train/test split below, so the vocabulary is chosen with test rows included.
cv = CountVectorizer(max_features=42500)
# NOTE(review): toarray() materialises a dense len(corpus) x (up to 42500)
# matrix; the sparse result of fit_transform would be far smaller in memory.
X = cv.fit_transform(corpus).toarray()
y = df['label_num']

# Splitting the dataset into the training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smile\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Training the Model
# fit() returns the estimator itself, so construction and training are chained.
nb_multinomial = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_multinomial.predict(X_test)

# Model Performance
def model_score(y_true, y_pred):
    """Print classification metrics and the confusion matrix for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        All results are written to stdout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Unweighted mean of the four metrics, accumulated in the same order as listed.
    overall = sum((accuracy, precision, recall, f1)) / 4

    print(f'Model accuracy score: {accuracy}')
    print(f'Model precision score: {precision}')
    print(f'Model recall score: {recall}')
    print(f'Model f1 score: {f1}')
    print(f'Average overall score performance: {overall}')

    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

# Report the Naive Bayes metrics on the held-out test split
model_score(y_true=y_test, y_pred=y_pred_nb)


Model accuracy score: 0.966183574879227
Model precision score: 0.9466666666666667
Model recall score: 0.9372937293729373
Model f1 score: 0.9419568822553899
Average overall score performance: 0.9480252132935552
[[716  16]
 [ 19 284]]


In [8]:
# Function to classify text using Naive Bayes
def classify_text(input_text):
    """Predict the class label of one raw message with the fitted Naive Bayes model.

    The message is lower-cased, stripped of punctuation, split on whitespace,
    filtered against ``all_stop_words`` and Porter-stemmed, then vectorised
    with the already-fitted ``cv`` and passed to ``nb_multinomial``.

    Parameters
    ----------
    input_text : str
        Raw message text.

    Returns
    -------
    The single prediction produced by ``nb_multinomial.predict`` for this message.
    """
    stemmer = PorterStemmer()
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokens = input_text.lower().translate(strip_punctuation).split()

    stems = []
    for token in tokens:
        if token in all_stop_words:
            continue
        stems.append(stemmer.stem(token))
    cleaned = ' '.join(stems)

    # One-document batch: the vectoriser expects an iterable of documents.
    features = cv.transform([cleaned]).toarray()

    return nb_multinomial.predict(features)[0]

# User input
user_input = input("Enter a text to classify: ")

if user_input.strip():
    # Classify using Naive Bayes
    nb_result = classify_text(user_input)

    # Print result
    print(f'\nNaive Bayes Classifier Prediction: {"Spam" if nb_result == 1 else "Ham"}')
else:
    # A blank message has no tokens, so its count vector is all zeros and the
    # model could only echo its class prior. Report that instead of a label.
    nb_result = None
    print('\nNo text entered; nothing to classify.')


Enter a text to classify: ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time ! i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute .

Naive Bayes Classifier Prediction: Ham


In [9]:
# Function to classify text using Naive Bayes
def classify_text(input_text):
    """Predict the class label of one raw message with the fitted Naive Bayes model.

    NOTE(review): an earlier cell already defines ``classify_text`` with the
    same body; keeping a single definition would stop the two from drifting apart.

    The message is lower-cased, stripped of punctuation, split on whitespace,
    filtered against ``all_stop_words`` and Porter-stemmed, then vectorised
    with the already-fitted ``cv`` and passed to ``nb_multinomial``.

    Parameters
    ----------
    input_text : str
        Raw message text.

    Returns
    -------
    The single prediction produced by ``nb_multinomial.predict`` for this message.
    """
    stemmer = PorterStemmer()
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokens = input_text.lower().translate(strip_punctuation).split()

    stems = []
    for token in tokens:
        if token in all_stop_words:
            continue
        stems.append(stemmer.stem(token))
    cleaned = ' '.join(stems)

    # One-document batch: the vectoriser expects an iterable of documents.
    features = cv.transform([cleaned]).toarray()

    return nb_multinomial.predict(features)[0]

# User input
user_input = input("Enter a text to classify: ")

if user_input.strip():
    # Classify using Naive Bayes
    nb_result = classify_text(user_input)

    # Print result
    print(f'\nNaive Bayes Classifier Prediction: {"Spam" if nb_result == 1 else "Ham"}')
else:
    # A blank message has no tokens, so its count vector is all zeros and the
    # model could only echo its class prior. Report that instead of a label.
    nb_result = None
    print('\nNo text entered; nothing to classify.')


Enter a text to classify: abasements darer prudently fortuitous undergone lighthearted charm orinoco taster railroad affluent pornographic cuvier irvin parkhouse blameworthy chlorophyll robed diagrammatic fogarty clears bayda inconveniencing managing represented smartness hashish academies shareholders unload badness danielson pure caffein spaniard chargeable levin

Naive Bayes Classifier Prediction: Spam
