### Load and clean dataset

In [18]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read spam.csv (a relative path, so it is resolved against the current
# working directory) into the DataFrame `df` that the following cells use.
df = pd.read_csv(filepath_or_buffer='spam.csv')

In [19]:
# Discard the three unnamed columns so only the columns needed downstream
# (v1 and v2) remain.
# errors="ignore" keeps this cell idempotent: `df` is rebound to the result, so
# on a second execution the columns are already gone and a plain drop would
# raise KeyError.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
# Preview the first 10 rows (last expression of the cell, so it is rendered).
df.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


### Split the data and vectorize the text

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable. Column v2 is the model input, column v1 the target.
text_train, text_test, y_train, y_test = train_test_split(
    df["v2"], df["v1"], test_size=0.2, random_state=1
)

# Fit the vocabulary on the training texts only, then encode both splits
# with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

### Train Naive Bayes

In [21]:
# Build and fit the multinomial Naive Bayes classifier in one step
# (fit() returns the estimator itself).
nb = MultinomialNB().fit(X_train, y_train)

# Predict on the held-out split and compute the evaluation metrics.
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the metrics.
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", report)


Accuracy:  0.9847533632286996
Confusion Matrix: 
 [[968   8]
 [  9 130]]
Classification Report: 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       976
        spam       0.94      0.94      0.94       139

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Receive input from user

In [42]:
# Prompt for a message to classify.
raw_message = input("Enter a message: ")

# Encode the message with the already-fitted vectorizer; transform() expects
# an iterable of documents, hence the one-element list.
user_input = vectorizer.transform([raw_message])

# Predicted class label for the message.
prediction = nb.predict(user_input)
print(prediction)

# Class probabilities for the message; columns are ordered as in nb.classes_.
prob = nb.predict_proba(user_input)
print(prob)

['ham']
[[0.6053041 0.3946959]]
