## Using Naive Bayes Classifier

In [3]:
import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Location of the spam dataset. The original absolute path remains the default,
# but it can be overridden without editing the notebook by setting the
# SPAM_CSV_PATH environment variable before starting the kernel.
SPAM_CSV_PATH = os.environ.get("SPAM_CSV_PATH", "E:\\spam.csv")

# Load the raw messages and preview the first rows.
df = pd.read_csv(SPAM_CSV_PATH)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Replace the text labels in 'Category' with integer codes. The fitted encoder
# is kept in `le_category` so the codes can be mapped back to labels later.
le_category = LabelEncoder()
category_codes = le_category.fit_transform(df['Category'])
df['Category'] = category_codes
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Hold out 20% of the messages for evaluation.
# - random_state pins the shuffle so the split, and therefore every downstream
#   result, is reproducible after a kernel restart.
# - stratify keeps the proportion of each label the same in the train and test
#   subsets.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.Message,
    df.Category,
    test_size=0.2,
    random_state=42,
    stratify=df.Category,
)

In [7]:
# Learn the vocabulary from the training messages and turn each message into a
# row of token counts.
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train.values)

# Preview the first three rows. Slice before densifying so only those three
# rows are converted to a dense array, not the entire training matrix.
X_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
# Train a multinomial Naive Bayes classifier on the token-count matrix.
# fit() returns the estimator itself, so it can be bound in one statement;
# the bare name on the last line displays the fitted estimator.
model = MultinomialNB().fit(X_train_count, Y_train)
model

MultinomialNB()

In [9]:
# Two hand-picked messages used as a quick sanity check of the trained model.
promo_message = "Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out!"
casual_message = "Yeah hopefully, if tyler can't do it I could maybe ask around a bit"
emails = [promo_message, casual_message]

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# new messages are encoded with the vocabulary the model was trained on.
emails_count = cv.transform(emails)
model.predict(emails_count)

array([1, 0])

In [10]:
# Encode the held-out messages with the fitted vectorizer, then report the
# model's score on them.
X_test_count = cv.transform(X_test)
model.score(X=X_test_count, y=Y_test)

0.989237668161435

In [11]:
# Rather than vectorizing the training and test sets by hand and passing the
# results to the model, chain the vectorizer and the classifier in a Pipeline.
# The pipeline then accepts the raw message text directly.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)
clf.fit(X_train, Y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [12]:
# Score the pipeline on the raw held-out messages; the vectorizing step runs
# inside the pipeline.
clf.score(X=X_test, y=Y_test)

0.989237668161435