In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the labelled sentences; column names are supplied explicitly.
msg = pd.read_csv(
    'naivetext.csv',
    names=['message', 'label'],
)
msg.head()

Unnamed: 0,message,label
0,I love this sandwich,pos
1,This is an amazing place,pos
2,I feel very good about these beers,pos
3,This is my best work,pos
4,What an awesome view,pos


In [3]:
# (rows, columns) of the loaded frame
msg.shape

(18, 2)

In [4]:
# Encode the string labels as integers for the classifier: pos -> 1, neg -> 0.
msg['labelnum'] = msg['label'].map({'pos': 1, 'neg': 0})
# Series.map leaves NaN for any value missing from the mapping; fail here
# rather than letting NaN targets reach the model.
assert msg['labelnum'].notna().all(), "unexpected value(s) in 'label' column"

In [5]:
# Features are the raw sentences; targets are the numeric labels.
X = msg['message']
y = msg['labelnum']
y

0     1
1     1
2     1
3     1
4     1
5     0
6     0
7     0
8     0
9     0
10    1
11    0
12    1
13    0
14    1
15    0
16    1
17    0
Name: labelnum, dtype: int64

In [6]:
# Hold out 25% of the rows for evaluation. The seed is fixed so the split,
# and every metric computed from it, is reproducible across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [7]:
# CountVectorizer - to get the frequency of words
count_vect = CountVectorizer()

In [8]:
# Document-term matrix (DTM): fit the vectorizer on the training messages
# and encode them in the same call.
Xtrain_dtm = count_vect.fit_transform(Xtrain)

In [15]:
# Encode the held-out messages with the already-fitted vectorizer
# (transform only, no re-fitting on test data).
Xtest_dtm = count_vect.transform(Xtest)

In [16]:
print("Words present in text document")
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) is its replacement. It returns a NumPy array, so
# convert to a plain list to keep the printed form a list of strings.
print(count_vect.get_feature_names_out().tolist())

Words present in text document
['about', 'am', 'an', 'and', 'awesome', 'bad', 'beers', 'best', 'dance', 'do', 'enemy', 'feel', 'good', 'great', 'he', 'holiday', 'house', 'is', 'like', 'locality', 'love', 'my', 'not', 'of', 'place', 'restaurant', 'sandwich', 'sick', 'stay', 'stuff', 'sworn', 'that', 'these', 'this', 'tired', 'to', 'today', 'very', 'view', 'went', 'what', 'work']


In [18]:
# Show the training DTM as a table: one row per training message, one column
# per vocabulary word. get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
df = pd.DataFrame(
    Xtrain_dtm.toarray(),
    columns=count_vect.get_feature_names_out(),
)
df.head()

Unnamed: 0,about,am,an,and,awesome,bad,beers,best,dance,do,...,these,this,tired,to,today,very,view,went,what,work
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [19]:
# Multinomial Naive Bayes classifier, trained on the training DTM
mclf = MultinomialNB()
mclf = mclf.fit(Xtrain_dtm, ytrain)

In [21]:
# Predicted class for each held-out message
predicted = mclf.predict(Xtest_dtm)
predicted

array([0, 0, 1, 1, 1])

In [22]:
# Rows are the true classes, columns the predicted classes
print("Confusion Matrix")
print(metrics.confusion_matrix(y_true=ytest, y_pred=predicted))

Confusion Matrix
[[2 1]
 [0 2]]


In [23]:
# Fraction of held-out messages classified correctly
print("Accuracy:", metrics.accuracy_score(ytest, predicted))

Acuuracy: 0.8


In [24]:
# Precision for the positive class (label 1)
print("Precision:", metrics.precision_score(y_true=ytest, y_pred=predicted))

Precision: 0.6666666666666666


In [25]:
# Recall for the positive class (label 1)
print("Recall:", metrics.recall_score(y_true=ytest, y_pred=predicted))

Recall: 1.0


In [26]:
# A single unseen sentence to classify
newText = ['my boss is best']
newText

['my boss is best']

In [27]:
# Encode the new sentence with the already-fitted vectorizer
newText_dtm = count_vect.transform(newText)

In [28]:
# Predicted label for the new sentence
newText_predicted = mclf.predict(newText_dtm)
newText_predicted

array([1])