In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split as TTS
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.naive_bayes import MultinomialNB as MNB
from numpy import isreal
import string
from sklearn import metrics
data = pd.read_csv("../Documents/Datasets/bbc-text.csv")
len(data)

2225

In [2]:
classes = list(set(data.category))
print("Classes in the dataset are : ",classes)

Classes in the dataset are :  ['tech', 'entertainment', 'politics', 'business', 'sport']


In [3]:
classdict = {}
revclassdict = {}
for _class in classes:
    classdict[_class] = classes.index(_class)

revclassdict = {val : key for key, val in classdict.items()}
print(classdict)
data['numcategory'] = data['category'].map(classdict)
data.head(20)

{'tech': 0, 'entertainment': 1, 'politics': 2, 'business': 3, 'sport': 4}


Unnamed: 0,category,text,numcategory
0,tech,tv future in the hands of viewers with home th...,0
1,business,worldcom boss left books alone former worldc...,3
2,sport,tigers wary of farrell gamble leicester say ...,4
3,sport,yeading face newcastle in fa cup premiership s...,4
4,entertainment,ocean s twelve raids box office ocean s twelve...,1
5,politics,howard hits back at mongrel jibe michael howar...,2
6,politics,blair prepares to name poll date tony blair is...,2
7,sport,henman hopes ended in dubai third seed tim hen...,4
8,sport,wilkinson fit to face edinburgh england captai...,4
9,entertainment,last star wars not for children the sixth an...,1


In [4]:
table1 = str.maketrans("", "", string.digits)
table2 = str.maketrans("", "", string.punctuation)
data.text = data.text.apply(lambda x: x.translate(table1))
data.text = data.text.apply(lambda x: x.translate(table2))

In [5]:
X_train, X_test, y_train, y_test = TTS(data.text, data.numcategory, test_size=0.285, random_state=69)
print('The train and test sizes are', len(X_train), 'and', len(X_test), 'respectively')

The train and test sizes are 1590 and 635 respectively


In [6]:
DocVector = CV()
X_train_dtm = DocVector.fit_transform(X_train)
X_test_dtm = DocVector.transform(X_test)
print('Vocabulary created consisting of :', len(DocVector.get_feature_names()), "words\n\n", DocVector.get_feature_names())

Vocabulary created consisting of : 26180 words



In [7]:
type(DocVector)

sklearn.feature_extraction.text.CountVectorizer

In [8]:
sparseMat = pd.DataFrame(X_train_dtm.toarray(), columns=DocVector.get_feature_names())
print('Bag of words representation : \n', sparseMat)

Bag of words representation : 
       aa  aaa  aaas  aac  aadc  aaliyah  aaltra  aamir  aan  aaron  \
0      0    0     0    0     0        0       0      0    0      0   
1      0    0     0    0     0        0       0      0    0      0   
2      0    0     0    0     0        0       0      0    0      0   
3      0    0     0    0     0        0       0      0    0      0   
4      0    0     0    0     0        0       0      0    0      0   
5      0    0     0    0     0        0       0      0    0      0   
6      0    0     0    0     0        0       0      0    0      0   
7      0    0     0    0     0        0       0      0    0      0   
8      0    0     0    0     0        0       0      0    0      0   
9      0    0     0    0     0        0       0      0    0      0   
10     0    0     0    0     0        0       0      0    0      0   
11     0    0     0    0     0        0       0      0    0      0   
12     0    0     0    0     0        0       0      0    

In [9]:
MNBayes = MNB()
MNBayes = MNBayes.fit(X_train_dtm, y_train)

In [10]:
y_predict = MNBayes.predict(X_test_dtm)

In [11]:
print("Accuracy of algorithm = ", metrics.accuracy_score(y_test, y_predict)*100, "%")
print("Confusion matrix : ")
print(metrics.confusion_matrix(y_test, y_predict))
print("\n\nOther accuracy measures : ")
print(metrics.classification_report(y_test, y_predict))

Accuracy of algorithm =  96.69291338582677 %
Confusion matrix : 
[[117   1   1   0   0]
 [  4 101   4   0   0]
 [  0   0 120   0   0]
 [  5   1   5 135   0]
 [  0   0   0   0 141]]


Other accuracy measures : 
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       119
           1       0.98      0.93      0.95       109
           2       0.92      1.00      0.96       120
           3       1.00      0.92      0.96       146
           4       1.00      1.00      1.00       141

   micro avg       0.97      0.97      0.97       635
   macro avg       0.97      0.97      0.97       635
weighted avg       0.97      0.97      0.97       635



In [12]:
newdoc = pd.read_csv('../Documents/Datasets/sampledocs.txt', names=['text'])
newdoc

Unnamed: 0,text
0,chelsea sack mutu chelsea have sacked adrian ...
1,set your television to wow television started...
2,russian film wins bbc world prize russian dra...
3,markets signal brazilian recovery the brazili...
4,iraqis win death test case probe the family o...
5,apple ipod family expands market apple has ex...


In [13]:
newdoc_dtm = DocVector.transform(newdoc.text)
newpredict = MNBayes.predict(newdoc_dtm)
preddoc = list()
for i in range(len(newdoc)):
    preddoc.append([newdoc.text[i], revclassdict[newpredict[i]]])
pd.DataFrame(preddoc, columns=['text', 'predicted class'])

Unnamed: 0,text,predicted class
0,chelsea sack mutu chelsea have sacked adrian ...,sport
1,set your television to wow television started...,tech
2,russian film wins bbc world prize russian dra...,entertainment
3,markets signal brazilian recovery the brazili...,business
4,iraqis win death test case probe the family o...,politics
5,apple ipod family expands market apple has ex...,tech


In [2]:
117 / (117 + 2)

0.9831932773109243

In [3]:
 (117 + 101 + 120 + 135 + 141) / 635

0.9669291338582677