In [1]:
import numpy as np
import pandas as pd
import operator
import os,sys
from sklearn import model_selection
from sklearn import datasets
import re,string

In [2]:
#Common English stop words (lower-case, contractions included).
#Any token found in this list is left out when the vocabulary is built.
stop_word=["a","about","above","after","again","against","all","am","an","and","any","are","as","at","be","because","been","before","being","below","between","both","but",
"by","could","did","do","does","doing","down","during","each","few","for","from","further","had","has","have","having","he","he'd","he'll","he's","her",
"here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","it","it's","its","itself","let's","me",
"more","most","my","myself","nor","of","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","she",
"she'd","she'll","she's","should","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's",
"these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","we","we'd",
"we'll","we're","we've","were","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with",
"would","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]

In [3]:
#X is a list of (document name, document text) tuples, one entry per document.
#Y is the parallel list of categories: the name of the folder each document was read from.
#The dataset location can be overridden with the NEWSGROUPS_DIR environment variable;
#when it is not set, the original hard-coded location is used.
DATA_DIR = os.environ.get("NEWSGROUPS_DIR","/Users/neelamagarwal/desktop/Naive Bayes/20_newsgroups")
X = []
Y = []
#os.listdir returns entries in arbitrary order; sorting keeps the order of X/Y (and therefore
#the later random_state-seeded split) the same on every machine
for category in sorted(os.listdir(DATA_DIR)):
    category_dir = os.path.join(DATA_DIR,category)
    #stray files next to the category folders (e.g. .DS_Store) are not categories
    if not os.path.isdir(category_dir):
        continue
    for document in sorted(os.listdir(category_dir)):
        document_path = os.path.join(category_dir,document)
        #only regular files can be read as documents
        if not os.path.isfile(document_path):
            continue
        #errors="ignore" drops bytes that are not valid utf-8 instead of raising
        with open(document_path,"r",encoding = "utf-8",errors = "ignore") as f:
            X.append((document,f.read()))
            Y.append(category)

In [4]:
#Hold out part of the documents for testing; random_state is fixed so the split is repeatable
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state = 1)

In [5]:
#Making Dictionary of words with their corresponding frequency over the training documents
dic={}
#a set gives constant-time membership tests; the list would be scanned once for every token
stop_word_set=set(stop_word)
for document in x_train:
    #[1] is the text of the document, [0] is its name
    text=document[1].lower()
    #splitting the text into words on runs of non-word characters
    for s in re.split(r'\W+',text):
        #skip words of length 2 or less, tokens that are not purely alphabetic
        #(numbers, alpha-numerics, the empty strings re.split can leave at the ends)
        #and stop words
        if len(s)<=2 or not s.isalpha() or s in stop_word_set:
            continue
        dic[s]=dic.get(s,0)+1

In [6]:
#List of (word, frequency) pairs ordered from the most frequent word to the least frequent one
sorted_dic = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)

In [7]:
#taking the top NUM_FEATURES words with max frequency as our features
NUM_FEATURES=2000
#slicing (instead of indexing 0..NUM_FEATURES-1) also works when the vocabulary is smaller than NUM_FEATURES
features=[word for word,frequency in sorted_dic[:NUM_FEATURES]]

In [9]:
#Making x_train dataset
#No. of rows is equivalent to rows in x_train, and column is equal to length of features
x_train_dataset=np.zeros([len(x_train),len(features)],int)
#word -> column lookup built once; `j in features` followed by features.index(j)
#scanned the whole feature list twice for every token
feature_column={feature:column for column,feature in enumerate(features)}
for i in range(len(x_train)):
    words=x_train[i][1].lower()
    #Iterating over each word
    for j in re.split(r'\W+',words):
        column=feature_column.get(j)
        #We will add the frequency corresponding to that word only which is in our feature list
        if column is not None:
            x_train_dataset[i,column]+=1

In [10]:
#Making x_test dataset
#No. of rows is equivalent to rows in x_test, and column is equal to length of feature list
x_test_dataset=np.zeros([len(x_test),len(features)],int)
#word -> column lookup built once; `j in features` followed by features.index(j)
#scanned the whole feature list twice for every token
feature_column={feature:column for column,feature in enumerate(features)}
for i in range(len(x_test)):
    words=x_test[i][1].lower()
    #Iterating over each word
    for j in re.split(r'\W+',words):
        column=feature_column.get(j)
        #We will add the frequency corresponding to that word only which is in our feature list
        if column is not None:
            x_test_dataset[i,column]+=1

In [11]:
# Now that our dataset is ready,
# we will create the Naive Bayes model

In [24]:
#Making dictionary for implementing Naive Bayes
def fit(x_train_dataset,y_train,feature_names=None):
    """Count, per class, how often every feature word occurs in the training data.

    Parameters
    ----------
    x_train_dataset : 2-D numpy array
        One row per document, one column per feature; entries are word counts.
    y_train : sequence
        Class label of every row of ``x_train_dataset``.
    feature_names : sequence of str, optional
        Name of every column, in column order. When omitted, the notebook-level
        ``features`` list is used (the original behaviour).

    Returns
    -------
    dict
        ``count["total_doc"]`` is the number of training documents. For every
        class label ``c``, ``count[c]`` maps each feature name to its summed
        count over the documents of that class and additionally holds
        ``"word_in_class"`` (sum of all those counts) and ``"length"`` (number
        of documents of that class).

    NOTE(review): the two bookkeeping keys share the dictionary with the feature
    names, so a feature literally called "length" or "word_in_class" would be
    overwritten by the bookkeeping value - confirm neither is in the vocabulary.
    """
    if feature_names is None:
        feature_names=features
    count={}
    y_train=np.array(y_train)
    #Total no. of document is calculated
    count["total_doc"]=len(y_train)
    for i in set(y_train):
        #selecting the rows of x_train whose label is the current class
        x_train_with_i=x_train_dataset[y_train==i]
        #one pass over the rows gives the total of every column; looking each
        #column up with features.index() rescanned the feature list per feature
        column_totals=x_train_with_i.sum(axis=0)
        #features are inserted first, in column order; probability() relies on
        #the bookkeeping keys coming after them
        count[i]={feature:column_totals[k] for k,feature in enumerate(feature_names)}
        #Total word in that class
        count[i]["word_in_class"]=column_totals[:len(feature_names)].sum()
        #Number of training documents belonging to that class
        count[i]["length"]=x_train_with_i.shape[0]
    return count

In [13]:
def probability(x_test,dic,classes):
    """Return the log-score of one document for one class.

    Parameters
    ----------
    x_test : sequence of numbers
        Feature counts of a single document, in the same order in which the
        features were inserted into ``dic[classes]``.
    dic : dict
        Model with a ``"total_doc"`` entry and, per class, the feature counts
        plus the ``"word_in_class"`` and ``"length"`` entries.
    classes : hashable
        The single class label to score (despite the plural name).

    Returns
    -------
    float
        log prior of the class plus, for every feature whose count in the
        document is non-zero, the Laplace-smoothed log-probability of that
        feature in the class. Only whether the count is zero is tested; the
        size of a non-zero count does not change the result.
    """
    class_counts=dic[classes]
    #log prior: share of the training documents that belong to this class
    prob=np.log(class_counts["length"])-np.log(dic["total_doc"])
    feature=list(class_counts.keys())
    #-2 is done because "length" and "word_in_class" are stored next to the features
    n_features=len(feature)-2
    #Laplace correction: the denominator is identical for every feature, so it
    #(and its logarithm) is computed once instead of once per iteration
    log_den=np.log(class_counts["word_in_class"]+n_features)
    for j in range(n_features):
        #If frequency is 0, the feature contributes nothing
        if x_test[j]==0:
            continue
        #+1 is the numerator part of the Laplace correction
        prob+=np.log(class_counts[feature[j]]+1)-log_den
    return prob

In [14]:
#Best_class or probable answer will be returned from here
def predict_for_single(x_test,dic):
    """Return the class label with the highest log-score for one document.

    Parameters
    ----------
    x_test : sequence of numbers
        Feature counts of a single document.
    dic : dict
        Model dictionary; every key except ``"total_doc"`` is treated as a
        class label.

    Returns
    -------
    The label whose ``probability(x_test, dic, label)`` is largest. On a tie
    the label that comes first in ``dic`` wins.

    Raises
    ------
    ValueError
        If ``dic`` contains no class labels (previously this surfaced as an
        UnboundLocalError on the return statement).
    """
    #"total_doc" is bookkeeping, not a class
    labels=[label for label in dic if label!="total_doc"]
    if not labels:
        raise ValueError("model dictionary contains no classes")
    #max keeps the first of several equal maxima, same as the strict '>' comparison
    return max(labels,key=lambda label:probability(x_test,dic,label))

In [26]:
def predict(x_test,dic):
    """Return a list with the predicted class of every row of ``x_test``.

    ``x_test`` is iterated row by row and each row is classified with
    ``predict_for_single`` using the model dictionary ``dic``.
    """
    return [predict_for_single(row,dic) for row in x_test]

In [16]:
def score(y_test,y_pred):
    """Return the fraction of predictions equal to the true label at the same position.

    Positions are taken from ``y_pred``; ``y_test`` is indexed with the same
    positions.
    """
    total=len(y_pred)
    hits=sum(1 for i in range(total) if y_pred[i]==y_test[i])
    return hits/total

In [17]:
# Baseline: predictions from scikit-learn's inbuilt multinomial Naive Bayes,
# trained on the same training matrix and labels
from sklearn.naive_bayes import MultinomialNB
clf1 = MultinomialNB().fit(x_train_dataset,y_train)
y_pred_inbuilt = clf1.predict(x_test_dataset)

In [20]:
from sklearn.metrics import confusion_matrix , classification_report

In [34]:
#Accuracy, confusion matrix and per-class report of the inbuilt naive bayes model
inbuilt_accuracy=clf1.score(x_test_dataset,y_test)
print("Score on testing data:",inbuilt_accuracy)
inbuilt_confusion=confusion_matrix(y_test,y_pred_inbuilt)
print(inbuilt_confusion)
inbuilt_report=classification_report(y_test,y_pred_inbuilt)
print(inbuilt_report)

Score on testing data: 0.8554
[[202   0   0   0   0   0   2   0   3   0   0   1   1   0   1   2   0   1
    0  47]
 [  0 170  12  18  12   7  12   4   0   0   0   0   2   1   1   0   0   0
    0   0]
 [  0   3 212  23   6  11   9   1   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   5   6 180  38   0   5   1   1   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0  21 225   0   5   0   0   0   0   0   2   0   0   0   0   0
    0   0]
 [  0  17  31   5   4 181   2   0   3   0   1   0   2   0   2   0   0   0
    0   0]
 [  0   0   0   2   2   0 217   3   1   0   0   0   3   0   0   0   0   0
    0   0]
 [  0   1   0   0   2   0  10 225   7   0   1   0   1   0   0   0   1   0
    0   0]
 [  0   0   0   0   0   0   7   5 233   1   0   0   1   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   3   3   5 244  11   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   2   3   1  12 234   0   0   0   0   0   0   0
    0   0]
 [  1   4   0   0   0   1   1   0  

In [29]:
# Now prediction by our model
# fit() builds the per-class word-count dictionary from the training matrix and labels
dictionary = fit(x_train_dataset,y_train)
# predict() returns one predicted category for every row of the test matrix
y_pred_self = predict(x_test_dataset,dictionary)

In [33]:
# Accuracy, confusion matrix and per-class report of our model
self_accuracy=score(y_test,y_pred_self)
print("Score on testing data:",self_accuracy)
self_confusion=confusion_matrix(y_test,y_pred_self)
print(self_confusion)
self_report=classification_report(y_test,y_pred_self)
print(self_report)

Score on testing data: 0.8742
[[220   0   0   0   0   0   0   0   1   0   0   1   0   1   1   0   0   1
    0  35]
 [  0 194   7  13   5   4  11   1   0   0   0   0   0   4   0   0   0   0
    0   0]
 [  0   7 192  31   5  18   6   0   0   0   0   2   1   0   0   0   0   0
    3   0]
 [  0   5   2 192  31   0   5   0   0   0   0   0   1   0   0   0   0   0
    0   0]
 [  0   1   0  12 236   0   2   0   0   0   0   0   2   0   0   0   0   0
    0   0]
 [  0  20  25   4   1 193   2   0   1   0   0   0   1   1   0   0   0   0
    0   0]
 [  0   0   0   2   1   0 214   5   0   0   0   0   6   0   0   0   0   0
    0   0]
 [  0   1   0   0   2   0   9 231   2   0   0   0   2   0   0   0   1   0
    0   0]
 [  0   0   0   0   0   0   5   3 239   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   1   0   1 256   8   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   3   1   0   5 243   0   0   0   0   0   0   0
    0   0]
 [  0   3   1   0   0   0   0   1  

### Score of our model = 0.8742
### Score of the inbuilt model = 0.8554

# So our model is doing pretty well
