In [22]:
from nltk.corpus import stopwords
import os
import re
import nltk 
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
import sklearn
import pandas as pd
import time
import math

In [23]:
def get_filenames(rootDir):
# This function returns the filenames and categories of articles in the given directory

# categories will store the names of all type of articles
# documents_count stores the no. of documents in a particular category
# docslist stores the list of all documents
    categories =[]
    docslist = []
    dirnamelist = []

    for dirName, subdirList, fileList in os.walk(rootDir):

        if len(subdirList) >0:
            categories = subdirList

        if len(subdirList) == 0:
            docslist.append(fileList)
            dirnamelist.append(dirName)

    return categories,docslist

In [24]:
def get_clean_data(docslist,rootDir,categories):
# This function reads all the documents in the given directory and cleans the text data and generates tokens
# vocab_list stores the vocabulary of our dtaset    
# cateogory_docs stores the words per document per category
    vocab_list = []
    category_docs = []

    for index,dlist in enumerate(docslist):
        
        #docs_words stores the filtered words of documents in a category
        
        docs_words = []
        for doc in dlist:
            
            
            f=open('{0}/{1}/{2}'.format(rootDir,categories[index],doc),'r')

            wlist = f.read()
            
            #using regular expression to remove special and numeric characters from text document
            wlist = re.sub('[^A-Za-z]+', ' ', wlist)
            wlist = re.sub(r'\b\w{1,2}\b', '', wlist)
            wlist = re.sub(r'\w*\d\w*', '', wlist).strip()
            word_list = re.findall(r"[\w']+", wlist)
            tokens = [token.lower() for token in word_list]
            
            #removing stopwords from the tokens
            filtered_words = [word for word in tokens if word not in stopwords.words('english')]
            
            #removing words with length less than 3.
            for lword in filtered_words:
                if len(lword) < 3:
                    filtered_words.remove(lword)

            
            
            #appending unique words from the filtered list to our vocabulary list
            for uword in filtered_words:
                if uword not in vocab_list:
                    vocab_list.append(uword)

            docs_words.append(filtered_words)
        category_docs.append(docs_words)
        

    return vocab_list,category_docs



In [25]:
def frequency_dataset(vocab_list,categories,category_docs):
    # this functions returns a dataframe with frequency of each word in the vocabulary in all the documents
    # data stores the frequency data
    # target stores the category of each document
    # feature_frequency is a dictionary which stores the total frequency of every word in the vocabulary
    data=[]
    target=[]
    feature_frequency={}
    count=0
    for category_index in range(0,len(category_docs)):
        
        
        for doc in range(0,len(category_docs[category_index])):
            target.append(categories[category_index])
            data.append(list(np.zeros(len(vocab_list),int)))
            
            for word in category_docs[category_index][doc]:
                
                index=vocab_list.index(word)
                data[count][index]+=1
                if word in feature_frequency.keys():
                    feature_frequency[word]+=1
                else:
                    feature_frequency[word]=1
            count+=1
    data_df=pd.DataFrame(data)
    data_df.columns=vocab_list
    return data_df,target,feature_frequency  

In [26]:
def eliminate_features(data_df,feature_frequency,vocab_list):
    #this function eliminates words and their frequency data from the vocabulary_list and data_df with small overall frequencies
    # and takes top 1000 words as final dataset
    new=sorted(feature_frequency.items(), key=lambda t:t[1], reverse=True)
    feature_frequency=new[:1000]
    final_data=pd.DataFrame()
    final_vocab=[]
    for i in dict(feature_frequency).keys():
        final_data[i]=data_df[i]
        final_vocab.append(i)
    return final_data,final_vocab
    

In [27]:
def fit(X_train,Y_train):#data to be passed as dataframe
    # this function returns a dictionary with category-wise the overall frequency of each word in vocabulary
    result={}
    classes=set(Y_train[0])
    for class_ in classes:
        result[class_]={}
        result["total-docs"]=len(Y_train[0])
        for feature in X_train.columns:
            result[class_][feature]=X_train[feature][Y_train[0]==class_].sum()
        result[class_]["total_words"]=sum(result[class_].values())
        result[class_]["class_frequency"]=len(Y_train[Y_train[0]==class_])
    return result

In [28]:
def probability(dictionary, x, current_class):
    output=np.log(dictionary[current_class]["class_frequency"])-np.log(dictionary["total-docs"])
    
    num_features = len(list(dictionary[current_class].keys())[:1000])
    for j in range(0, num_features):
        xj = x[j]
        if xj!=0:
            count_current_class_with_value_xj = dictionary[current_class][list((dictionary[current_class].keys()))[j]] + 1
            count_current_class = dictionary[current_class]["total_words"] + len(dictionary[current_class].keys())
            current_xj_probablity = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
            output = output + current_xj_probablity
    return output

In [29]:
def predictSinglePoint(dictionary, x):
    #this function predicts the category of a single document
    #best_prob is the maximum prbability out of all the probabilities of the classes to be the category of the document.
    #best_class is the best prediction for the document(decided on the basis of best_probability)
    
    classes = list(dictionary.keys())
    classes.remove('total-docs')
    best_prob = -math.inf
    best_class = -1
    first_run = True
    for current_class in classes:
        #p_current_class stores the probability of current class
        p_current_class = probability(dictionary, x, current_class)
        
        if (first_run or p_current_class > best_prob):
            best_prob = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [30]:
def predict(dictionary,X_test):
    #this functions predicts the category of the documents passed in the dataset
    y_pred=[]
    x_t=np.array(X_test)
    i=0
    for x in x_t:
        x_class=predictSinglePoint(dictionary, x)
        y_pred.append(x_class)
        i+=1
    return y_pred

In [31]:
start=time.time()
categories ,docslist=get_filenames(r"C:\Users\nEW u\ML_A\Machine Learning\Naive Bayes\Text Classification\mini_newsgroups")
end=time.time()
print("time taken:  ",end-start)

time taken:   0.5505285263061523


In [32]:
start=time.time()
vocab_list,category_docs=get_clean_data(docslist,r"C:\Users\nEW u\ML_A\Machine Learning\Naive Bayes\Text Classification\mini_newsgroups",categories)
end=time.time()
print("time taken:  ",end-start)

time taken:   393.06668996810913


In [33]:
start=time.time()
data_df,target,feature_frequency=frequency_dataset(vocab_list,categories,category_docs)
end=time.time()
print("time taken:  ",end-start)

time taken:   578.745813369751


In [34]:
start=time.time()
final_data,final_vocab=eliminate_features(data_df,feature_frequency,vocab_list)
end=time.time()
print("time taken:  ",end-start)

time taken:   0.6435110569000244


In [35]:
from sklearn import model_selection
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(final_data,pd.DataFrame(target),test_size=0.2,random_state=0)

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
#Predicting the test data using sklearn Multinomial Naive Base
clf=MultinomialNB()
clf.fit(X_train,Y_train[0])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
Y_pred=clf.predict(X_test)

In [39]:
#calculating the score
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,Y_pred)

0.7725

In [40]:
#fiting the dataset on the implemented Multinomial Naive Baeyes function
start=time.time()
dictionary=fit(X_train,Y_train)
end=time.time()
print("time taken: ",end-start)

time taken:  13.503151416778564


In [41]:
#predicting for X_test
start=time.time()
y_pred=predict(dictionary,X_test)
end=time.time()
print("time taken: ",end-start)

time taken:  12.204582214355469


In [42]:
#score for prediction by implemented MultinomialNB
accuracy_score(Y_test,y_pred)

0.7725

# COMPARISON:
The accuracy score of the in-built sklearn algorithm and my implementation is exactly the same, i.e, 0.7725