In [11]:
import numpy as np
import pandas as pd
import os
import operator#Has predefined functions for sorting dictionaries
from sklearn.metrics import classification_report,confusion_matrix
from string import punctuation

In [12]:
# Root folder of the dataset; it holds one sub-folder per newsgroup.
directory = '20_newsgroups'

In [13]:
# Tokens that carry no signal for classification: header field names,
# very common filler words and one well-known junk token from the corpus.
ignore = [
    'subject:', 'from:', 'date:', 'newsgroups:', 'message-id:', 'lines:', 'path:', 'organization:',
    'would', 'writes:', 'references:', 'article', 'sender:', 'nntp-posting-host:', 'people',
    'university', 'think', 'xref:', 'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution:', 'first',
    'anyone', 'world', 'really', 'since', 'right', 'believe', 'still',
    "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'",
]
# Every single punctuation character is ignored as well.
ignore.extend(punctuation)

In [14]:
# Alphabetically ordered names of the entries (newsgroup folders) inside the dataset root.
all_folders = sorted(os.listdir(directory))
len(all_folders)

20

In [15]:
# Maps each folder name to the list of raw document texts found inside that folder.
data = {}
for folder in all_folders:
    folder_path = os.path.join(directory, folder)
    documents = []
    for file_name in os.listdir(folder_path):
        # latin-1 maps every byte to a character, so decoding never fails
        with open(os.path.join(folder_path, file_name), encoding='latin-1') as opened_file:
            documents.append(opened_file.read())
    data[folder] = documents

In [16]:
# Word -> number of occurrences over the whole corpus.
# Only lower-cased tokens of at least 5 characters that are not in `ignore` are counted.
vocabulary = {}
for folder in all_folders:
    for doc in data[folder]:
        for token in doc.split():
            token = token.lower()
            if len(token) >= 5 and token not in ignore:
                vocabulary[token] = vocabulary.get(token, 0) + 1
len(vocabulary)

390232

In [17]:
# (word, frequency) pairs ordered from the most to the least frequent word
sorted_vocabulary = sorted(
    vocabulary.items(),
    key=lambda word_and_count: word_and_count[1],
    reverse=True,
)

In [18]:
# The features are the 1000 most frequent vocabulary words (K=1000).
feature_list = [word for word, _ in sorted_vocabulary[:1000]]

In [None]:
# One label (the folder name) per document, in the same order the documents are stored in `data`;
# used as the target array when splitting into training and testing data.
news_groups = np.array([folder for folder in all_folders for _ in data[folder]])

In [None]:
# Document-term matrix: one row per document, one column per feature word,
# each cell holding how often that word occurs in that document.
#
# The counts are taken from the texts already loaded in `data`, walking the folders
# and documents in the same order the label array `news_groups` is built in, so
# row k of the matrix always belongs to label k. Every document gets its own row of
# counts (the file-reading step must happen once per document, not once per folder).
feature_index = {word: col for col, word in enumerate(feature_list)}  # O(1) word -> column lookup
n_docs = sum(len(data[folder]) for folder in all_folders)
counts = np.zeros((n_docs, len(feature_list)))
row = 0
for folder in all_folders:
    for doc in data[folder]:
        for word in doc.split():
            col = feature_index.get(word.lower())
            if col is not None:
                counts[row, col] += 1
        row += 1
# Build the frame once from the finished array instead of growing it row by row
# and writing cells through chained indexing.
df = pd.DataFrame(counts, columns=feature_list)
df

In [None]:
# Plain NumPy view of the document-term counts held in df
value = df.values

In [None]:
from sklearn.model_selection import train_test_split

# Split count rows and labels together; random_state=0 keeps the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    value,
    news_groups,
    random_state=0,
)

In [None]:
# scikit-learn's multinomial Naive Bayes, used as the reference implementation
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(x_train, y_train)

In [None]:
# Score of the scikit-learn Naive Bayes classifier on the held-out test split
sklearn_score = clf.score(x_test, y_test)
print(sklearn_score)

In [None]:
#Naive Bayes from scratch
def fit(x_train,y_train):
    """Collect the per-class word statistics used by the hand-written Naive Bayes.

    Parameters
    ----------
    x_train : 2-D array
        Column i holds the count of ``feature_list[i]`` for each training document.
    y_train : 1-D array
        Class label of every row of ``x_train``.

    Returns
    -------
    dict
        ``"total_data"`` maps to the number of training documents. Every class
        label maps to a nested dict containing, for each word of ``feature_list``,
        the summed count of that word over the class, plus two bookkeeping keys:
        ``"total_count"`` (sum of all feature-word counts in the class) and
        ``"doc_count"`` (number of training documents in the class, i.e. the
        quantity a class prior is computed from).
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    n_features = len(feature_list)
    result = {"total_data": len(y_train)}
    for label in set(y_train):
        row = (y_train == label)  # boolean mask selecting this class's documents
        # Sum every feature column once; the totals feed both the per-word
        # entries and the class-wide word total.
        word_counts = x_train[row][:, :n_features].sum(axis=0)
        stats = {feature_list[i]: word_counts[i] for i in range(n_features)}
        # Bookkeeping keys are written last, exactly like the per-word loop
        # followed by "total_count" did before.
        stats["total_count"] = word_counts.sum()
        stats["doc_count"] = int(row.sum())
        result[label] = stats
    return result

In [None]:
def probability(x,dictionary,this_key):
    """Return the unnormalised log-probability of class ``this_key`` for one document.

    Parameters
    ----------
    x : sequence of numbers
        ``x[i]`` is the count of ``feature_list[i]`` in the document; each count
        is truncated with ``int`` and counts that are not positive contribute nothing.
    dictionary : dict
        Fitted statistics: ``dictionary["total_data"]`` is the number of training
        documents and ``dictionary[this_key]`` holds the per-word counts of the
        class together with ``"total_count"``.
    this_key : hashable
        Class label to score.

    Returns
    -------
    float
        log prior + sum over words of count * log P(word | class), where the word
        probabilities use Laplace (add-one) smoothing.

    Notes
    -----
    A class prior is (documents in the class) / (all documents). When the class
    entry provides ``"doc_count"`` that value is used. When it does not, the
    function falls back to ``"total_count"`` (the class's word total), which is
    what this function historically used and which is not a true probability.
    """
    class_stats = dictionary[this_key]
    total_count = class_stats["total_count"]
    prior_count = class_stats.get("doc_count", total_count)
    output = np.log(prior_count) - np.log(dictionary["total_data"])
    # The smoothed denominator is the same for every word of the class.
    log_dem = np.log(total_count + len(feature_list))
    for i in range(len(feature_list)):
        occurrences = int(x[i])
        if occurrences > 0:
            # Laplace correction: add one to the word count (and |features| to the total).
            current_word_probability = np.log(class_stats[feature_list[i]] + 1) - log_dem
            # A word seen k times contributes k times its log-probability.
            output += occurrences * current_word_probability
    return output

In [None]:
def predictSingleClass(x,dictionary):
    """Return the class label whose log-probability for document ``x`` is highest.

    Every key of ``dictionary`` except ``"total_data"`` is treated as a class and
    scored with ``probability``. On ties the class encountered first wins. If
    there is no class at all, the placeholder ``-1000`` is returned.
    """
    best_class = -1000
    best_prob = None  # None means "no class scored yet"
    for label in dictionary.keys():
        if label == "total_data":
            continue
        score = probability(x, dictionary, label)
        # The first class is always accepted; later ones must be strictly better.
        if best_prob is None or score > best_prob:
            best_class, best_prob = label, score
    return best_class

In [None]:
def predict(X_test,dictionary):
    """Return the list of predicted class labels, one per row of ``X_test``.

    Each row is classified independently with ``predictSingleClass`` using the
    fitted statistics in ``dictionary``.
    """
    return [predictSingleClass(sample, dictionary) for sample in X_test]

In [None]:
# Per-class word statistics learned from the training split by the hand-written fit()
dictionary = fit(x_train, y_train)

In [None]:
# Labels predicted by the hand-written Naive Bayes for every test document
y_pred = predict(x_test, dictionary)

In [None]:
# scikit-learn metrics expect the ground truth first: metric(y_true, y_pred).
# With the arguments the other way round the confusion matrix comes out transposed
# and precision and recall are swapped in the report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))