In [0]:
#importing modules
import os
import codecs
import numpy as np
import string
import pandas as pd

# **Data Preprocessing**

In [0]:
#downloading and extracting the files on colab server
import urllib.request
import tarfile

# Fetch the 20 Newsgroups archive into the working directory as a.tar.gz.
urllib.request.urlretrieve ("https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz", "a.tar.gz")

# The context manager closes the archive even if extraction fails part-way.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a source you trust.
with tarfile.open("a.tar.gz") as tar:
    tar.extractall()

In [3]:
#making a list of all the file paths and their corresponding class
# Each entry of f_paths is a (file path, class index) tuple, where the class
# index is the position of the file's folder in folderlist.
f_paths=[]
path="20_newsgroups"
# os.listdir returns entries in arbitrary order, so sort to make the
# folder -> class index mapping the same on every run and machine.
# Keeping only directories also drops stray files such as .DS_Store.
folderlist=sorted(
    folder for folder in os.listdir(path)
    if os.path.isdir(os.path.join(path, folder))
)
for class_index, folder in enumerate(folderlist):
    # sorted so that the order of f_paths is deterministic as well
    filelist=sorted(os.listdir(path+'/'+folder))
    for file in filelist:
        f_paths.append((path+'/'+folder+'/'+file,class_index))
len(f_paths)

19997

In [4]:
#splitting the list of paths into training and testing data
from sklearn import model_selection
# A fixed random_state makes the shuffle, and therefore every metric reported
# further down, reproducible across kernel restarts.
x_train,x_test=model_selection.train_test_split(f_paths,random_state=42)
len(x_train),len(x_test)

(14997, 5000)

In [5]:
# Separate the (path, class) pairs produced by the split into parallel
# containers: X_* hold the file paths, Y_* hold the class indices.
X_train = [entry[0] for entry in x_train]
X_test = [entry[0] for entry in x_test]

# The labels are stored as 1 dimensional np arrays
Y_train = np.array([entry[1] for entry in x_train]).ravel()
Y_test = np.array([entry[1] for entry in x_test]).ravel()

# shapes of the two label arrays
Y_train.shape, Y_test.shape

((14997,), (5000,))

In [6]:
# Build the set of English stop words from NLTK's "stopwords" corpus.
import nltk
# Fetches the corpus data; the recorded cell output shows it reporting
# "already up-to-date" when the package is present.
nltk.download('stopwords')
from nltk.corpus import stopwords
# A set, so that duplicates are dropped.
stop=set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# stop_words = the NLTK English stop words followed by every distinct
# punctuation character from string.punctuation
stop_words = [*stop, *set(string.punctuation)]
len(stop_words)

211

In [0]:
#making vocabulary from the files in X_train i.e. training data
# vocab maps each lower-cased token (longer than 2 characters and not a stop
# word) to the number of times it occurs across all training files.
vocab={}
count =0
# Hash-based lookup: stop_words is a list, and testing membership against it
# for every token would scan the whole list each time.
stop_lookup=frozenset(stop_words)
for filename in X_train:
    count+=1
    # the with-block closes the file even if reading raises
    with open(filename,'r',errors='ignore') as f:
        record=f.read()
    for word in record.split():
        # the length filter is applied to the token as written in the file
        if len(word)>2:
            token=word.lower()
            if token not in stop_lookup:
                vocab[token]=vocab.get(token,0)+1

In [9]:
#length of the vocabulary, i.e. the number of distinct tokens kept from the training files
len(vocab)

354297

In [0]:
# sorted_vocab: list of (word, frequency) pairs, most frequent word first
import operator
sorted_vocab = list(vocab.items())
# order in place by the frequency stored at position 1 of each pair, descending
sorted_vocab.sort(key=operator.itemgetter(1), reverse=True)

In [0]:
#making the list feature_names containing the most frequent words
# The cut-off is the frequency of the entry at index n_top_words of the
# frequency-sorted vocabulary; every word at least that frequent is kept, so
# words tied with the cut-off are included and the list can be slightly
# longer than n_top_words.
n_top_words = 2000
if len(sorted_vocab) > n_top_words:
    min_frequency = sorted_vocab[n_top_words][1]
    feature_names = [word for word, frequency in sorted_vocab if frequency >= min_frequency]
else:
    # fewer words than the cut-off index: keep the whole vocabulary instead of
    # failing with an IndexError
    feature_names = [word for word, frequency in sorted_vocab]

In [12]:
#length of feature_names, i.e. the number of columns each file's feature vector will have
print(len(feature_names))

2008


In [0]:
# Two separate empty dataframes, one for training and one for testing, whose
# columns are the feature names i.e. the words
df_train, df_test = (pd.DataFrame(columns=feature_names) for _split in range(2))

In [14]:
def _count_feature_words(file_paths, feature_index):
    """Return a (number of files, number of features) array of word counts.

    file_paths    -- sequence of paths of the text files to read
    feature_index -- dict mapping each feature word to its column number

    Row r holds, for the r-th file, how many times each feature word occurs
    in it (tokens are split on whitespace and lower-cased before lookup).
    """
    counts = np.zeros((len(file_paths), len(feature_index)))
    for row, filename in enumerate(file_paths):
        # the with-block closes the file even if reading raises
        with open(filename, 'r', errors='ignore') as f:
            record = f.read()
        for word in record.split():
            column = feature_index.get(word.lower())
            if column is not None:
                counts[row, column] += 1
    return counts


# Counts are accumulated in a NumPy array and wrapped in a DataFrame once per
# split. Incrementing through chained indexing (df[col][row] += 1) is not
# reliable: with pandas Copy-on-Write enabled it never updates the frame.
# Appending one row at a time with .loc is also very slow.
feature_index = {name: column for column, name in enumerate(feature_names)}

#transforming each file in X_train into a row of df_train: columns are the feature names, values are the frequency of that word in the file
df_train = pd.DataFrame(_count_feature_words(X_train, feature_index), columns=feature_names)

#transforming each file in X_test into a row of df_test in the same way
df_test = pd.DataFrame(_count_feature_words(X_test, feature_index), columns=feature_names)

#printing the number of files transformed in training and testing data
count_train, count_test = len(df_train), len(df_test)
print(count_train,count_test)

14997 5000


In [0]:
#putting the values of the dataframes into X_train and X_test
# NOTE: this rebinds X_train / X_test. Up to this point they were lists of
# file paths; from here on they are the arrays taken from df_train / df_test.
# Cells above that open the files named in X_train / X_test therefore cannot
# be re-run after this cell without first re-running the cell that builds
# the path lists.
X_train=df_train.values
X_test=df_test.values

# **Using the inbuilt Multinomial Naive Bayes classifier from sklearn**

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix

# create the classifier and fit it on the training data in one step
# (fit returns the fitted estimator)
clf = MultinomialNB().fit(X_train, Y_train)

# predicted classes for the testing data
Y_pred = clf.predict(X_test)

# per-class precision / recall / f1 on the testing data
print(classification_report(Y_test, Y_pred))

# mean accuracy on the testing data
print("Testing: ", clf.score(X_test, Y_test))

              precision    recall  f1-score   support

           0       0.90      0.80      0.85       243
           1       0.79      0.71      0.75       256
           2       0.95      0.87      0.90       260
           3       0.95      0.99      0.97       231
           4       0.95      0.75      0.84       253
           5       0.94      0.95      0.95       233
           6       0.65      0.51      0.57       252
           7       0.82      0.89      0.85       225
           8       0.87      0.80      0.84       241
           9       0.81      0.91      0.86       251
          10       0.83      0.80      0.81       245
          11       0.98      0.94      0.96       261
          12       0.75      0.79      0.77       268
          13       0.70      0.85      0.77       254
          14       0.83      0.81      0.82       242
          15       0.90      0.84      0.87       275
          16       0.71      0.54      0.61       235
          17       0.87    

# **Self implemented Multinomial Naive Bayes**

In [0]:
#makes the nested dictionary required for NB using the training data
def fit(X,Y):
    """Collect the counts needed by the multinomial Naive Bayes predictor.

    X -- 2-D array with one row per file and one column per feature (word count)
    Y -- 1-D array with the class of each row of X

    Returns a dictionary with:
      dictionary["total_data"]            -- number of rows in X
      dictionary[y_class][i]              -- sum of feature i over the rows of that class
      dictionary[y_class]["total_class"]  -- number of rows of that class
      dictionary[y_class]["total_words"]  -- sum of all features over the rows of that class
    """
    # stored once up front, so it is present even when Y contains no classes
    dictionary={"total_data":X.shape[0]}
    #iterating over each class of y
    for y_class in set(Y):
        #keeping only those rows where class is y_class
        X_y_class=X[Y==y_class]
        #one column sum per feature, computed in a single pass
        class_counts=dict(enumerate(X_y_class.sum(axis=0)))
        # The two totals do not depend on the feature index, so they are set
        # once per class; this also keeps them present when X has no columns.
        class_counts["total_class"]=X_y_class.shape[0]
        class_counts["total_words"]=X_y_class.sum()
        dictionary[y_class]=class_counts
    return dictionary

In [0]:
#log-score of a feature vector for one class: log prior of the class plus the
#log likelihood of every word that occurs in the vector
def probability(x,dictionary,y_class):
    """Return the log prior of y_class plus the log likelihood of x under it.

    x          -- feature vector (word counts) of a single file
    dictionary -- nested dictionary produced by fit
    y_class    -- class whose score is wanted
    """
    class_stats = dictionary[y_class]
    # every key except "total_class" and "total_words" is a feature index
    n_features = len(class_stats.keys()) - 2
    # start from the log prior: log(files in class) - log(all files)
    log_score = np.log(class_stats["total_class"]) - np.log(dictionary["total_data"])
    for idx in range(n_features):
        # words absent from this file contribute nothing
        if not x[idx] > 0:
            continue
        # add-one smoothed log probability of word idx within this class
        log_word = np.log(class_stats[idx] + 1) - np.log(class_stats["total_words"] + n_features)
        log_score += x[idx] * log_word
    return log_score

In [0]:
#picks the class with the highest score for a single file feature vector
def predictSinglePoint(x,dictionary):
    """Return the class in dictionary whose probability() score for x is highest.

    The first class reaching the highest score wins ties. If dictionary holds
    no classes, the default value 1 is returned.
    """
    winner = 1
    top_score = None  # None until the first class has been scored
    for label in dictionary.keys():
        # "total_data" is bookkeeping stored next to the classes, not a class
        if label == "total_data":
            continue
        score = probability(x, dictionary, label)
        if top_score is None or score > top_score:
            winner, top_score = label, score
    return winner

In [0]:
#classifies every file feature vector of the testing data
def predict(X_test,dictionary):
    """Return a list with the predicted class of each row of X_test, in order."""
    return [predictSinglePoint(row, dictionary) for row in X_test]

In [21]:
# Train and evaluate the self implemented classifier on the same data as the sklearn one
dictionary=fit(X_train,Y_train) #nested dictionary of per-class word counts and totals
y_pred=predict(X_test,dictionary)# list with one predicted class per row of X_test
print(classification_report(Y_test,y_pred)) #classification report for testing data

              precision    recall  f1-score   support

           0       0.90      0.80      0.85       243
           1       0.79      0.71      0.75       256
           2       0.95      0.87      0.90       260
           3       0.95      0.99      0.97       231
           4       0.95      0.75      0.84       253
           5       0.94      0.95      0.95       233
           6       0.65      0.51      0.57       252
           7       0.82      0.89      0.85       225
           8       0.87      0.80      0.84       241
           9       0.81      0.91      0.86       251
          10       0.83      0.80      0.81       245
          11       0.98      0.94      0.96       261
          12       0.75      0.79      0.77       268
          13       0.70      0.85      0.77       254
          14       0.83      0.81      0.82       242
          15       0.90      0.84      0.87       275
          16       0.71      0.54      0.61       235
          17       0.87    

# **Comparison of results between inbuilt and self implemented Multinomial NB**

In [22]:
# Print both classification reports one after the other, each under a framed
# heading: first the sklearn predictions (Y_pred), then the self implemented
# ones (y_pred).
separator = "----------------------------------------------------------------------------"
reports = [
    ("Classification report for inbuilt Multinomial NB on testing data: ", Y_pred),
    ("Classification report for self implemented Multinomial NB on testing data: ", y_pred),
]
for heading, predictions in reports:
    print(separator)
    print(heading)
    print(separator)
    print(classification_report(Y_test, predictions))

----------------------------------------------------------------------------
Classification report for inbuilt Multinomial NB on testing data: 
----------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.80      0.85       243
           1       0.79      0.71      0.75       256
           2       0.95      0.87      0.90       260
           3       0.95      0.99      0.97       231
           4       0.95      0.75      0.84       253
           5       0.94      0.95      0.95       233
           6       0.65      0.51      0.57       252
           7       0.82      0.89      0.85       225
           8       0.87      0.80      0.84       241
           9       0.81      0.91      0.86       251
          10       0.83      0.80      0.81       245
          11       0.98      0.94      0.96       261
          12       0.75      0.79      0.77       268
          13       0.7