In [14]:
import os
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
## Provide path of the folder where all news groups are present
path = "20_newsgroups"
data={} # data is a dictionary of the form { folder1 : [doc1,doc2,....,doc1000] , folder2 : [doc1,doc2,doc3,....] }
groups = os.listdir(path)
## Bulding the dictionary of documents
for group in groups:
    data[group]=[]
    files = os.listdir(os.path.join(path,group))
    for file in files:
        with open(os.path.join(path,group,file)) as opened_file:
            data[group].append(opened_file.read())

In [17]:
## Creating the Y_Label column
Y = []
for group in groups:
    for i in range(1000):
        Y.append(group)
Y = np.array(Y)

In [18]:
def remove_punctuation(txt):
    new_txt = "".join([c for c in txt  if c not in string.punctuation])
    return new_txt

dicty = {}
vocabulary = set()
set_of_stop_words = set(stopwords.words("english"))
irrelevant_words = ["the", "from", "i","lines","subject", "organization", "in" ,"would", "this","it","dont","also", ]

## Creating word:frequency dictionary
for group in [*data.keys()]:
    for doc in data[group]:    
        doc = remove_punctuation(doc)  ## Removing the punctuation from the documents
        tokenized_words = word_tokenize(doc)  ## Tokenizing the document   
        tokenized_words_excluding_stop_words_list = []
        for w in tokenized_words:
            if (w.lower() not in set_of_stop_words) and (w.lower() not in irrelevant_words) :
                dicty[w.lower()] = dicty.get(w.lower(),0) + 1
    dicty = dict(sorted(dicty.items(),key=lambda x:x[1],reverse = True))
dct = dict(list(dicty.items())[:5000]) ## Selecting the top 5000 words for building the vocabulary
features = [*dct.keys()]
#dicty

In [19]:
## Building the vocabulary of top 5000 words
X = np.zeros([20000,len(features)], dtype = int) 
## Filling the X_train matrix
i = 0
for group in [*data.keys()]:
    for j in range(len(data[group])): 
        doc = data[group][j]
        doc = remove_punctuation(doc)
        tokenized_words = word_tokenize(doc)
        for word in tokenized_words:
            if word in features:
                index = features.index(word)
                X[i][index] += 1
        i += 1       

In [None]:
## Saving the X dataset to an external csv file for further usage
np.savetxt("text_classification_data_X.csv", X,fmt='%d')

In [None]:
## Loading the X data from the saved file
X = np.genfromtxt('text_classification_data_X.csv', delimiter=' ')

In [20]:
## Splitting the data into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 0)

## Using inbuilt sklearn MultinomialNB

In [24]:
clf = MultinomialNB()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

print("Accuracy score : ", clf.score(X_test,Y_test))
print("Confusion Matrix : ",confusion_matrix(Y_test, Y_pred))
print("Classification Report : ",classification_report(Y_test, Y_pred))

Accuracy score :  0.8375
Confusion Matrix :  [[199   1   0   0   0   0   1   1   4   0   1   1   0   1   1   7   2   2
    8  45]
 [  0 242  25   4   8   5   5   1   2   0   0   1   1   2   3   0   0   0
    1   1]
 [  0  12 259  14   2  11   3   0   0   1   0   1   2   0   1   0   0   0
    2   0]
 [  0   3   7 246  15   0   7   2   0   0   0   0   4   0   0   0   0   0
    0   0]
 [  0   0   7  15 261   0   9   0   0   0   0   0   4   0   0   0   0   0
    0   1]
 [  0  15   9   3   3 258   2   0   2   0   1   0   1   0   0   0   0   0
    1   0]
 [  0   1   1   7   6   0 262  11   4   2   1   3   5   1   5   0   0   1
    0   1]
 [  0   3   0   0   0   1   9 292   5   0   2   0   2   1   2   1   4   0
    0   2]
 [  1   1   0   0   2   0   8   2 285   0   0   0   1   1   2   0   1   0
    2   0]
 [  2   3   0   0   0   0   0   1   1 274   5   0   0   1   0   1   0   0
    2   0]
 [  0   1   1   0   0   0   1   1   3  14 257   0   1   0   0   0   1   0
    2   1]
 [  0   4   1   3   

## MultinomialNB code from scratch

In [25]:
## Fitting the model
def fit(X_train, Y_train):
    result = {}
    result["total_rows_count"] = len(Y_train)
    num_of_features = X_train.shape[1]
    class_labels = set(Y_train)
    for current_class_label in class_labels:
        result[current_class_label] = {}
        current_class_rows = (Y_train == current_class_label)
        X_train_current = X_train[current_class_rows]
        Y_train_current = Y_train[current_class_rows]
        result[current_class_label]["current_class_rows_count"] = len(Y_train_current)
        result[current_class_label]["total_words_in_current_class"] = np.sum(X_train_current)        
        for i in range(num_of_features):
            result[current_class_label][i] = (X_train_current[:,i]).sum()
    return result

## Predicting the output class on the testing data
def probability(dictionary, x, current_class):
    #prior_prob_of_current_class = dictionary[current_class]["current_class_rows_count"]/dictionary["total_rows_count"]
    prior_prob_of_current_class = np.log(dictionary[current_class]["total_words_in_current_class"]) - np.log(dictionary["total_rows_count"])
    final_prob = prior_prob_of_current_class
    num_of_features = len(dictionary[current_class].keys()) - 2  ## -2 because it also contains "current_class_rows_count" & "total_words_in_current_classalso
    for i in range(num_of_features):        
        ## Probability with Laplace correction and then Taking log of that
        a = dictionary[current_class][i] + 1
        b = dictionary[current_class]["total_words_in_current_class"] + num_of_features
        final_prob += (x[i]*(np.log(a) - np.log(b)))  ## a = count of "word" in the current class & b = total count of all words in the current class
    return final_prob


def predict_prob_single_point(dictionary , x):
    final_ans = -9999999
    best_class = ""
    first_run = True
    for current_class in [*dictionary.keys()]:
        if(current_class == "total_rows_count"):
            continue
        predicted_ans = probability(dictionary, x, current_class)
        if(first_run or predicted_ans > final_ans):
            final_ans = predicted_ans
            best_class = current_class
        first_run = False
    return best_class

def predict(dictionary, X_test):
    Y_pred = []
    for x in X_test:
        predicted_class = predict_prob_single_point(dictionary, x)
        Y_pred.append(predicted_class)
    return Y_pred

In [26]:
dictionary = fit(X_train, np.array(Y_train))
Y_pred = predict(dictionary, X_test)
#print(Y_pred)
print("Confusion Matrix : ",confusion_matrix(Y_test, Y_pred))
print("Classification Report : ",classification_report(Y_test, Y_pred))

Confusion Matrix :  [[200   1   0   0   0   0   0   1   4   0   1   1   0   1   1   7   2   2
    8  45]
 [  0 241  24   4   8   7   5   0   2   0   0   1   1   2   4   0   0   0
    1   1]
 [  0  13 255  14   2  13   3   0   0   1   0   2   2   0   1   0   0   0
    2   0]
 [  0   3   7 246  15   0   7   2   0   0   0   0   4   0   0   0   0   0
    0   0]
 [  0   0   7  16 259   0   9   0   0   0   0   0   5   0   0   0   0   0
    0   1]
 [  0  15   8   3   3 259   1   0   2   0   1   0   2   0   0   0   0   0
    1   0]
 [  0   1   1   7   7   0 261  11   4   2   1   3   5   1   5   0   0   1
    0   1]
 [  0   3   0   0   0   1   8 292   5   0   3   0   2   1   2   1   4   0
    0   2]
 [  1   1   0   0   2   0   7   2 285   1   0   0   1   1   1   0   1   0
    3   0]
 [  2   3   0   0   0   0   0   1   1 274   5   0   0   1   0   1   0   0
    2   0]
 [  0   1   1   0   0   0   1   1   2  13 259   0   1   0   0   0   1   0
    2   1]
 [  0   4   1   3   2   2   0   1   0   1   1

## Comparison of inbuilt sklearn MultinomialNB & my code from scratch

### Inbuilt Classifier
Accuracy = 84%

Confusion Matrix :  [[199   1   0   0   0   0   1   1   4   0   1   1   0   1   1   7   2   2
    8  45]
 [  0 242  25   4   8   5   5   1   2   0   0   1   1   2   3   0   0   0
    1   1]
 [  0  12 259  14   2  11   3   0   0   1   0   1   2   0   1   0   0   0
    2   0]
 [  0   3   7 246  15   0   7   2   0   0   0   0   4   0   0   0   0   0
    0   0]
 [  0   0   7  15 261   0   9   0   0   0   0   0   4   0   0   0   0   0
    0   1]
 [  0  15   9   3   3 258   2   0   2   0   1   0   1   0   0   0   0   0
    1   0]
 [  0   1   1   7   6   0 262  11   4   2   1   3   5   1   5   0   0   1
    0   1]
 [  0   3   0   0   0   1   9 292   5   0   2   0   2   1   2   1   4   0
    0   2]
 [  1   1   0   0   2   0   8   2 285   0   0   0   1   1   2   0   1   0
    2   0]
 [  2   3   0   0   0   0   0   1   1 274   5   0   0   1   0   1   0   0
    2   0]
 [  0   1   1   0   0   0   1   1   3  14 257   0   1   0   0   0   1   0
    2   1]
 [  0   4   1   3   2   2   0   1   0   1   1 256   1   2   1   0   2   0
    5   1]
 [  0  14   5   8   8   1   3   9   2   0   0   1 262   0   0   0   0   0
    0   0]
 [  2   6   1   3   0   0   1   3   3   0   0   0   4 269   4   0   0   1
    3   0]
 [  3   7   1   0   1   0   2   1   2   0   0   1   2   4 256   0   1   1
    8   6]
 [  3   0   0   1   1   0   1   0   0   0   0   0   0   1   1 301   0   0
    2   0]
 [  0   3   0   1   1   0   2   1   2   0   0   2   0   1   1   1 257   3
   14   3]
 [  7   1   1   1   0   0   8   1   2   1   0   0   0   1   1   2   5 287
   20   5]
 [  8   1   2   0   0   0   2   0   1   6   0   2   0   4   7   5  33  19
  169  32]
 [ 79   2   0   1   1   0   3   0   3   1   2   0   0   2   2  17  29   1
   22 133]]
Classification Report :                            precision    recall  f1-score   support

             alt.atheism       0.65      0.73      0.69       274
           comp.graphics       0.76      0.80      0.78       301
 comp.os.ms-windows.misc       0.81      0.84      0.82       308
comp.sys.ibm.pc.hardware       0.80      0.87      0.83       284
   comp.sys.mac.hardware       0.84      0.88      0.86       297
          comp.windows.x       0.93      0.87      0.90       295
            misc.forsale       0.80      0.84      0.82       311
               rec.autos       0.89      0.90      0.90       324
         rec.motorcycles       0.89      0.93      0.91       306
      rec.sport.baseball       0.91      0.94      0.93       290
        rec.sport.hockey       0.95      0.91      0.93       283
               sci.crypt       0.96      0.90      0.93       283
         sci.electronics       0.90      0.84      0.87       313
                 sci.med       0.92      0.90      0.91       300
               sci.space       0.89      0.86      0.88       296
  soc.religion.christian       0.90      0.97      0.93       311
      talk.politics.guns       0.77      0.88      0.82       292
   talk.politics.mideast       0.91      0.84      0.87       343
      talk.politics.misc       0.65      0.58      0.61       291
      talk.religion.misc       0.58      0.45      0.50       298

                accuracy                           0.84      6000
               macro avg       0.84      0.84      0.83      6000
            weighted avg       0.84      0.84      0.84      6000


### Code from scratch
Accuracy = 84%

Confusion Matrix :  [[200   1   0   0   0   0   0   1   4   0   1   1   0   1   1   7   2   2
    8  45]
 [  0 241  24   4   8   7   5   0   2   0   0   1   1   2   4   0   0   0
    1   1]
 [  0  13 255  14   2  13   3   0   0   1   0   2   2   0   1   0   0   0
    2   0]
 [  0   3   7 246  15   0   7   2   0   0   0   0   4   0   0   0   0   0
    0   0]
 [  0   0   7  16 259   0   9   0   0   0   0   0   5   0   0   0   0   0
    0   1]
 [  0  15   8   3   3 259   1   0   2   0   1   0   2   0   0   0   0   0
    1   0]
 [  0   1   1   7   7   0 261  11   4   2   1   3   5   1   5   0   0   1
    0   1]
 [  0   3   0   0   0   1   8 292   5   0   3   0   2   1   2   1   4   0
    0   2]
 [  1   1   0   0   2   0   7   2 285   1   0   0   1   1   1   0   1   0
    3   0]
 [  2   3   0   0   0   0   0   1   1 274   5   0   0   1   0   1   0   0
    2   0]
 [  0   1   1   0   0   0   1   1   2  13 259   0   1   0   0   0   1   0
    2   1]
 [  0   4   1   3   2   2   0   1   0   1   1 257   1   2   0   0   2   0
    6   0]
 [  0  14   5   8   8   1   3   9   2   0   0   1 262   0   0   0   0   0
    0   0]
 [  2   6   1   3   0   0   1   3   3   0   0   0   2 271   4   0   0   1
    3   0]
 [  3   7   1   0   1   0   2   1   2   0   0   2   1   4 256   0   1   1
    8   6]
 [  3   0   0   0   1   0   1   0   0   0   0   0   0   1   1 302   0   0
    2   0]
 [  0   4   0   0   1   0   2   1   1   0   0   2   0   1   1   1 256   3
   16   3]
 [  7   1   2   1   0   0   4   1   2   1   0   0   0   1   1   2   4 290
   20   6]
 [  8   1   2   0   0   0   2   0   1   5   0   2   0   4   6   5  32  20
  173  30]
 [ 78   2   0   1   1   0   3   0   3   1   2   0   0   2   2  19  29   1
   24 130]]
Classification Report :                            precision    recall  f1-score   support

             alt.atheism       0.66      0.73      0.69       274
           comp.graphics       0.75      0.80      0.77       301
 comp.os.ms-windows.misc       0.81      0.83      0.82       308
comp.sys.ibm.pc.hardware       0.80      0.87      0.83       284
   comp.sys.mac.hardware       0.84      0.87      0.85       297
          comp.windows.x       0.92      0.88      0.90       295
            misc.forsale       0.82      0.84      0.83       311
               rec.autos       0.90      0.90      0.90       324
         rec.motorcycles       0.89      0.93      0.91       306
      rec.sport.baseball       0.92      0.94      0.93       290
        rec.sport.hockey       0.95      0.92      0.93       283
               sci.crypt       0.95      0.91      0.93       283
         sci.electronics       0.91      0.84      0.87       313
                 sci.med       0.92      0.90      0.91       300
               sci.space       0.90      0.86      0.88       296
  soc.religion.christian       0.89      0.97      0.93       311
      talk.politics.guns       0.77      0.88      0.82       292
   talk.politics.mideast       0.91      0.85      0.88       343
      talk.politics.misc       0.64      0.59      0.62       291
      talk.religion.misc       0.58      0.44      0.50       298

                accuracy                           0.84      6000
               macro avg       0.84      0.84      0.84      6000
            weighted avg       0.84      0.84      0.84      6000