# Imports

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import codecs
import string
import os
import re

# Extraction

In [2]:
def file_paths(folder_name):
    """ folder name: string representing the data folder name
        Y : list of string, stores newsgroup names, used as classes
        returns list of paths to the documents and Y
    """
    """      |<-----Absolute Path------>|        """
    prefix = "C:\\Users\\Acer\\Desktop\\project\\" #--------------------------> store (the absolute/relative path to the dir where the folder is) as prefix 
    path = prefix + folder_name #------------------------------------> path to the folder
    newsgroups = os.listdir(path) #----------------------------------> path list of newsgroups which has all the docs
    path_list = [] #-------------------------------------------------> stores the path list to docs (not seperated by newsgroups)
    Y = [] #---------------------------------------------------------> stores newsgroup name corresponding to path_list
    for news_group in newsgroups:
        files_path = path + "\\" + news_group #----------------------> path to the documents inside a newsgroup
        files = os.listdir(files_path) #-----------------------------> retrieving the documents inside the newsgroup
        Y += ([news_group]*len(files)) #-----------------------------> all the class name would be same for every docs inside a newsgroup
        for file in files:
            path_list.append(files_path+"\\"+file) #-----------------> appends path to a single doc to the path_list
    return path_list, Y #--------------------------------------------> (paths to data docs, output_classes)
    
    
def remove_metadata(lines_list):
    """ seperates the file into lists of lines
        filters out the lines having ['Xref','Path','Date'] headers as its not important
        (I should have filtered out the 'From' header too, but I thought the sender name might help classifying the data)
        returns the filtered list of lines
    """
    new_lines = [] #-------------------------------------------------> list of lines without unimportant headers
    for lines in lines_list:
        lines = lines.split('\n')
        for line in lines:
            if line[:4] in ['Xref','Path','Date']:
                continue #-------------------------------------------> skipping certain headers
            new_lines.append(line)
            
    return new_lines #-----------------------------------------------> filtered list of lines


def extract_words(line):
    """ splits the word with separators ["-", ',', ' ', '.', '@', '\t'] then cleans and processes the words
        line : string -> represents particular line of the current doc
    """
    words = re.split(r'[-,\s.@\t]\s*', line.strip()) #---------------> split operation
    words = preprocess(words) #--------------------------------------> processing
    words = remove_stopwords(words) #--------------------------------> removing unimportant words
    
    return words #---------------------------------------------------> list of cleaned words


def read_text(path):
    """ reads the text of a single document and converts the text as list of words
        path: string represents the path to a particular document
    """
    f = open(path, 'r') 
    text_lines = f.readlines() #------------------------------------> load document as a list of lines
    text_lines = remove_metadata(text_lines) #----------------------> removing the meta-data at the top of each document
    doc_words = [] # -----------------------------------------------> initiazing an array to hold all the words in a document
    """traverse over all the lines and tokenize each one with the help of helper function: extract_words"""
    for line in text_lines:
        words = (extract_words(line)) #-----------------------------> stores the cleaned and processed words from line
        if words != ['']:
            doc_words += words #------------------------------------> adding list of words into the total collection without increasing the dimension
    return doc_words 


def create_XY(folder_name):
    """ retrieves X, Y from the given folder"""
    paths,Y = file_paths(folder_name) #-----------------------------> storing the path lists to singular docs and respective class name
    X = []
    for path in paths:
        X.append(read_text(path))
    return X,Y

# Processing

### stopwords

In [3]:
stopwords = ['', 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
 'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
 'each', 'few', 'for', 'from', 'further',
 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
 "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours' 'ourselves', 'out', 'over', 'own',
 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 
 'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 
 "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
 "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 
 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 
 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
 '4th', '5th', '6th', '7th', '8th', '9th', '10th','subject', 'lines', 'newsgroups','sender', 'like','just', 'know', 'get', 'think',
 'well', 'now', 'even', 'see', 'way', 'say', 'world', 'make', 'many', 'much', 'right', 'want', 'anyone', 'reply', 'said', 'used',
 'need',]

In [4]:
def preprocess(words):
#     table = str.maketrans('', '', '\t')
#     words = [word.translate(table) for word in words] 
    """ the character: ' appears in a lot of stopwords and changes meaning of words if removed
        hence it is removed from the list of symbols that are to be discarded from the documents
    """
    punctuations = (string.punctuation).replace("'", "")
    trans_table = str.maketrans('', '', punctuations) #-------------------------> mapping of punctuations to ""
    stripped_words = [word.translate(trans_table) for word in words] #----------> removing puntuations
    
    words = [str for str in stripped_words if str] #----------> removing empty strings
    
    """ some words are quoted in the documents & as we have not removed ' to maintain the integrity of some stopwords
        we try to unquote such words below 
    """
    p_words = []
    for word in words:
        if (word[0] and (word[len(word)-1] == "'" or word[len(word)-1] == '"')):
            word = word[1:len(word)-1]
        elif(word[0] == "'" or word[0] == '"'):
            word = word[1:len(word)]
        else:
            word = word
        p_words.append(word)
        
    """remove just-numeric strings as they do not have any significant meaning in text classification"""
    words = [word for word in words if not word.isdigit()]
    
    words = [word.lower() for word in words if len(word) > 2] #------------------> remove words with only 2 characters and transform to lowercase
    return words
    

def remove_stopwords(words):
    return [word for word in words if word not in stopwords] #-------------------> just removes the stopwords


def flatten(_2d_list):
    """2d to 1d converter"""
    _1d_list = []
    for ele in _2d_list:
        _1d_list += ele
    return _1d_list


def first_n_most_freq(words, n):
    """ function two select words as features based on there frequency """
    np_list_of_words = np.array(flatten(words)) #--------------------------------> 1d collection of all words
    words, counts = np.unique(np_list_of_words, return_counts=True) #------------> stores unique words along with their count
    freq, features = (list(i) for i in zip(*(sorted(zip(counts, words), reverse=True)))) #sorting the unique words according to their frequency
    """ I had printed the frequent words and selected unnecessary words manually from the output
        appended those words to stopwords and ran the code again
        
    freq_unnecessary_words = []
    for i in range(len(freq)):
        if(freq[i]>2000):
            freq_unnecessary_words.append(features[i])
    print(freq_unnecessary_words)
    """
    return features[:n]

# Data

### Transform X {list(list(string)) --> list(list(int))}

In [5]:
def X_transform(X, n):
    """ this function stores counts of words inside a doc(numeric representation) in a dictionary 
        returns a call to convert_words_to_num
        total transformation => a list of (rows having w1,w2,...w*t* words) to a list of (rows having counts of *n* feature words)
    """
    dictionary = {} #---------------------------------------> the dictionary which holds counts of words per document
    doc_num = 1 #-------------------------------------------> numeric key representing a doc
    for doc_words in X:
        #print(doc_words)
        np_doc_words = np.asarray(doc_words)
        w, c = np.unique(np_doc_words, return_counts=True) #> unique words present inside a doc along with its count
        dictionary[doc_num] = {} #--------------------------> creating words: counts dictionary inside the doc key
        for i in range(len(w)):
            dictionary[doc_num][w[i]] = c[i] #--------------> storing the count of that word
        doc_num = doc_num + 1 #-----------------------------> next numeric key representing next doc
    return convert_words_to_num(X, dictionary, n)


def convert_words_to_num(X, dictionary, n):
    X_data = [] #---------------------------------------------> stores transformed X
    features = first_n_most_freq(X, n) #----------------------> creating features with n most frequent words
    for k in dictionary.keys():
        row = [] #--------------------------------------------> stores the counts of every feature-words for a given doc
        for f in features:
            if(f in dictionary[k].keys()):
                """ if word f is present in the dictionary of the document as a key, its value is copied
                    this gives us no. of occurences """
                row.append(dictionary[k][f]) 
            else:
                row.append(0) #-------------------------------> if not present, the no. of occurences is zero
        X_data.append(row)
    return X_data

In [6]:
X, Y = create_XY("20_newsgroups")
X = X_transform(X, 5000)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=0)
print(len(X_train),len(X_test))

14997 5000


# Training

### Model

In [7]:
class MultiNB:
    
    def __init__(self):
        self.class_prob = None
        self.prior = None
        self.likelihood = None
    
    
    def convert(self, Y, classes):
        """ converts Y to list(int) """
        classes = list(classes)
        Y_num = []
        for y in Y:
            Y_num.append(classes.index(y)) #--------------------------------> inplace of string, it stores the index of that string in self.classes
        return Y_num
        
        
    def fit(self, X_train, Y_train):
        """ I have used list instead of dict datatype to store my model to decrease look-up time and space complexity,
            the probablities(p(y=ai / X=x) and p(y=ai)) are solely based on training data
            so, instead of storing the counts, I have directly stored the probablities as prior(stores p(y=ai)) & likelihood(stores p(y=ai / Wj=wj))
        """
        n_docs, n_features = X_train.shape 
        classes = np.unique(Y_train) #--------------------------------------> names of classes
        n_classes = len(classes)
        prior = np.zeros(n_classes) #---------------------------------------> initializing prior with zeroes,stores log(p(y=ai))
        likelihood = np.zeros((n_features, n_classes)) #--------------------> initializing likelihood with zeroes, stores log(p(y=ai/ Wj= wj))
        no_docs_per_class = np.zeros(n_classes) #---------------------------> stores the num of docs per class
        no_word_i_per_class = np.zeros((n_classes, n_features)) #-----------> stores the num of occurrence of word_i per class
        Y_num = self.convert(Y_train,classes)
        
        for i in range(n_docs):
            class_i = Y_num[i]
            no_docs_per_class[class_i] += 1 #-------------------------------> increase the doc count of class_i
            for j in range(n_features):
                no_word_i_per_class[class_i][j] += X_train[i][j] # ---------> increase word_j count class_i
        
        for i in range(n_classes):
            prior[i] = np.log(no_docs_per_class[i]/n_docs) #----------------> p(y=ai) = (num of class_i occurrence)/(total num of docs)
            
        for i in range(n_features):
            for j in range(n_classes):
                numerator = (no_word_i_per_class[j][i]+1) #-----------------> n_wi in class_j + 1(laplace correction)
                denominator = (no_word_i_per_class[j].sum() + n_features) #-> total num of words in class_j + num of features(laplace correction)
                likelihood[i][j]= np.log(numerator/denominator) #-----------> log probablity of class_j given word_i
        
        """ saving trained datasets """
        self.prior = prior
        self.likelihood = likelihood
        self.n_features = n_features
        self.n_docs = n_docs
        self.n_classes = n_classes
        self.classes = classes
        
    
    def probablity(self, x, class_i):
        """ function to calculate probablity of a class given X=x"""
        output = 0
        for j in range(self.n_features):
            if x[j]==0:
                continue #-------------------------------------------------> skip the word that is not in current doc
            output += self.prior[class_i] + self.likelihood[j][class_i] #--> log(p(y=ai)*p(y=ai/Wj=wj)) = log(p(y=ai)) - log(p(y=ai/ Wj= wj))
        return output

    
    def predict_single_point(self, x):
        best_p = -100000 #-------------------------------------------------> stores best log probablity
        best_class = -1 #--------------------------------------------------> stores the best class index
        for class_i in range(self.n_classes):
            p_class_i = self.probablity(x, class_i)
            if(p_class_i > best_p):
                best_p = p_class_i #---------------------------------------> update the current best score
                best_class = class_i #-------------------------------------> update the current best class index

        return self.classes[best_class] #----------------------------------> return the class name using best class index

    
    def predict(self, x_test):
        """ make prediction for the given dataset using trained model"""
        Y_pred = []
        for x in x_test:
            y_predicted = self.predict_single_point(x) #-------------------> predict for a single doc
            Y_pred.append(y_predicted) #-----------------------------------> append the results

        #print(Y_pred)
        return Y_pred
    
    
    def accuracy(self, Y_pred, Y_true):
        n = len(Y_true) #--------------------------------------------------> total points
        m = 0 #------------------------------------------------------------> accurate points
        for i in range(n):
            if Y_pred[i]==Y_true[i]:
                m += 1
        return m/n #-------------------------------------------------------> accuracy = accurate points/total points

In [8]:
my_clf = MultiNB()
my_clf.fit(np.array(X_train), Y_train)

## Testing

In [9]:
my_prediction = my_clf.predict(X_test)
my_clf.accuracy(my_prediction,Y_test)

0.8964

In [13]:
std_clf = MultinomialNB()
std_clf.fit(X_train, Y_train)
std_prediction = std_clf.predict(X_test)
std_clf.score(X_test, Y_test)

0.8842

In [14]:
print("------------\nMy model\n--------------\n\nclassification report:")
print(classification_report(Y_test, my_prediction))
print("\nconfusion_matrix\n")
print(confusion_matrix(Y_test, my_prediction))

print("\n\n------------\nStandard model\n--------------\n\nclassification report:")
print(classification_report(Y_test, std_prediction))
print("\nconfusion_matrix\n")
print(confusion_matrix(Y_test, std_prediction))

------------
My model
--------------

classification report:
                          precision    recall  f1-score   support

             alt.atheism       0.78      0.85      0.81       233
           comp.graphics       0.81      0.87      0.84       253
 comp.os.ms-windows.misc       0.90      0.79      0.84       249
comp.sys.ibm.pc.hardware       0.77      0.90      0.83       240
   comp.sys.mac.hardware       0.89      0.90      0.90       236
          comp.windows.x       0.88      0.85      0.86       240
            misc.forsale       0.90      0.90      0.90       261
               rec.autos       0.97      0.95      0.96       269
         rec.motorcycles       0.99      0.96      0.98       284
      rec.sport.baseball       0.99      0.97      0.98       248
        rec.sport.hockey       0.97      1.00      0.98       231
               sci.crypt       0.97      0.98      0.97       233
         sci.electronics       0.93      0.92      0.93       244
              