In [1]:
import numpy as np
import pandas as pd
import codecs
from nltk.corpus import stopwords
import string

## Getting the data
1. Iterating through all the files and getting all the words of the file
2. Removing all the stop words
3. Getting an array which contains 20000 arrays consisting of words from each document

In [2]:
#save stop words
def getStopWords() :
    import string
    stop = stopwords.words('english')
    punctuations = list(string.punctuation)
    stop = stop + punctuations # adding punctuations to stopwords
    return stop

In [3]:
#Cleaning the data
def cleanWords(words) :
    stop_words = getStopWords()
    clean = []
    for w in words :
        w2 = w.lower() #converting to lowercase
        if(len(w2) < 3 or w2 in stop_words) :
            continue
        clean.append(w2)
    return clean

In [4]:
# get all features(words) from a file
def readfile(path) :
    with codecs.open(path, "r",encoding='utf-8', errors='ignore') as f:
        content =f.read() #reading contents of the file
    replacements = list(string.punctuation)
    for r in replacements:
        content = content.replace(r, ' ') #replacing punctuations with spaces
    words = content.strip().split(" ") #splitting the words
    clean_words = cleanWords(words) #getting cleanwords
    return clean_words

In [5]:
def get_data() :
    import os
    documents = []
    folders = os.listdir("20_newsgroups") #gettsing folder
    count = 0
    folder_counter = 0
    file_counter = 0
    for current_folder in folders :
        if(current_folder == '.DS_Store') :
            continue
        folder_path = "20_newsgroups/" + str(current_folder) #getting folder path
        files = os.listdir(folder_path)
        for current_file in files :
            if(current_file == '.DS_Store') :
                continue
            file_path = folder_path + '/' + str(current_file) #getting file path
            words = readfile(file_path)
            file_counter = file_counter + 1
            print(file_counter)
            documents.append((words, folder_counter))
        folder_counter += 1
    return documents

In [6]:
data = get_data() #function to get data 

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/anmolbudhiraja/nltk_data'
    - '/Users/anmolbudhiraja/opt/anaconda3/nltk_data'
    - '/Users/anmolbudhiraja/opt/anaconda3/share/nltk_data'
    - '/Users/anmolbudhiraja/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
data[0]

## Forming the dictionary
1. Using Counter to get the frequency of all words
2. Taking the top 1000 features using Counter also

In [None]:
def getfreq(words) :
    dictionary = {}
    for w in words :
        if(w in dictionary) :
            dictionary[w] = dictionary[w] + 1
        else :
            dictionary[w] = 1
    return dictionary

In [None]:
def get_features(data,n) :
    all_words = []
    for words,cat in data :
        all_words += words
    from collections import Counter
    features = Counter(all_words).most_common(n)
    return features

In [None]:
n = 1000
features = get_features(data,n)

In [None]:
features

In [None]:
def getRow(features,words) :
    row = np.zeros(len(features))
    frequency = getfreq(words)
    j = 0
    for f,c in features :
        if f in frequency :
            row[j] = frequency[f]
        j = j + 1
    return row

In [None]:
#Converting the dictionary into the required type of data for splitting by passing each row and getting the list with count of each feature
X = np.empty((0,n))
Y = np.array([])
count = 0
for words,category in data :
    count = count + 1
    print(count)
    row = getRow(features,words)
    X = np.append(X,row.reshape(1,-1),axis = 0)
    Y = np.append(Y,category)
X[0],Y

In [None]:
for i in X[0] :
    print(i)
# Y = data[ : , -1]

In [None]:
# Getting train and test data using model_selection
from sklearn import model_selection
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,random_state=1)

## Creating the dictionary to implement Naive Bayes
1. Storing the total count of each class
2. Storing the total_documents
3. Storing the count of feature in each document
4. Applying formula of Naive Bayes

In [None]:
#Fit function to fit the values
def fit(X,Y) :
    result = {}
    classes = set(Y)
    result["total_documents"] = X.shape[0]
    for current_class in classes :
        result[current_class] = {}
        number_rows = (Y == current_class)
        X_current = X[number_rows]
        Y_current = Y[number_rows]
        all_words = X.shape[1]
        result[current_class]["docs_with_current_class"] = len(X_current)
        result[current_class]["total_words"] = X_current.sum()
        for j in range(all_words) :
            result[current_class][j] = {}
            distinct_vals=set(X[:,j])
            for k in distinct_vals:
                result[current_class][j][k]=(X_current[:,j]==k).sum()
    return result

## Predicting the values

In [None]:
# Defining the probability calculating function
def probability(x,dictionary,current_class) :
    prob = np.log(dictionary[current_class]["docs_with_current_class"]) - np.log(dictionary["total_documents"])
    for j in range(len(x)) :
            if x[j] != 0 :
                prob += np.log(dictionary[current_class][j][x[j]] + 1) - np.log(dictionary[current_class]["docs_with_current_class"] + len(result[current_class][j].keys()))
    return prob

In [None]:
# Passing each point in this function to finding the best Class for each point
def predictSinglePoint(x,dictionary) :
    all_classes = dictionary.keys()
    best_class = -1
    pest_prob = -1000
    first_run = True
    for current_class in all_classes:
        if current_class == "total_documents" :
            continue
        p_current_class = probability(x,dictionary,current_class)
        if first_run or p_current_class > best_p :
            best_class =current_class
            best_p =p_current_class
        first_run = False
    return best_class

In [None]:
dictionary = fit(X_train,Y_train)
Y_pred = predict(X_test,dictionary)

In [None]:
#Predicting the values
def predict(X_test,dictionary) :
    Y_pred = []
    count = 0
    for x in X_test :
        print(count)
        count=count+1
        y = predictSinglePoint(x,dictionary)
        Y_pred.append(y)
    return Y_pred

In [None]:
# Converting float to int values
Y_pred_new = [int(i) for i in Y_pred]
Y_test_new = [int(i) for i in Y_test]

In [None]:
for i in range(len(Y_test)) :
    print(Y_test_new[i] , Y_pred_new[i])

In [None]:
dictionary

In [None]:
# importing metrics to print the report
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(Y_test_new, Y_pred_new))
print(confusion_matrix(Y_test_new,Y_pred_new))

## Checking with Inbuilt Multinomial NaiveBayes Algorithm

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,Y_train)
Y_pred = clf.predict(X_test)
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))