## Import libraries

In [1]:
import numpy as np
import string
from sklearn.metrics import classification_report

## Preprocessing

In [2]:
def preProcess(fileName):
    op = []
    flag = 0
    with open(fileName) as f: #open the .txt files
        for i in f.readlines():
            #specify when to start extracting the text
            if i[:9] == "*** START":
                flag = 1
                continue
            #specify when to stop extracting the text
            if i[:7] == "*** END":
                break
            #extract the text
            if flag == 1 and i != '\n':
                op.append(i[:-1].lower())
    op = op[1:-2]
    return op

In [3]:
#preprocess both the files
sh = preProcess('pg1661.txt')
ja = preProcess('pg31100.txt')

In [4]:
#randomize the dataset
rnd = np.random.RandomState(42)
rnd.shuffle(sh)
rnd.shuffle(ja)

In [5]:
#set testSize to 1000 for both the files
testSize = 1000

In [6]:
#split the data into training and testing set
trainSH = sh[:-testSize]
testSH = sh[-testSize:]
trainJA = ja[:-testSize]
testJA = ja[-testSize:]

## Modelling

In [7]:
class NaiveBayes:
    #Initialization
    def __init__(self):
        #store a list of stopwords
        self.stopWords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
        #set other params to None
        self.words1 = None
        self.words2 = None
        self.count1 = None
        self.count2 = None
        self.prob1 = None
        self.prob2 = None
        self.alpha = None
    
    #Fitting the model
    def fit(self, c1, c2, alpha = 1):
        self.alpha = alpha #stores the smoothing factor
        self.words1 = self.__getWords__(c1) #gets a list of all words belonging to class1
        self.words2 = self.__getWords__(c2) #gets a list of all words belonging to class2
        self.count1 = self.__getCount__(self.words1) #gets count of occurance of all words in class1
        self.count2 = self.__getCount__(self.words2) #gets count of occurance of all words in class1
        self.prob1 = self.__getProb__(self.count1) #gets probability of each word in class1
        self.prob2 = self.__getProb__(self.count2) #gets probability of each word in class1
    
    #Predicting method
    def predict(self, sentences):
        self.pred = []
        for sentence in sentences:
            words = self.__clean__(sentence)
            p1, p2 = 0, 0 #initialize probabilities to 0
            for word in words:
                #get probability if the word is present in the training set
                if word in self.prob1:
                    p1 += np.log(self.prob1[word])
                if word in self.prob2:
                    p2 += np.log(self.prob2[word])
                #if the word is not present in the training set, add the smoothing factor
                if word not in self.prob1:
                    p1 += np.log(self.alpha / (len(self.prob1)))
                if word not in self.prob2:
                    p2 += np.log(self.alpha / (len(self.prob2)))
            if p1 > p2:
                self.pred.append(1)
            else:
                self.pred.append(0)
        return self.pred
    
    #Helper clean method that removes stopwords and punctuation
    def __clean__(self, sentence):
        words = []
        sentence = sentence.lower()
        for word in sentence.split():
            for pun in string.punctuation:
                word = word.replace(pun, "")
            if word not in self.stopWords:
                words.append(word)
        return words
    
    #Converts sentences into words
    def __getWords__(self, sentences):
        words = []
        for sentence in sentences:
            words += self.__clean__(sentence)
        return words
    
    #returns the count of each word
    def __getCount__(self, words):
        count = {}
        for word in words:
            if word not in count:
                count[word] = 0
            count[word] += 1
        return count
    
    #returns the probability of each word
    def __getProb__(self, count):
        prob = {}
        n = len(count)
        for i, v in count.items():
            prob[i] = (v + self.alpha) / n
        return prob

## Working

In [8]:
model = NaiveBayes()

In [9]:
model.fit(trainSH, trainJA)

In [10]:
testData = testSH + testJA

In [11]:
pred = model.predict(testData)

In [12]:
#Get labels for the test data
flag = 1
labels = []
for i in range(len(testData)):
    if i == len(testData) / 2:
        flag = 0
    labels.append(flag)

## Checking the accuracy of the model

In [13]:
print(classification_report(labels, pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75      1000
           1       0.97      0.36      0.53      1000

    accuracy                           0.68      2000
   macro avg       0.79      0.68      0.64      2000
weighted avg       0.79      0.68      0.64      2000

