In [17]:
import numpy as np
import json
from sklearn.feature_extraction import text
# Load the pre-split corpus: a JSON array whose first three entries are the
# Hamilton, Madison and disputed papers. The context manager guarantees the
# file handle is closed (a bare open().read() leaves that to the GC).
with open('fedpapers_split.txt') as corpus_file:
    x = corpus_file.read()
papers = json.loads(x)
papersH = papers[0] # papers by Hamilton
papersM = papers[1] # papers by Madison
papersD = papers[2] # disputed papers
nH, nM, nD = len(papersH), len(papersM), len(papersD)
# NOTE(review): 85 is hard-coded; the three groups loaded above sum to
# nH + nM + nD, which need not equal 85 -- presumably the remaining papers
# are simply not part of this file. Confirm before relying on this number.
print("Total no. of papers: ",85)
print("By Hamilton: ",nH)
print("By Madison: ",nM)
print("Disputed : ",nD)
# This allows you to ignore certain common words in English
# You may want to experiment by choosing the second option or your own
# list of stop words, but be sure to keep 'HAMILTON' and 'MADISON' in
# this list at a minimum, as their names appear in the text of the papers
# and leaving them in could lead to unpredictable results
# NOTE(review): CountVectorizer lowercases text by default, so these
# upper-case entries likely never match a token. Adding the lower-case forms
# would change the vocabulary size, so verify downstream cells before doing so.
stop_words = text.ENGLISH_STOP_WORDS.union({'HAMILTON','MADISON'})
# stop_words = {'HAMILTON','MADISON'} # can comment out both and give..
## Form bag of words model using words that appear in at least 10 documents
# (min_df is a document-frequency threshold). The stop words are handed over
# as a list: recent scikit-learn releases validate this parameter and reject
# set/frozenset values, while a list is accepted by every version.
vectorizer = text.CountVectorizer(stop_words=sorted(stop_words),min_df=10)
# Rows of X are ordered Hamilton, then Madison, then disputed papers.
X = vectorizer.fit_transform(papersH+papersM+papersD).toarray()
# Inspect vectorizer.vocabulary_ to see the full list of words remaining after
# filtering out stop words and words below the min_df threshold.

Total no. of papers:  85
By Hamilton:  51
By Madison:  17
Disputed :  12


In [18]:
# Split word counts into separate matrices. Rows of X are stacked in the
# order Hamilton, Madison, disputed, so the paper counts give the boundaries.
XH, XM, XD = X[:nH,:], X[nH:nH+nM,:], X[nH+nM:,:]
print(XH.shape)
print(XM.shape)
print(XD.shape)
# Vocabulary size (number of columns of X), used for Laplace smoothing.
# Derived from the data instead of being hard-coded so it stays correct when
# min_df, the stop-word list, or the corpus changes.
d = X.shape[1]
# Per-word buffers that later cells fill in.
fH = [0] * d
fM = [0] * d
Pm = [0] * d
Ph = [0] * d

(51, 1307)
(17, 1307)
(12, 1307)


In [19]:
# Estimate probability of each word in vocabulary being used by Hamilton.
# Laplace (add-one) smoothing: every word gets one pseudo-count and the
# denominator grows by the vocabulary size d, so no probability is zero and
# the ratio below never divides by zero.
# The column sums and their total are computed once with numpy instead of
# re-summing the whole list inside a Python loop on every iteration.
column_sumsa = XH.sum(axis=0)
fH = (column_sumsa + 1) / (column_sumsa.sum() + d)

# Estimate probability of each word in vocabulary being used by Madison
column_sumsb = XM.sum(axis=0)
fM = (column_sumsb + 1) / (column_sumsb.sum() + d)

# Compute ratio of these probabilities (element-wise, one entry per word)
fH = np.array(fH)
fM = np.array(fM)
fratio = fH/fM
print(fratio.shape)
print(fH.shape)
print(fM.shape)

(1307,)
(1307,)
(1307,)


In [20]:
# Compute prior probabilities from the number of papers of known authorship
# (previously hard-coded as 51/68 and 17/68).
n_known = nH + nM
piH = nH / n_known
print("Hamilton prior probability: ", piH)
piM = nM / n_known
print("Madison prior probability: ", piM)
H = 0
M = 0
# Work in log space: multiplying one power per vocabulary word can underflow
# to 0 or overflow to inf, whereas the sum of logs stays well-behaved.
# fratio is strictly positive thanks to the Laplace smoothing done earlier.
log_fratio = np.log(fratio)
log_prior_ratio = np.log(piH) - np.log(piM)
for doc_num, xd in enumerate(XD, start=1): # Iterate over disputed documents
    # Log of the Naive Bayes ratio:
    #   log LR = sum_i xd[i] * log(fH[i] / fM[i]) + log(piH / piM)
    log_LR = float(np.dot(xd, log_fratio)) + log_prior_ratio
    # Exponentiate only for display; a huge ratio may legitimately print inf.
    with np.errstate(over='ignore'):
        LR = np.exp(log_LR)
    print("Likelihood Ratio for disputed document ", doc_num, ": ", LR)
    # LR compares the two posteriors, so the decision boundary is LR = 1
    # (log LR = 0); 0.5 would be the boundary for a probability, not a ratio.
    if log_LR > 0:
        print('Hamilton')
        H +=1
    else:
        print('Madison')
        M +=1
print("")
print("**Final results: **")
print("No. of essays written by Hamilton, as predicted: ",H)
print("No. of essays written by Madison, as predicted: ",M)

Hamilton prior probability:  0.75
Madison prior probability:  0.25
Likelihood Ratio for disputed document  1 :  8.24076739239e-17
Madison
Likelihood Ratio for disputed document  2 :  2.28472868133e-13
Madison
Likelihood Ratio for disputed document  3 :  3.32498959998e-43
Madison
Likelihood Ratio for disputed document  4 :  6.38529052403e-14
Madison
Likelihood Ratio for disputed document  5 :  40256674340.6
Hamilton
Likelihood Ratio for disputed document  6 :  6939.36582376
Hamilton
Likelihood Ratio for disputed document  7 :  4.67013870929e-10
Madison
Likelihood Ratio for disputed document  8 :  3.9693293703e+17
Hamilton
Likelihood Ratio for disputed document  9 :  1380849.24311
Hamilton
Likelihood Ratio for disputed document  10 :  6.80424307554e-13
Madison
Likelihood Ratio for disputed document  11 :  3.02279231828e-25
Madison
Likelihood Ratio for disputed document  12 :  1.87544658847e-19
Madison

**Final results: **
No. of essays written by Hamilton, as predicted:  4
No. of essays 