### Use Naive Bayes on the Federalist Papers to identify the author of the 12 disputed papers
#### It is unlikely that Jay authored any of the disputed papers, so the Naive Bayes model is built using only the Hamilton and Madison papers
#### Breitzman 5/16/2023

In [1]:
import pickle
# Load the Federalist paper collections that were previously stored by
# GutenbergFederalistPickle.ipynb, one pickle file per collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so these .pik files must come from a trusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

hamilton = _read_pickle('hamilton.pik')
madison = _read_pickle('madison.pik')
disputed = _read_pickle('disputed.pik')
joint = _read_pickle('joint.pik')

In [2]:
from nltk.tokenize import word_tokenize

# Punctuation-like tokens that should not be treated as words.
kills = [
    ',', '.', "''", '',
    ';', '-', ')', '(',
]

# Maps each word mentioned in any Federalist paper to the number of
# papers containing that word.
completeDict = dict()

# One list per collection; each list will hold one word-frequency
# dictionary per paper in that collection.
hamiltonDicts, madisonDicts, disputedDicts, jointDicts = [], [], [], []

def getDocDict(str1):
    """Return a dictionary mapping each token in `str1` to its frequency.

    The string is lower-cased and stripped of surrounding whitespace, then
    split into tokens with NLTK's word_tokenize. No filtering is done here:
    every token the tokenizer emits is counted.

    e.g. for str1 = 'quick brown fox is quick.' the word counts are
    quick:2, brown:1, fox:1, is:1 (plus an entry for any punctuation
    token the tokenizer produces).
    """
    counts = {}
    for token in word_tokenize(str1.lower().strip()):
        counts[token] = counts.get(token, 0) + 1
    return counts

# Append one word-frequency dictionary per paper to each collection's list.
# The text handed to getDocDict is element [1][0] of every stored entry
# (presumably the body of the paper -- confirm against the notebook that
# wrote the pickles).
hamiltonDicts.extend(getDocDict(paper[1][0]) for paper in hamilton)
madisonDicts.extend(getDocDict(paper[1][0]) for paper in madison)
jointDicts.extend(getDocDict(paper[1][0]) for paper in joint)
disputedDicts.extend(getDocDict(paper[1][0]) for paper in disputed)

# Number of papers processed for each collection.
print(len(hamiltonDicts),len(madisonDicts),len(jointDicts),len(disputedDicts))

51 14 3 12


In [3]:
# completeDict maps every word to its document frequency,
# i.e. the number of Federalist papers that contain the word.
completeDict = {}

kills = [',','.',"''",'',';','-',')','(']
authDicts = [hamiltonDicts,madisonDicts,jointDicts,disputedDicts]
for paperDicts in authDicts:
    for paperDict in paperDicts:
        # Each key of a per-paper dictionary counts once for that paper.
        for token in paperDict:
            if token in kills:
                continue
            completeDict[token] = completeDict.get(token, 0) + 1

# trimDict is the subset of completeDict considered useful: words found in
# at least 3 papers but in fewer than 80 of them. (With the 51+14+3+12 = 80
# papers reported above, the upper bound drops words present in every paper.)
trimDict = {
    word
    for word, docFreq in completeDict.items()
    if 3 <= docFreq < 80
}

print(len(completeDict),len(trimDict))

8492 3967


In [49]:
# Build the Naive Bayes word-count tables for Hamilton and Madison.
# Every word of trimDict starts with a pseudo-count of 2 in both tables so
# that no word ends up with zero probability (additive, Laplace-style
# smoothing).
hamiltonNBwordDicts = dict.fromkeys(trimDict, 2)
madisonNBwordDicts = dict.fromkeys(trimDict, 2)

# Add each author's raw per-paper counts for the words kept in trimDict.
for paperCounts in hamiltonDicts:
    for word, count in paperCounts.items():
        if word in trimDict:
            hamiltonNBwordDicts[word] += count

for paperCounts in madisonDicts:
    for word, count in paperCounts.items():
        # Both tables were seeded with exactly the trimDict words, so this
        # single membership test also guarantees the key exists.
        if word in trimDict:
            madisonNBwordDicts[word] += count

# Denominators: total smoothed word count per author.
hamiltonNBdenom = sum(hamiltonNBwordDicts.values())
madisonNBdenom = sum(madisonNBwordDicts.values())

print(hamiltonNBdenom,madisonNBdenom)

64314 27766


In [50]:
# Sanity check: both count tables should have exactly one entry per trimDict word.
print(len(hamiltonNBwordDicts),len(madisonNBwordDicts),len(trimDict))

3967 3967 3967


In [51]:
# List the words whose count is still at (or below) the initial pseudo-count
# of 2 for both authors, together with the two counts.
for word, hCount in hamiltonNBwordDicts.items():
    if hCount <= 2 and madisonNBwordDicts[word] <= 2:
        print(word, hCount, madisonNBwordDicts[word])

watch 2 2
entering 2 2
septennial 2 2
prompted 2 2
abbe 2 2


In [52]:
# Smoothed count of 'upon' across Hamilton's papers (includes the initial pseudo-count).
hamiltonNBwordDicts['upon']

374

In [53]:
# Smoothed count of 'upon' across Madison's papers (includes the initial pseudo-count).
madisonNBwordDicts['upon']

9

In [54]:

    
# Show the per-author totals used as denominators of the word probabilities.
print(hamiltonNBdenom,madisonNBdenom)

64314 27766


In [55]:
# Keep a trimDict word only when its relative frequency reaches .0002 for at
# least one of the two authors; words below that level for both are dropped.
vocab = {
    word
    for word in trimDict
    if (hamiltonNBwordDicts[word]/hamiltonNBdenom >= .0002
        or madisonNBwordDicts[word]/madisonNBdenom >= .0002)
}

print(len(trimDict),len(vocab))

3967 1452


In [56]:
import math
def NB_federalist_predict(docDict,vocab1=trimDict):
    """Predict whether Hamilton or Madison is the likelier author of a document.

    For every word of the document that is in `vocab1`, the word's count in
    the document is multiplied by the log of the word's smoothed relative
    frequency for each author (count table entry divided by that author's
    denominator) and added to that author's score. No class-prior term is
    included; the scores are sums of word log-likelihoods only.

    Parameters:
        docDict: mapping of word -> number of occurrences in the document.
        vocab1:  container of words allowed to contribute to the score.
                 The default is the trimDict object that exists when this
                 function is defined.

    Returns:
        'hamilton' if Hamilton's score is strictly greater than Madison's,
        otherwise 'madison' (so ties, including a document with no usable
        words, are reported as 'madison').

    Words listed in `vocab1` that have no entry in the author count tables
    are skipped instead of raising KeyError, so hand-picked vocabularies may
    safely contain words outside the tables.
    """
    h_pr = m_pr = 0
    for word, count in docDict.items():
        if word not in vocab1:
            continue
        h_count = hamiltonNBwordDicts.get(word)
        m_count = madisonNBwordDicts.get(word)
        if h_count is None or m_count is None:
            # No smoothed count is available for this word, so it cannot be
            # scored; treat it like any other out-of-vocabulary word.
            continue
        weight = float(count)
        h_pr += weight*(math.log(h_count/hamiltonNBdenom))
        m_pr += weight*(math.log(m_count/madisonNBdenom))

    if (h_pr > m_pr):
        return('hamilton')
    return('madison')
    
def check_accuracy(vocab1=trimDict):
    """Score the classifier on the Hamilton and Madison papers themselves.

    Each paper in hamiltonDicts and madisonDicts is classified with
    NB_federalist_predict using `vocab1`, and the prediction is compared
    with the paper's known author.

    Returns a list [percent correct, number right, number wrong].
    """
    labelled = [(hamiltonDicts, 'hamilton'), (madisonDicts, 'madison')]
    right = wrong = 0
    for papers, author in labelled:
        for paper in papers:
            if NB_federalist_predict(paper, vocab1) == author:
                right += 1
            else:
                wrong += 1
    percent = 100*right/(right+wrong)
    return [percent, right, wrong]

print('% correct:',check_accuracy()[0])

% correct: 100.0


In [57]:
# Full result with the default vocabulary: [percent correct, number right, number wrong].
check_accuracy()

[100.0, 65, 0]

In [58]:
# Collect (and print) the words whose relative frequency differs by more
# than a factor of 5 between the two authors, in either direction.
# Printed columns: word, Hamilton/Madison ratio, Madison/Hamilton ratio.
interesting = []
for word in trimDict:
    hProb = hamiltonNBwordDicts[word]/hamiltonNBdenom
    mProb = madisonNBwordDicts[word]/madisonNBdenom
    if not (mProb/hProb > 5 or hProb/mProb > 5):
        continue
    print(word, hProb/mProb, mProb/hProb)
    interesting.append(word)

precision 0.17269023851727464 5.790715263271626
indebted 0.17269023851727464 5.790715263271626
involves 0.14390853209772886 6.9488583159259525
sphere 0.12951767888795596 7.72095368436217
residence 0.16189709860994494 6.1767629474897365
stamped 0.14390853209772886 6.9488583159259525
consequently 0.1850252555542228 5.404667579053519
kind 11.22486550362285 0.0890879271272558
lesser 0.19187804279697182 5.211643736944464
compilers 0.17269023851727464 5.790715263271626
slaves 0.17269023851727464 5.790715263271626
nomination 5.180707155518238 0.19302384210905427
composing 0.09962898375996612 10.03723978967082
courts 8.418649127717138 0.11878390283634108
patriotic 0.17269023851727464 5.790715263271626
democratic 0.17269023851727464 5.790715263271626
transferred 0.17269023851727464 5.790715263271626
intended 7.986923531423951 0.12520465434100816
community 7.663129334204061 0.1304949918483747
2. 0.17269023851727464 5.790715263271626
democracy 0.07195426604886443 13.897716631851905
pointing 0.172

In [59]:
# Same scan for strongly author-specific words, but tokens listed in
# kills2 are left out of both the printout and the resulting list.
# Printed columns: word, Hamilton/Madison ratio, Madison/Hamilton ratio.
kills2 = ['--','1.','2.']
interesting = []
for word in trimDict:
    hProb = hamiltonNBwordDicts[word]/hamiltonNBdenom
    mProb = madisonNBwordDicts[word]/madisonNBdenom
    lopsided = mProb/hProb > 5 or mProb/hProb <.2
    if lopsided and word not in kills2:
        print(word, hProb/mProb, mProb/hProb)
        interesting.append(word)


precision 0.17269023851727464 5.790715263271626
indebted 0.17269023851727464 5.790715263271626
involves 0.14390853209772886 6.9488583159259525
sphere 0.12951767888795596 7.72095368436217
residence 0.16189709860994494 6.1767629474897365
stamped 0.14390853209772886 6.9488583159259525
consequently 0.1850252555542228 5.404667579053519
kind 11.22486550362285 0.0890879271272558
lesser 0.19187804279697182 5.211643736944464
compilers 0.17269023851727464 5.790715263271626
slaves 0.17269023851727464 5.790715263271626
nomination 5.180707155518238 0.19302384210905427
composing 0.09962898375996612 10.03723978967082
courts 8.418649127717138 0.11878390283634108
patriotic 0.17269023851727464 5.790715263271626
democratic 0.17269023851727464 5.790715263271626
transferred 0.17269023851727464 5.790715263271626
intended 7.986923531423951 0.12520465434100816
community 7.663129334204061 0.1304949918483747
democracy 0.07195426604886443 13.897716631851905
pointing 0.17269023851727464 5.790715263271626
chain 0.

In [60]:
# Hand-picked five-word candidate vocabulary.
small10 = ['upon','on','very','natural','concurrent']

In [61]:
# Hand-picked six-word candidate vocabulary.
small11 = ['upon','against','whilst','inhabitants','dismission','within']

In [62]:
# Hand-picked thirteen-word candidate vocabulary.
smallVocab2 = ['obviated','patriotic','lesser','composing','although','whilst','consequently','nomination','enough','while','inconveniency','sphere','upon']

In [63]:

def report1(words=trimDict):
    """Print training-set accuracy and the split of the disputed papers.

    The vocabulary `words` is echoed first when it has fewer than 10
    entries. The accuracy from check_accuracy(words) is printed next,
    followed by how many of the papers in disputedDicts
    NB_federalist_predict attributes to Madison and how many to Hamilton
    (every prediction other than 'madison' is counted for Hamilton).
    """
    if len(words) < 10:
        print(words)
    print(str(check_accuracy(words)[0])+'% accuracy')

    predictions = [NB_federalist_predict(paper, words) for paper in disputedDicts]
    madison_votes = predictions.count('madison')
    hamilton_votes = len(predictions) - madison_votes
    print("disputed papers: madison:"+str(madison_votes)+
          ', hamilton:'+str(hamilton_votes)+'\n')
    
# Vocabularies to evaluate, in order: the automatically selected
# `interesting` words followed by several hand-picked word lists.
candidate_vocabularies = [
    interesting,
    ['although','composing','involves','confederation','upon'],
    ['although','obviated','composing','whilst','consequently','upon'],
    ['against','within','inhabitants','whilst','powers','upon','while'],
    ['against','upon','whilst','inhabitants','within'],
    ['against','within','inhabitants','whilst','upon'],
    ['against','while','whilst','upon','on'],
    ['concurrent','upon','on','very','natural'],
    ['while','upon','on','inconveniency'],
]
for vocabulary in candidate_vocabularies:
    report1(vocabulary)

100.0% accuracy
disputed papers: madison:12, hamilton:0

['although', 'composing', 'involves', 'confederation', 'upon']
100.0% accuracy
disputed papers: madison:12, hamilton:0

['although', 'obviated', 'composing', 'whilst', 'consequently', 'upon']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['against', 'within', 'inhabitants', 'whilst', 'powers', 'upon', 'while']
100.0% accuracy
disputed papers: madison:12, hamilton:0

['against', 'upon', 'whilst', 'inhabitants', 'within']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['against', 'within', 'inhabitants', 'whilst', 'upon']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['against', 'while', 'whilst', 'upon', 'on']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['concurrent', 'upon', 'on', 'very', 'natural']
98.46153846153847% accuracy
disputed papers: madison:12, hamilton:0

['while', 'upon', 'on', 'inconveniency']
95.38461538461539% accuracy
dis