### Use Naive Bayes on the Federalist Papers to identify the author of the 12 disputed Federalist Papers
#### It's unlikely that Jay authored any of the disputed papers, so the Naive Bayes model is built using only the Hamilton and Madison papers
#### Breitzman 5/16/2023

In [1]:
import pickle
# Load the Federalist papers previously stored by GutenbergFederalistPickle.ipynb.
# One pickle file per group of papers: hamilton, madison, disputed, joint.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
#   only load .pik files that come from a trusted source.
# NOTE(review): later cells read each entry's text as entry[1][0]; the full
#   entry layout is defined by the pickling notebook - confirm there.
with open('hamilton.pik', 'rb') as f:
    hamilton = pickle.load(f)
    
with open('madison.pik', 'rb') as f:
    madison = pickle.load(f)
    
with open('disputed.pik', 'rb') as f:
    disputed = pickle.load(f)

with open('joint.pik', 'rb') as f:
    joint = pickle.load(f)

In [2]:
from nltk.tokenize import word_tokenize


# One (initially empty) list per group of papers. Each list is filled with
# one word -> frequency dictionary per Federalist paper in that group.
hamiltonDicts, madisonDicts, disputedDicts, jointDicts = [], [], [], []

def getDocDict(str1):
    """Return a dictionary mapping each token of ``str1`` to its frequency.

    The string is lower-cased and stripped of leading/trailing whitespace
    before being split with ``word_tokenize``. Every token the tokenizer
    returns is counted (any punctuation it returns as its own token is
    counted as well).

    e.g. for str1 = 'quick brown fox is quick.' the word counts are
    quick:2, brown:1, fox:1, is:1
    """
    counts = {}
    for token in word_tokenize(str1.lower().strip()):
        counts[token] = counts.get(token, 0) + 1
    return counts

# Convert every paper of every group into its word-frequency dictionary.
# The text of a paper is read from entry[1][0] of each corpus entry.
for papers, paperDicts in ((hamilton, hamiltonDicts),
                           (madison, madisonDicts),
                           (joint, jointDicts),
                           (disputed, disputedDicts)):
    paperDicts.extend(getDocDict(paper[1][0]) for paper in papers)

# Sanity check: number of papers in each group.
print(len(hamiltonDicts), len(madisonDicts), len(jointDicts), len(disputedDicts))

51 14 3 12


In [3]:
# completeDict maps every word to its document frequency, i.e. the number
# of Federalist papers (across all four groups) that contain the word.
completeDict = {}

# tokens that are punctuation (or empty) and must not be counted as words
kills = [',','.',"''",'',';','-',')','(']
authDicts = [hamiltonDicts,madisonDicts,jointDicts,disputedDicts]
for authDict in authDicts:
    for docDict in authDict:
        # each key of a per-paper dictionary counts once for that paper
        for word in docDict:
            if word in kills:
                continue
            completeDict[word] = completeDict.get(word, 0) + 1

# trimDict is the subset of completeDict considered useful: words found in
# at least 3 papers but in fewer than 80. The group sizes printed above
# (51 + 14 + 3 + 12) sum to 80, so words present in every paper are dropped.
trimDict = {word for word, docFreq in completeDict.items() if 3 <= docFreq < 80}

print(len(completeDict),len(trimDict))

8492 3967


In [4]:
# Build the Naive Bayes word-count tables for Hamilton and Madison.
# Every word of trimDict starts at laplaceConst instead of 0 (this is
# equivalent to Laplace smoothing), so no word ends up with a zero count.
# Each denominator is the sum of all counts in the matching table.
laplaceConst = 2
hamiltonNBwordDicts = dict.fromkeys(trimDict, laplaceConst)
madisonNBwordDicts = dict.fromkeys(trimDict, laplaceConst)
hamiltonNBdenom = madisonNBdenom = laplaceConst * len(trimDict)

# add the occurrences found in Hamilton's papers
for docDict in hamiltonDicts:
    for word, count in docDict.items():
        if word in trimDict:
            hamiltonNBwordDicts[word] += count
            hamiltonNBdenom += count

# add the occurrences found in Madison's papers
for docDict in madisonDicts:
    for word, count in docDict.items():
        if word in trimDict:
            madisonNBwordDicts[word] += count
            madisonNBdenom += count

print(hamiltonNBdenom,madisonNBdenom)

64314 27766


In [5]:
# Both count tables were seeded from trimDict, so all three sizes should match.
print(len(hamiltonNBwordDicts),len(madisonNBwordDicts),len(trimDict))

3967 3967 3967


In [6]:
# List the words whose count is still at the smoothing seed value for BOTH
# authors, i.e. words that never occur in a Hamilton or Madison paper (they
# entered trimDict only through the joint/disputed papers).
# The threshold is laplaceConst rather than a literal so that the check
# stays correct if the smoothing constant is changed.
for word, hamiltonCount in hamiltonNBwordDicts.items():
    madisonCount = madisonNBwordDicts[word]
    if hamiltonCount <= laplaceConst and madisonCount <= laplaceConst:
        print(word, hamiltonCount, madisonCount)

watch 2 2
prompted 2 2
septennial 2 2
entering 2 2
abbe 2 2


In [7]:
# Smoothed count of 'upon' in Hamilton's papers (laplaceConst + occurrences).
hamiltonNBwordDicts['upon']

374

In [8]:
# Smoothed count of 'upon' in Madison's papers (laplaceConst + occurrences).
madisonNBwordDicts['upon']

9

In [9]:

    
# Show the two denominators again (total smoothed word counts per author).
print(hamiltonNBdenom,madisonNBdenom)

64314 27766


In [10]:
# vocab keeps the words of trimDict whose smoothed relative frequency
# reaches .0002 for at least one of the two authors; words that are rarer
# than that for both Hamilton and Madison are left out.
vocab = {
    word for word in trimDict
    if hamiltonNBwordDicts[word]/hamiltonNBdenom >= .0002
    or madisonNBwordDicts[word]/madisonNBdenom >= .0002
}

print(len(trimDict),len(vocab))

3967 1452


In [11]:
import math
def NB_federalist_predict(docDict,vocab1=trimDict):
    """Predict whether Hamilton or Madison wrote a document.

    For each author the function sums ``count * log(word probability)``
    over the document's words, where the word probability is the author's
    smoothed count divided by that author's denominator. Only word
    likelihoods are compared; no class prior is added.

    Parameters:
        docDict: dictionary mapping each word of the document to the
            number of times it occurs.
        vocab1: container of words allowed to contribute to the score
            (defaults to trimDict). Any object supporting ``in`` works.

    Returns:
        'hamilton' if Hamilton's log-likelihood is strictly larger,
        otherwise 'madison'. A tie (including the case where no word is
        scored) therefore yields 'madison'.
    """
    h_pr = m_pr = 0
    for word, count in docDict.items():
        if word not in vocab1:
            continue
        # A caller may pass words the model was never trained on; they
        # carry no information, so skip them instead of raising KeyError.
        if word not in hamiltonNBwordDicts or word not in madisonNBwordDicts:
            continue
        h_pr += float(count)*math.log(hamiltonNBwordDicts[word]/hamiltonNBdenom)
        m_pr += float(count)*math.log(madisonNBwordDicts[word]/madisonNBdenom)

    return 'hamilton' if h_pr > m_pr else 'madison'
    
def check_accuracy(vocab1=trimDict):
    """Measure prediction accuracy on the Hamilton and Madison papers.

    Every paper in hamiltonDicts and madisonDicts is classified with
    NB_federalist_predict using ``vocab1`` and compared to its known author.

    Returns:
        A list ``[percent correct, number right, number wrong]``.

    Raises:
        ZeroDivisionError: if there are no papers to check.
    """
    labelled = [(hamiltonDicts, 'hamilton'), (madisonDicts, 'madison')]
    right = wrong = 0
    for docDicts, author in labelled:
        for docDict in docDicts:
            if NB_federalist_predict(docDict, vocab1) == author:
                right += 1
            else:
                wrong += 1
    return [100*right/(right+wrong), right, wrong]
    
# Accuracy on the Hamilton/Madison papers with the default vocabulary
# (trimDict); element 0 of the result is the percentage correct.
print('% correct:',check_accuracy()[0])

% correct: 100.0


In [12]:
# Full result: [percent correct, number right, number wrong].
check_accuracy()

[100.0, 65, 0]

In [13]:
# Find each author's "favorite" words: a word is interesting when its
# smoothed probability under one author is more than 5 times its
# probability under the other author.
interesting = []
tableData = []  # rows for the first 225 interesting words at most
for word in trimDict:
    hamiltonPr = hamiltonNBwordDicts[word]/hamiltonNBdenom
    madisonPr = madisonNBwordDicts[word]/madisonNBdenom
    madisonOverHamilton = madisonPr/hamiltonPr
    hamiltonOverMadison = hamiltonPr/madisonPr
    if not (madisonOverHamilton > 5 or hamiltonOverMadison > 5):
        continue
    interesting.append(word)
    if len(tableData) < 225:
        tableData.append([word, madisonOverHamilton, hamiltonOverMadison])

from tabulate import tabulate
print (tabulate(tableData, headers=["FavoriteWord","Madison Pr/Hamilton Pr", "Hamilton Pr/Madison Pr"]))

FavoriteWord      Madison Pr/Hamilton Pr    Hamilton Pr/Madison Pr
--------------  ------------------------  ------------------------
enough                         0.132359                  7.5552
compilers                      5.79072                   0.17269
reform                        10.4233                    0.095939
recommended                    6.17676                   0.161897
indebted                       5.79072                   0.17269
stamped                        6.94886                   0.143909
complied                       5.79072                   0.17269
novelty                        5.79072                   0.17269
administering                  6.94886                   0.143909
universally                    5.79072                   0.17269
pronounced                     5.40467                   0.185025
term                           7.87537                   0.126978
speedy                         5.79072                   0.17269
obviated        

In [14]:
# Hand-picked five-word vocabulary (the same five words are passed, in a
# different order, to one of the Federalist_report calls further down).
small10 = [
    'upon',
    'on',
    'very',
    'natural',
    'concurrent',
]

In [15]:
# Hand-picked six-word vocabulary.
# NOTE(review): presumably a candidate word list for Federalist_report;
# it is not referenced by name in the visible cells.
small11 = [
    'upon',
    'against',
    'whilst',
    'inhabitants',
    'dismission',
    'within',
]

In [16]:
# Hand-picked thirteen-word vocabulary.
# NOTE(review): presumably a candidate word list for Federalist_report;
# it is not referenced by name in the visible cells.
smallVocab2 = [
    'obviated', 'patriotic', 'lesser', 'composing', 'although',
    'whilst', 'consequently', 'nomination', 'enough', 'while',
    'inconveniency', 'sphere', 'upon',
]

In [17]:

#the following checks accuracy on the training set and then
#identifies how many of the disputed papers are by each author
def Federalist_report(words=trimDict):
    """Print a short report for the vocabulary ``words``.

    The report has three parts:
      1. the vocabulary itself (only the first 9 words followed by '...'
         when it holds 10 or more words),
      2. the accuracy of check_accuracy(words) on the Hamilton/Madison papers,
      3. how many of the disputed papers are attributed to each author.

    Parameters:
        words: collection of words used as the vocabulary (defaults to
            trimDict). Lists, tuples and sets are all accepted.

    Returns:
        None; everything is printed.
    """
    if (len(words)<10):
        print(words)
    else:
        # list(...) makes the preview work for sets too; the default
        # trimDict is a set, and slicing it directly raises TypeError.
        preview = list(words)[:9]
        preview.append('...')
        print(preview)

    # Membership is tested once per document word, so use a set for the
    # lookups; this does not change which words are considered members.
    vocabulary = words if isinstance(words, (set, frozenset)) else set(words)

    print(str(check_accuracy(vocabulary)[0])+'% accuracy')

    # Counters are named *Count so they do not shadow the corpus variables.
    madisonCount = hamiltonCount = 0
    for docDict in disputedDicts:
        if (NB_federalist_predict(docDict,vocabulary)=='madison'):
            madisonCount+=1
        else:
            hamiltonCount+=1
    print("disputed papers: madison:"+str(madisonCount)+
          ', hamilton:'+str(hamiltonCount)+'\n')
    
# Run the report for the automatically selected "interesting" words and
# then for several small hand-picked vocabularies, in this order.
for reportWords in (
    interesting,
    ['although','composing','involves','confederation','upon'],
    ['although','obviated','composing','whilst','consequently','upon'],
    ['against','within','inhabitants','whilst','powers','upon','while'],
    ['against','upon','whilst','inhabitants','within'],
    ['against','within','inhabitants','whilst','upon'],
    ['against','while','whilst','upon','on'],
    ['concurrent','upon','on','very','natural'],
    ['while','upon','on','inconveniency'],
):
    Federalist_report(reportWords)

['enough', 'compilers', 'reform', 'recommended', 'indebted', 'stamped', 'complied', 'novelty', 'administering', '...']
100.0% accuracy
disputed papers: madison:12, hamilton:0

['although', 'composing', 'involves', 'confederation', 'upon']
100.0% accuracy
disputed papers: madison:12, hamilton:0

['although', 'obviated', 'composing', 'whilst', 'consequently', 'upon']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['against', 'within', 'inhabitants', 'whilst', 'powers', 'upon', 'while']
100.0% accuracy
disputed papers: madison:12, hamilton:0

['against', 'upon', 'whilst', 'inhabitants', 'within']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['against', 'within', 'inhabitants', 'whilst', 'upon']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['against', 'while', 'whilst', 'upon', 'on']
96.92307692307692% accuracy
disputed papers: madison:12, hamilton:0

['concurrent', 'upon', 'on', 'very', 'natural']
98.46153846153847% acc