## Breitzman 5/14/2023
### Run a bunch of models on Madison and Hamilton Federalist Papers to Predict Disputed Papers


In [1]:
import sklearn
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only open files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


hamilton = _read_pickle('hamilton.pik')
madison = _read_pickle('madison.pik')
disputed = _read_pickle('disputed.pik')

In [3]:
train = []
test = []

# Known-author papers become rows of [id, author code, normalised text];
# Hamilton's rows ('h') are appended first, then Madison's ('m').
for author_code, papers in (('h', hamilton), ('m', madison)):
    for idx in range(len(papers)):
        paper = papers[idx]
        train.append([paper[0], author_code, paper[1][0].strip().lower()])

# Disputed papers carry no author label: rows are [id, normalised text].
for idx in range(len(disputed)):
    paper = disputed[idx]
    test.append([paper[0], paper[1][0].strip().lower()])


In [4]:
# Sanity check: number of training rows, then number of disputed rows
print(f"{len(train)} {len(test)}")

65 12


In [5]:
import pandas as pd

# Tabulate the training rows and convert the paper id column to a numeric dtype.
train_df = pd.DataFrame(train, columns=['id', 'author', 'text']).assign(
    id=lambda frame: pd.to_numeric(frame['id'])
)


In [6]:
# Features are the paper texts; the target is the author code of each paper.
X, y = train_df['text'], train_df['author']

In [7]:
# Tabulate the disputed papers and convert the paper id column to a numeric dtype.
test_df = pd.DataFrame(test, columns=['id', 'text']).assign(
    id=lambda frame: pd.to_numeric(frame['id'])
)


In [8]:
# The disputed texts were already stripped and lower-cased when the `test`
# rows were built, so the column is used as-is. Re-normalising it here would
# mutate test_df in place without changing any value, and the training text
# gets no such second pass.
X_test = test_df['text']

In [9]:
def mPercent(results):
    """Print the percentage of predictions that are attributed to Madison.

    Parameters
    ----------
    results : iterable
        Predicted author labels; an entry equal to 'm' counts as Madison.

    Prints the percentage followed by a blank line and returns None. When
    `results` is empty there is nothing to average, so 'n/a' is reported
    instead of raising ZeroDivisionError.
    """
    # Materialise once so generators are supported and the length is known.
    labels = list(results)
    tcount = len(labels)
    if tcount == 0:
        print('% Disputed attributed to Madison:', 'n/a (no predictions)')
        print()
        return
    mcount = sum(1 for label in labels if label == 'm')
    print('% Disputed attributed to Madison:', 100.0 * mcount / tcount)
    print()

In [10]:
"""
Build and test multiple models via scikit-learn.
X is a pandas Series holding the text of the known Hamilton and Madison papers.
y is a pandas Series holding the matching author labels ('h' or 'm').
X_test is a pandas Series holding the text of the disputed papers.
"""

SEED = 0  # arbitrary fixed seed so a fresh "Run All" reproduces the same scores
CV = 5    # number of cross-validation folds

smallVocab5 = ['against','within','inhabitants','whilst','upon']
# min_df is not passed: scikit-learn ignores it when a fixed vocabulary is given.
tfidf = TfidfVectorizer(analyzer="word",
                        binary=False,
                        vocabulary=smallVocab5)

X_transformed = tfidf.fit_transform(X)
# Encode the author labels as integers; lb maps predictions back to 'h'/'m' below.
lb = LabelEncoder()
y_transformed = lb.fit_transform(y)
X_test_transformed = tfidf.transform(X_test)

# Estimators with internal randomness are seeded so results are repeatable.
models = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5, random_state=SEED),
    RandomForestClassifier(n_estimators=25, max_depth=3, random_state=SEED),
    LinearSVC(random_state=SEED),
    SVC(gamma=2, C=1),
    ComplementNB(),
    AdaBoostClassifier(random_state=SEED),
]

for model in models:
    model_name = model.__class__.__name__
    # Cross-validated accuracy on the papers of known authorship.
    accuracies = cross_val_score(model, X_transformed, y_transformed,
                                 scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        # str(...)[:5] truncates (does not round) the printed score.
        print(model_name,"fold:",fold_idx,"accuracy:",str(accuracy)[:5])
    print(model_name,"avg accuracy:",str(accuracies.mean())[:5])
    # Refit on every known paper, then attribute the disputed ones.
    model.fit(X_transformed, y_transformed)
    y_final_predicted = model.predict(X_test_transformed)
    y_final_predicted_labeled = lb.inverse_transform(y_final_predicted)
    mPercent(y_final_predicted_labeled)


KNeighborsClassifier fold: 0 accuracy: 1.0
KNeighborsClassifier fold: 1 accuracy: 1.0
KNeighborsClassifier fold: 2 accuracy: 1.0
KNeighborsClassifier fold: 3 accuracy: 1.0
KNeighborsClassifier fold: 4 accuracy: 1.0
KNeighborsClassifier avg accuracy: 1.0
% Disputed attributed to Madison: 100.0

DecisionTreeClassifier fold: 0 accuracy: 1.0
DecisionTreeClassifier fold: 1 accuracy: 0.846
DecisionTreeClassifier fold: 2 accuracy: 1.0
DecisionTreeClassifier fold: 3 accuracy: 1.0
DecisionTreeClassifier fold: 4 accuracy: 1.0
DecisionTreeClassifier avg accuracy: 0.969
% Disputed attributed to Madison: 100.0

RandomForestClassifier fold: 0 accuracy: 1.0
RandomForestClassifier fold: 1 accuracy: 0.846
RandomForestClassifier fold: 2 accuracy: 1.0
RandomForestClassifier fold: 3 accuracy: 1.0
RandomForestClassifier fold: 4 accuracy: 1.0
RandomForestClassifier avg accuracy: 0.969
% Disputed attributed to Madison: 100.0

LinearSVC fold: 0 accuracy: 1.0
LinearSVC fold: 1 accuracy: 1.0
LinearSVC fold: 2 a