In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import process_words
import word_reporting
from naive_bayes import naiveBayes
from knn import kNearestNeighbors as knn
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataframe = pd.read_csv('statements.csv', encoding='utf-8')
# drop id column    
dataframe = dataframe.drop('id', axis=1)
# Move the speaker column to the end of the dataframe
dataframe = dataframe[[c for c in dataframe if c not in ['speaker']] + ['speaker']]
# filter and lemmatize the text column
dataframe = process_words.filter_and_lemmatize_df(dataframe)
dataframe.head()

Unnamed: 0,text,speaker
0,recession marvins competitor closed dozen plan...,obama
1,rise fall journey one nation one people,obama
2,difference difference personal,obama
3,thats weve excluded lobbyist policymaking job ...,obama
4,recognize many still strong desire focus past,obama


In [3]:
reporting_df = word_reporting.create_reporting_df(dataframe, class_col='speaker', class_1='trump', class_2='obama')
reporting_df.head(50)

Unnamed: 0,word,count,trump_count,obama_count,phi
0,abedini,1,0,1,-0.5
1,ability,3,1,2,-0.15
2,able,7,3,4,-0.063636
3,abroad,2,0,2,-0.5
4,absolute,2,2,0,0.5
5,absolutely,1,1,0,0.5
6,absorbing,1,0,1,-0.5
7,accommodate,1,0,1,-0.5
8,accompanied,1,0,1,-0.5
9,according,2,1,1,0.0


In [4]:
# Sort by phi coefficient and show the top 10
reporting_df.sort_values(by='phi', ascending=False).head(10)

Unnamed: 0,word,count,trump_count,obama_count,phi
835,firefighter,2,2,0,0.5
2041,society,2,2,0,0.5
831,financials,1,1,0,0.5
833,finish,3,3,0,0.5
2054,somewhat,1,1,0,0.5
840,five,4,4,0,0.5
2051,someplace,2,2,0,0.5
843,flaw,1,1,0,0.5
2048,someday,4,4,0,0.5
846,florida,7,7,0,0.5


In [5]:
# Show the bottom 10
reporting_df.sort_values(by='phi', ascending=True).head(10)

Unnamed: 0,word,count,trump_count,obama_count,phi
0,abedini,1,0,1,-0.5
1383,message,1,0,1,-0.5
1381,mere,1,0,1,-0.5
1380,mercury,1,0,1,-0.5
1379,mentioned,1,0,1,-0.5
1376,men,4,0,4,-0.5
1375,memory,2,0,2,-0.5
1374,memo,1,0,1,-0.5
1373,member,2,0,2,-0.5
1372,melt,1,0,1,-0.5


In [6]:
# Get the trump_count for the word 'member'
int(reporting_df[reporting_df['word'] == 'member']['trump_count'])

0

In [7]:
# Seperate data into training and test sets
# Create a two lists, one of 80% of the indexes and one of 20% of the indexes. Randomized
indexes = np.arange(len(dataframe))
np.random.shuffle(indexes)
training_indexes = indexes[:int(len(indexes) * .8)]
test_indexes = indexes[int(len(indexes) * .8):]
# Create the training and test dataframes
training_df = dataframe.iloc[training_indexes]
test_df = dataframe.iloc[test_indexes]


In [8]:
training_df.head()

Unnamed: 0,text,speaker
670,way iraq second largest oil reserve anywhere w...,trump
238,meanwhile strengthened defense hardening targe...,obama
948,another one morning amazing day phenomenal result,trump
69,still believe replace acrimony civility gridlo...,obama
293,kept counterattacking wanted get two buddy,obama


In [9]:
test_df.head()

Unnamed: 0,text,speaker
735,going iraq beginning beginning,trump
454,program like unemploymentinsurance student loa...,obama
239,need take challenge face nation make,obama
551,know one reason didnt bomb oil obama didnt wan...,trump
536,people inside massive convention center incred...,trump


In [10]:
model = naiveBayes()

In [11]:
model.train(training_df=dataframe, text_col='text', class_col='speaker')

In [12]:
predictions = model.predict(test_df=dataframe, text_col='text')

In [13]:
# Append predictions to the test_df
dataframe['prediction'] = predictions
dataframe.head()

Unnamed: 0,text,speaker,prediction
0,recession marvins competitor closed dozen plan...,obama,obama
1,rise fall journey one nation one people,obama,obama
2,difference difference personal,obama,obama
3,thats weve excluded lobbyist policymaking job ...,obama,obama
4,recognize many still strong desire focus past,obama,obama


In [14]:
model.accuracy(predictions=dataframe['prediction'], actual=dataframe['speaker'])

0.987

In [15]:
# Find number of words in the model vocabulary to find total number of features
len(model.vocab)

2497

In [16]:
knn_model = knn(k=5)

In [17]:
knn_model.train_on_df(df=dataframe, text_col='text', class_col='speaker', vectorizer='count')

{}
Counter({'recession': 1, 'marvins': 1, 'competitor': 1, 'closed': 1, 'dozen': 1, 'plant': 1, 'let': 1, 'hundred': 1, 'worker': 1, 'go': 1})
Counter({'one': 2, 'rise': 1, 'fall': 1, 'journey': 1, 'nation': 1, 'people': 1})
Counter({'difference': 2, 'personal': 1})
Counter({'thats': 1, 'weve': 1, 'excluded': 1, 'lobbyist': 1, 'policymaking': 1, 'job': 1, 'seat': 1, 'federal': 1, 'board': 1, 'commission': 1})
Counter({'recognize': 1, 'many': 1, 'still': 1, 'strong': 1, 'desire': 1, 'focus': 1, 'past': 1})
Counter({'many': 2, 'used': 1, 'past': 1, 'president': 1, 'republican': 1, 'democrat': 1, 'decade': 1})
Counter({'al': 1, 'qaeda': 1, 'said': 1, 'seek': 1, 'bomb': 1, 'would': 1, 'problem': 1, 'using': 1})
Counter({'honored': 2, 'im': 1, 'grateful': 1})
Counter({'bought': 1, 'day': 1, 'jail': 1, 'cell': 1, 'night': 1, 'bomb': 1, 'threat': 1})
Counter({'hundred': 1, 'fifty': 1, 'year': 1, 'ago': 1, 'torn': 1, 'open': 1, 'terrible': 1, 'conflict': 1})
Counter({'chose': 1, 'take': 1, 're

In [18]:
knn_model.vocab

['abedini',
 'ability',
 'able',
 'abroad',
 'absolute',
 'absolutely',
 'absorbing',
 'accommodate',
 'accompanied',
 'according',
 'accountable',
 'accuracy',
 'accused',
 'achievable',
 'achieve',
 'achieved',
 'achievement',
 'acknowledge',
 'acquired',
 'acrimony',
 'across',
 'act',
 'action',
 'active',
 'activity',
 'actual',
 'actually',
 'ad',
 'add',
 'additional',
 'administration',
 'administrator',
 'admiration',
 'admission',
 'admit',
 'adopt',
 'advance',
 'advanced',
 'advancing',
 'advantage',
 'adversary',
 'advice',
 'affect',
 'affection',
 'affiliate',
 'afford',
 'affordable',
 'affront',
 'afghan',
 'afghanistan',
 'afraid',
 'africa',
 'africato',
 'age',
 'agency',
 'aggression',
 'aggressively',
 'ago',
 'agree',
 'agreed',
 'agreement',
 'agrees',
 'ahead',
 'air',
 'airline',
 'airman',
 'airplane',
 'aisle',
 'al',
 'alabama',
 'alexander',
 'alive',
 'allen',
 'alliance',
 'allow',
 'allowed',
 'allowing',
 'allows',
 'ally',
 'almost',
 'alone',
 'along

In [19]:
knn_model.training_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
test_array, test_classes = knn_model.vectorize_df(df=dataframe, text_col='text', class_col='speaker', vectorizer='count')

{}
Counter({'recession': 1, 'marvins': 1, 'competitor': 1, 'closed': 1, 'dozen': 1, 'plant': 1, 'let': 1, 'hundred': 1, 'worker': 1, 'go': 1})
Counter({'one': 2, 'rise': 1, 'fall': 1, 'journey': 1, 'nation': 1, 'people': 1})
Counter({'difference': 2, 'personal': 1})
Counter({'thats': 1, 'weve': 1, 'excluded': 1, 'lobbyist': 1, 'policymaking': 1, 'job': 1, 'seat': 1, 'federal': 1, 'board': 1, 'commission': 1})
Counter({'recognize': 1, 'many': 1, 'still': 1, 'strong': 1, 'desire': 1, 'focus': 1, 'past': 1})
Counter({'many': 2, 'used': 1, 'past': 1, 'president': 1, 'republican': 1, 'democrat': 1, 'decade': 1})
Counter({'al': 1, 'qaeda': 1, 'said': 1, 'seek': 1, 'bomb': 1, 'would': 1, 'problem': 1, 'using': 1})
Counter({'honored': 2, 'im': 1, 'grateful': 1})
Counter({'bought': 1, 'day': 1, 'jail': 1, 'cell': 1, 'night': 1, 'bomb': 1, 'threat': 1})
Counter({'hundred': 1, 'fifty': 1, 'year': 1, 'ago': 1, 'torn': 1, 'open': 1, 'terrible': 1, 'conflict': 1})
Counter({'chose': 1, 'take': 1, 're

In [21]:
predictions = knn_model.predict(test_array=test_array)



In [22]:
# Lets determine the accuracy of the model
# Lets get the number of correct predictions
np.sum(predictions == test_classes) / len(predictions)

0.608

In [23]:
# Lets try that TFIDF vectorizer
knn_tfidf_model = knn(k=5)
knn_tfidf_model.train_on_df(df=dataframe, text_col='text', class_col='speaker', vectorizer='tfidf')
test_array, test_classes = knn_tfidf_model.vectorize_df(df=dataframe, text_col='text', class_col='speaker', vectorizer='tfidf', type='test')

{'abedini': 6.90875477931522, 'ability': 5.812138499293826, 'able': 4.968820743663249, 'abroad': 6.2166061010848646, 'absolute': 6.90875477931522, 'absolutely': 6.90875477931522, 'absorbing': 6.90875477931522, 'accommodate': 6.90875477931522, 'accompanied': 6.90875477931522, 'according': 6.2166061010848646, 'accountable': 5.5254529391317835, 'accuracy': 6.90875477931522, 'accused': 6.90875477931522, 'achievable': 6.90875477931522, 'achieve': 5.812138499293826, 'achieved': 6.2166061010848646, 'achievement': 6.2166061010848646, 'acknowledge': 6.90875477931522, 'acquired': 6.90875477931522, 'acrimony': 6.90875477931522, 'across': 4.836281906951478, 'act': 4.836281906951478, 'action': 6.2166061010848646, 'active': 6.90875477931522, 'activity': 6.90875477931522, 'actual': 6.2166061010848646, 'actually': 5.121977881431629, 'ad': 4.968820743663249, 'add': 5.812138499293826, 'additional': 6.90875477931522, 'administration': 5.303304908059076, 'administrator': 6.90875477931522, 'admiration': 6.

In [None]:
# Man copilot this tfidf stuff has been a real headache
print(knn_tfidf_model.idf_dict)

In [None]:
np.argsort(test_array_for_argpartition)

In [None]:
[]