# Artificial Intelligence Analysis of U.S. President’s Wikipedia Pages
import packages

In [1]:
import pandas as pd
import numpy as np
import wikipedia
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import string


from bokeh.layouts import row
from bokeh.plotting import figure, show, output_file

Alter stop words and lemmatize/stem

In [2]:
# nltk.download('stopwords')
# nltk.download('wordnet')
stop_words = stopwords.words('english')
stop_words.append("–")
stop_words.append("president")
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

List of presidents

In [3]:
president_list = pd.read_csv("Presidents_List.csv")

Get wikipedia page for each president

In [4]:
president_summary = {}

for president in president_list.President:
    president_summary[president] = wikipedia.page(president).content

Clean wikipedia pages

In [5]:
for key, value in president_summary.items():
    president_summary[key] = value.replace("\n"," ").replace("\'", '').replace('U.S.',"usa").translate(str.maketrans('', '', string.punctuation)).lower()

In [6]:
for key, value in president_summary.items():
    president_summary[key] = [lemmatizer.lemmatize(word) for word in value.split() if word not in stop_words]

In [7]:
docs = [TaggedDocument(words=doc, tags=[president]) for president, doc in president_summary.items()]

Create doc2vec model

In [8]:
model = Doc2Vec(docs, vector_size = 500, window = 10000, epohcs = 10000, min_count = 10, dm = 0)
# model = Doc2Vec(docs, vector_size = 500, window = 500, epohcs = 1000, min_count = 10, dm = 0)

In [9]:
president_vectors = {}

for key, value in president_summary.items():
    president_vectors[key] = model.infer_vector(value)

Example vector

In [10]:
president_vectors["George Washington"]

array([ 0.01831611, -0.47972888,  0.09929533,  0.42860243, -0.22098155,
       -0.08425616, -0.227983  , -0.13757825,  0.47061104,  0.49542436,
        0.43103248, -0.27305454,  0.5245514 ,  0.5202317 , -0.77000225,
        0.38920757,  0.42298615, -0.02251006, -0.07992351,  0.34191126,
       -0.12678705,  0.32932577,  0.11811235,  0.32906502,  0.43123484,
        0.2219002 , -0.17887728, -0.17891474,  0.23089007, -0.54855436,
       -0.64262825,  0.23106219, -0.16969416,  0.00301033,  0.0360023 ,
       -0.5136737 , -0.03666347, -0.21844429,  0.54448223,  0.3697777 ,
       -0.21822606,  0.01797543, -0.23032805,  0.57217115, -0.22993429,
        0.36369598, -0.09561518, -0.05892869, -0.09020491,  0.2584546 ,
       -0.44631273,  0.06878059,  0.31699482,  0.0444587 ,  0.4905105 ,
        0.13186859, -0.13088551,  0.51929176, -0.28135827, -0.56241035,
       -0.08393034, -0.1270318 ,  0.19621807, -0.17325787,  0.00220444,
       -0.23690209,  0.00994045, -0.3285618 ,  0.58687395,  0.35

In [11]:
user_words = input("Enter a phrase and I'll tell you what president it is most closely reltaed to: ")
user_words = user_words.replace("\n"," ").replace("\'", '').replace('U.S.',"usa").translate(str.maketrans('', '', string.punctuation)).lower()

user_words = [lemmatizer.lemmatize(word) for word in user_words.split() if word not in stop_words]

print(user_words)

vector = model.infer_vector(user_words)

model.docvecs.most_similar([vector])

Enter a phrase and I'll tell you what president it is most closely reltaed to: vietnam texas
['vietnam', 'texas']


[('Lyndon B. Johnson', 0.7626705169677734),
 ('Dwight Eisenhower', 0.7375028729438782),
 ('Richard Nixon', 0.7336450815200806),
 ('John F. Kennedy', 0.7012069225311279),
 ('Ronald Reagan', 0.6843741536140442),
 ('Gerald Ford', 0.6612430214881897),
 ('Bill Clinton', 0.6369781494140625),
 ('Calvin Coolidge', 0.6149967908859253),
 ('George W. Bush', 0.6146621704101562),
 ('George Bush', 0.6108454465866089)]

Using a president's own wikipedia page, find the most similar wikipedia page. Put output into bar chart made with bokeh

In [12]:
top10 = model.docvecs.most_similar([president_vectors["Thomas Jefferson"]])

In [13]:
top10_presidents = [pres[0] for pres in top10]
top10_score = [score[1] * 100 for score in top10]

In [14]:
dot = figure(title="Presidential Similarity", toolbar_location=None, y_range=top10_presidents, x_range=[0,100])
dot.segment(0, top10_presidents, top10_score, top10_presidents, line_width=2, line_color="red", )
dot.circle(top10_score, top10_presidents, size=15, fill_color="red", line_color="red", line_width=3, )
dot.yaxis.axis_label = 'President'
dot.xaxis.axis_label = 'Similarity'

In [15]:
show(row(dot, sizing_mode="scale_width"))