In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib #for fuller functionality, incl. colors
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
from matplotlib.colors import LogNorm
import pandas as pd
from IPython.display import display
import csv
import requests
from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup, Comment
import urllib2
import html2text
import os.path

# Remove Deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# for nicer printing
from prettytable import PrettyTable

#increase number of columns can see in pandas
pd.set_option('display.max_columns', 100)

**LOAD DF, GET RID OF "CANDIDATES" WITH ZERO VOTES**

In [47]:
df = pd.DataFrame.from_csv("text_files/senate_results_EDA_states_all_years.csv")

#clean up zero-value "candidates"

df = df[df['votes_state']!=0]

display(df.head(5))
display(df[(df['year']==2014) & (df['state_abbr']=="OK")].head())

Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner
0,2010,Alexander Giannoulias,IL,1719478.0,3703901,1778698,46.4%,0
1,2010,Alvin Greene,SC,364598.0,1318794,810771,27.6%,0
3,2010,Barbara Boxer,CA,5218441.0,10000093,5218441,52.2%,1
4,2010,Barbara Mikulski,MD,1140531.0,1833304,1140531,62.2%,1
7,2010,Billy Wilson,KY,1214.0,1356468,755411,0.1%,0


Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner
290,2014,Aaron DeLozier,OK,7786.0,819679,557537,0.9%,0
375,2014,James Inhofe,OK,557537.0,819679,557537,68.0%,1
386,2014,Joan Farr,OK,10534.0,819679,557537,1.3%,0
421,2014,Matt Silverstein,OK,233932.0,819679,557537,28.5%,0
446,2014,Ray Woods,OK,9890.0,819679,557537,1.2%,0


Idea: apply URL as best as possible from name
for those candidates with > 5% vote share that didn't match to a url, see if can get a match
For all that match, try to pull table

**Determine which names need manual correction**

In [48]:
unmatched_names_dict = {
"Mike Crapo":"Michael_Crapo",
"Bernard Sanders":"Bernie_Sanders",
"John Reed":"Jack_Reed",
"Thomas Carper":"Tom_Carper",
"Mike Crapo":"Michael_Crapo",
"Charles Grassley":"Chuck_Grassley",
"Shelley Capito":"Shelley_Moore_Capito",
"Edward Markey":"Ed_Markey",
"Joe Manchin":"Joe_Manchin_III",
"Jonathan Dine":"Jonathan_Dine",
"Christopher Coons":"Chris_Coons",
"Dan Coats":"Daniel_Coats",
"Robert Casey":"Bob_Casey",
"Timothy Kaine":"Tim_Kaine",
"Russ Feingold":"Russell_Feingold",
"Denny Rehberg":"Dennis_Rehberg",
"William Cassidy":"Bill_Cassidy",
"Alison Lundergan Grimes":"Alison_Grimes",
"Albert Gore":"Al_Gore",
"David VanDerBeek":"David_Lory_VanderBeek",
"Charlie Melancon":"Charles_Melancon",
"Michael Thurmond":"Mike_Thurmond",
"Pete Hoekstra":"Peter_Hoekstra",
"Blanche Lincoln":"Blanche_Lambert_Lincoln",
"Charles Summers":"Charlie_Summers",
"Edward Clifford":"edward-clifford-iii",
"Alexander Pires":"Alex_Pires",
"Alexander_Giannoulias":"Alexi_Giannoulias"
}

** CREATE FULL NAME BY REPLACING SPACES TO URL AND MANUALLY MATCHING NAMES (IF APPROPRIATE DUE TO DIFFERENT URL NAMING CONVENTION **

In [49]:
# add full name
def create_full_name(name) :
    if name in unmatched_names_dict.keys() :
        return unmatched_names_dict[name]
    else :
        return name.replace(" ","_")

df['full_name'] = df['candidate'].apply(create_full_name)

display(df.head())
# turn % votes to a decimal
# df['%_votes_state'] = df['%_votes_state'].apply(string_to_dec)


Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner,full_name
0,2010,Alexander Giannoulias,IL,1719478.0,3703901,1778698,46.4%,0,Alexander_Giannoulias
1,2010,Alvin Greene,SC,364598.0,1318794,810771,27.6%,0,Alvin_Greene
3,2010,Barbara Boxer,CA,5218441.0,10000093,5218441,52.2%,1,Barbara_Boxer
4,2010,Barbara Mikulski,MD,1140531.0,1833304,1140531,62.2%,1,Barbara_Mikulski
7,2010,Billy Wilson,KY,1214.0,1356468,755411,0.1%,0,Billy_Wilson


**GET INFORMATION AND WRITE TO TEXT FILES FOR SUBSQUENT ANALYSIS  
ALSO, CHECK UNMATCHED NAMES**

In [50]:
### GET URLs, WRITE TO FILE ###
#~~UNCOMMENT TO ACTUALLY WRITE~~#

url_names = {}
unmatched_names = []

for name, results, state in zip(df['full_name'], df['%_votes_state'], df['state_abbr']) :
    try :
        #if file already exists, skip this to take less time
        if os.path.isfile('text_files/Jay_Townsend.txt') :
            continue
        
        url = 'http://www.ontheissues.org/Senate/'+underscore_name+'.htm'
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html)
        page = urllib2.urlopen(url)
        html_content = page.read()
        #ignore errors to get to write
        html_content = unicode(html_content, errors='ignore')
        rendered_content = html2text.html2text(html_content)
        
        #write to text file to make scrapable
        with open('text_files/'+name+'.txt', 'w') as f :
            f.write(rendered_content)
        f.close()
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        print "GOOD:", name, state, url, results
    except :
        unmatched_names.append((name, results, state))
        print "BAD:", name, state, url, results

BAD: Alexander_Giannoulias IL http://www.ontheissues.org/Senate/Alexander_Giannoulias.htm 46.4%
GOOD: Alvin_Greene SC http://www.ontheissues.org/Senate/Alvin_Greene.htm 27.6%
GOOD: Barbara_Boxer CA http://www.ontheissues.org/Senate/Barbara_Boxer.htm 52.2%
GOOD: Barbara_Mikulski MD http://www.ontheissues.org/Senate/Barbara_Mikulski.htm 62.2%
BAD: Billy_Wilson KY http://www.ontheissues.org/Senate/Billy_Wilson.htm 0.1%
GOOD: Blanche_Lambert_Lincoln AR http://www.ontheissues.org/Senate/Blanche_Lambert_Lincoln.htm 36.9%
GOOD: Brad_Ellsworth IN http://www.ontheissues.org/Senate/Brad_Ellsworth.htm 40.0%
GOOD: Cam_Cavasso HI http://www.ontheissues.org/Senate/Cam_Cavasso.htm 21.6%
GOOD: Carly_Fiorina CA http://www.ontheissues.org/Senate/Carly_Fiorina.htm 42.2%
GOOD: Chuck_Grassley IA http://www.ontheissues.org/Senate/Chuck_Grassley.htm 64.4%
GOOD: Charles_Schumer NY http://www.ontheissues.org/Senate/Charles_Schumer.htm 66.3%
BAD: Charley_Miller CO http://www.ontheissues.org/Senate/Charley_Mille

TRUE


In [7]:
#SHOW REMAINING UNMATCHED NAMES
unmatched_names = sorted(unmatched_names, key=lambda x: (x[1]), reverse=True)
for name, result, state in unmatched_names :
    print '"'+name+'":,', result, state

"Curt Gottshall":, 8.0% WY
"Paul Strauss":, 77.3% DC
"Mike Crapo":, 71.2% ID
"Bernard Sanders":, 71.1% VT
"John Reed":, 70.7% RI
"Glenda Richmond":, 7.0% DC
"Thomas Carper":, 66.4% DE
"Mike Crapo":, 66.1% ID
"Charles Grassley":, 64.4% IA
"Shelley Capito":, 62.1% WV
"Edward Markey":, 62.0% MA
"Joe Manchin":, 60.5% WV
"Charles Grassley":, 60.2% IA
"David Nolan":, 6.7% AZ
"Dan Cox":, 6.5% MT
"Ray Writz":, 6.2% ID
"Jonathan Dine":, 6.1% MO
"Christopher Coons":, 55.8% DE
"Dan Coats":, 54.6% IN
"Robert Casey":, 53.6% PA
"Timothy Kaine":, 52.5% VA
"Andrew Horning":, 5.8% IN
"Scott Bradley":, 5.7% UT
"Robert Zadek":, 5.6% IL
"Gary Swing":, 5.5% AZ
"Lucy Brenton":, 5.5% IN
"Robert Garrard":, 5.5% KS
"Rebecca Sink-Burris":, 5.4% IN
"Charley Miller":, 5.2% CO
"Trevor Drown":, 5.2% AR
"Jonathan Dine":, 5.1% MO
"John Daniel":, 5.0% DC
"Russ Feingold":, 47.0% WI
"Russ Feingold":, 46.8% WI
"Alexander Giannoulias":, 46.4% IL
"Denny Rehberg":, 44.8% MT
"Thomas Smith":, 44.7% PA
"William Cassidy":, 41.0

In [40]:
### GET URL, WRITE TO FILE ###
#~~UNCOMMENT TO ACTUALLY WRITE~~#

# url = 'http://www.ontheissues.org/Senate/Jack_Conway.htm'
url = 'http://www.ontheissues.org/Senate/Jay_Townsend.htm'    
# url = 'http://www.ontheissues.org/Senate/Harry_Reid.htm'
# url = 'http://www.ontheissues.org/Senate/Jerry_Moran.htm'
# url = 'http://www.ontheissues.org/Senate/fhqgwd.htm'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
page = urllib2.urlopen(url)
html_content = page.read()
html_content = unicode(html_content, errors='ignore')
rendered_content = html2text.html2text(html_content)

#write to text file to make scrapable
with open('text_files/curr_page.txt', 'w') as f :
    f.write(rendered_content)
f.close()
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

with open('text_files/curr_page.txt') as f :
    text = f.readlines()
f.close()

topic_start = False

poss_topics = ["topic 1:", "topic 2:","topic 3:","topic 4:","topic 5:","topic 6:","topic 7:","topic 8:",
              "topic 9:","topic 10:","topic 11:","topic 12:","topic 13:","topic 14:","topic 15:","topic 16:",
              "topic 17:","topic 18:","topic 19:","topic 20:"]

topics_dict = {}
keep_lines = False

for line in text :
    
    if "topic 1:" in line :
        topic_start = True
        
    #if are in topics section...
    if topic_start == True :

        #get 2nd line of topic description
        if keep_lines == True :
            #append until hit closing "]" in the 2nd line
            topic_descrip = topic_descrip+" "+line[:line.find("]")]
            topics_dict[topic] = topic_descrip

        #if a new topic...
        if any(topic in line for topic in poss_topics):
            
            #reset topics_dict
            topics_dict = {}
            
            #get the topic number
            topic = line[line.find("topic")+6:line.find(":")]
            
            #if "]" not in line than topic description is spaced over two lines...
            if "]" not in line :
                #get the start of topic_descrip to combine later
                topic_descrip = line[line.find(":")+2:].strip()
                #keep a line for drawdown
                keep_lines = True
            else:
                #pull topic_description fully from current line
                topic_descrip = line[line.find(":")+2:line.find("]")].strip()
                topics_dict[topic] = topic_descrip
                keep_lines = False
    
        #reset: don't read two lines at once for topic decr any longer as long as not starting a new topic    
        else:
            keep_lines = False

print topics_dict

1
2
3
4
8
9
10
5
6
7
18
19
20
11
12
13
15
16
14
17
{'17': 'US out of Iraq & Afghanistan'}


**ISSUES RUN INTO:**
- Any document with Doctype declared "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "" target="_blank"http://www.w3.org/TR/html4/loose.dtd"> would not render text with html2text. Have to html_content = unicode(html_content, errors='ignore')  
- Different questions per year; have to see which ones are applicable
- Improper *ordering* of questions as well within a table, for example http://www.ontheissues.org/Senate/Blanche_Lambert_Lincoln.htm
