In [24]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib #for fuller functionality, incl. colors
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
from matplotlib.colors import LogNorm
import pandas as pd
from IPython.display import display
import csv
import requests
from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup, Comment
import urllib2
import html2text
import os.path

# Remove Deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# for nicer printing
from prettytable import PrettyTable

#increase number of columns can see in pandas
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

**LOAD DF, GET RID OF "CANDIDATES" WITH ZERO VOTES**

In [2]:
df = pd.DataFrame.from_csv("text_files/senate_results_EDA_states_all_years.csv")

#clean up zero-value "candidates"

df = df[df['votes_state']!=0]

display(df.head(5))
display(df[(df['year']==2014) & (df['state_abbr']=="OK")].head())

Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner
0,2010,Alexander Giannoulias,IL,1719478.0,3703901,1778698,46.4%,0
1,2010,Alvin Greene,SC,364598.0,1318794,810771,27.6%,0
3,2010,Barbara Boxer,CA,5218441.0,10000093,5218441,52.2%,1
4,2010,Barbara Mikulski,MD,1140531.0,1833304,1140531,62.2%,1
7,2010,Billy Wilson,KY,1214.0,1356468,755411,0.1%,0


Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner
290,2014,Aaron DeLozier,OK,7786.0,819679,557537,0.9%,0
375,2014,James Inhofe,OK,557537.0,819679,557537,68.0%,1
386,2014,Joan Farr,OK,10534.0,819679,557537,1.3%,0
421,2014,Matt Silverstein,OK,233932.0,819679,557537,28.5%,0
446,2014,Ray Woods,OK,9890.0,819679,557537,1.2%,0


Idea: apply URL as best as possible from name
for those candidates with > 5% vote share that didn't match to a url, see if can get a match
For all that match, try to pull table

**Determine which names need manual correction**

In [3]:
unmatched_names_dict = {
"Mike Crapo":"Michael_Crapo",
"Bernard Sanders":"Bernie_Sanders",
"John Reed":"Jack_Reed",
"Thomas Carper":"Tom_Carper",
"Mike Crapo":"Michael_Crapo",
"Charles Grassley":"Chuck_Grassley",
"Shelley Capito":"Shelley_Moore_Capito",
"Edward Markey":"Ed_Markey",
"Joe Manchin":"Joe_Manchin_III",
"Jonathan Dine":"Jonathan_Dine",
"Christopher Coons":"Chris_Coons",
"Dan Coats":"Daniel_Coats",
"Robert Casey":"Bob_Casey",
"Timothy Kaine":"Tim_Kaine",
"Russ Feingold":"Russell_Feingold",
"Denny Rehberg":"Dennis_Rehberg",
"William Cassidy":"Bill_Cassidy",
"Alison Lundergan Grimes":"Alison_Grimes",
"Charlie Melancon":"Charles_Melancon",
"Michael Thurmond":"Mike_Thurmond",
"Pete Hoekstra":"Peter_Hoekstra",
"Blanche Lincoln":"Blanche_Lambert_Lincoln",
"Charles Summers":"Charlie_Summers",
"Edward Clifford":"edward-clifford-iii",
"Alexander Pires":"Alex_Pires",
"Alexander Giannoulias":"Alexi_Giannoulias",
"Daniel Bongino":"Dan_Bongino",
"John Kennedy":"John_Neely_Kennedy"
}

** CREATE FULL NAME BY REPLACING SPACES TO URL AND MANUALLY MATCHING NAMES (IF APPROPRIATE DUE TO DIFFERENT URL NAMING CONVENTION **

In [4]:
# add full name
def create_full_name(name) :
    if name in unmatched_names_dict.keys() :
        return unmatched_names_dict[name]
    else :
        return name.replace(" ","_")

df['full_name'] = df['candidate'].apply(create_full_name)

display(df.head())
# turn % votes to a decimal
# df['%_votes_state'] = df['%_votes_state'].apply(string_to_dec)


Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner,full_name
0,2010,Alexander Giannoulias,IL,1719478.0,3703901,1778698,46.4%,0,Alexi_Giannoulias
1,2010,Alvin Greene,SC,364598.0,1318794,810771,27.6%,0,Alvin_Greene
3,2010,Barbara Boxer,CA,5218441.0,10000093,5218441,52.2%,1,Barbara_Boxer
4,2010,Barbara Mikulski,MD,1140531.0,1833304,1140531,62.2%,1,Barbara_Mikulski
7,2010,Billy Wilson,KY,1214.0,1356468,755411,0.1%,0,Billy_Wilson


**GET INFORMATION AND WRITE TO TEXT FILES FOR SUBSQUENT ANALYSIS  
ALSO, CHECK UNMATCHED NAMES**

In [25]:
### GET URLs, WRITE TO FILE ###
#~~UNCOMMENT TO ACTUALLY WRITE~~#

url_names = {}
unmatched_names = []

for full_name, candidate, results, state in zip(df['full_name'],df['candidate'], df['%_votes_state'], df['state_abbr']) :
    try :
        #if file already exists, skip this to take less time
        if os.path.isfile('text_files/'+full_name+'.txt') :
            continue
        
        url = 'http://www.ontheissues.org/Senate/'+name+'.htm'
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html)
        page = urllib2.urlopen(url)
        html_content = page.read()
        #ignore errors to get to write
        html_content = unicode(html_content, errors='ignore')
        rendered_content = html2text.html2text(html_content)
        
        #write to text file to make scrapable
        with open('text_files/'+full_name+'.txt', 'w') as f :
            f.write(rendered_content)
        f.close()
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        print "GOOD:", candidate, state, url, results
    except :
        unmatched_names.append((candidate, results, state))
        print "BAD:", candidate, state, url, results

BAD: Albert Gore MS http://www.ontheissues.org/Senate/Albert_Gore.htm 40.3%
BAD: David VanDerBeek NV http://www.ontheissues.org/Senate/Albert_Gore.htm 4.9%


In [30]:
#SHOW REMAINING UNMATCHED NAMES
unmatched_names = sorted(unmatched_names, key=lambda x: (x[1]), reverse=True)
for name, result, state in unmatched_names :
    print '"'+name+'":,', result, state
    
unmatched_names_name_only = [i[0] for i in unmatched_names]

"Albert Gore":, 40.3% MS
"David VanDerBeek":, 4.9% NV


**REMOVE "BAD" NAMES FROM DF, ADD IN URL_NAME**

In [95]:
#add the url to the df
def add_url_name(underscore_name) :
    return 'http://www.ontheissues.org/Senate/'+underscore_name+'.htm'

df['url_name'] = df['full_name'].apply(add_url_name)

#determine whether is a matched or unmatched name
def matched_names(candidate_name) :
    if candidate_name in unmatched_names_name_only :
        return "N"
    else :
        return "Y"

print matched_names("Albert Gore")
    
#add to df
df['matched_name'] = df['candidate'].apply(matched_names)

#get rid of unmatched names from data frame as will not be able to pull in any relevant information
df = df[df['matched_name']=="Y"]
    
display(df.head())
display(df)

N


Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner,full_name,url_name,matched_name
0,2010,Alexander Giannoulias,IL,1719478.0,3703901,1778698,46.4%,0,Alexi_Giannoulias,http://www.ontheissues.org/Senate/Alexi_Gianno...,Y
1,2010,Alvin Greene,SC,364598.0,1318794,810771,27.6%,0,Alvin_Greene,http://www.ontheissues.org/Senate/Alvin_Greene...,Y
3,2010,Barbara Boxer,CA,5218441.0,10000093,5218441,52.2%,1,Barbara_Boxer,http://www.ontheissues.org/Senate/Barbara_Boxe...,Y
4,2010,Barbara Mikulski,MD,1140531.0,1833304,1140531,62.2%,1,Barbara_Mikulski,http://www.ontheissues.org/Senate/Barbara_Miku...,Y
7,2010,Billy Wilson,KY,1214.0,1356468,755411,0.1%,0,Billy_Wilson,http://www.ontheissues.org/Senate/Billy_Wilson...,Y


Unnamed: 0,year,candidate,state_abbr,votes,votes_state,max_state,%_votes_state,winner,full_name,url_name,matched_name
0,2010,Alexander Giannoulias,IL,1719478.0,3703901,1778698,46.4%,0,Alexi_Giannoulias,http://www.ontheissues.org/Senate/Alexi_Gianno...,Y
1,2010,Alvin Greene,SC,364598.0,1318794,810771,27.6%,0,Alvin_Greene,http://www.ontheissues.org/Senate/Alvin_Greene...,Y
3,2010,Barbara Boxer,CA,5218441.0,10000093,5218441,52.2%,1,Barbara_Boxer,http://www.ontheissues.org/Senate/Barbara_Boxe...,Y
4,2010,Barbara Mikulski,MD,1140531.0,1833304,1140531,62.2%,1,Barbara_Mikulski,http://www.ontheissues.org/Senate/Barbara_Miku...,Y
7,2010,Billy Wilson,KY,1214.0,1356468,755411,0.1%,0,Billy_Wilson,http://www.ontheissues.org/Senate/Billy_Wilson...,Y
8,2010,Blanche Lincoln,AR,288156.0,779957,451618,36.9%,0,Blanche_Lambert_Lincoln,http://www.ontheissues.org/Senate/Blanche_Lamb...,Y
10,2010,Brad Ellsworth,IN,697775.0,1744221,952116,40.0%,0,Brad_Ellsworth,http://www.ontheissues.org/Senate/Brad_Ellswor...,Y
16,2010,Cam Cavasso,HI,79924.0,370506,277167,21.6%,0,Cam_Cavasso,http://www.ontheissues.org/Senate/Cam_Cavasso.htm,Y
18,2010,Carly Fiorina,CA,4217366.0,10000093,5218441,42.2%,0,Carly_Fiorina,http://www.ontheissues.org/Senate/Carly_Fiorin...,Y
19,2010,Charles Grassley,IA,718215.0,1116063,718215,64.4%,1,Chuck_Grassley,http://www.ontheissues.org/Senate/Chuck_Grassl...,Y


**GET NUMBER OF RESPONSES PER TOPIC DESCRIPTION  
DO THIS TO SEE WHETHER TOPIC TYPES ARE STILL RELEVANT**

In [78]:
### GET URL, WRITE TO FILE ###
#~~UNCOMMENT TO ACTUALLY WRITE~~#


all_topics_list = []

all_data_list = []

#for each relevant candidate, get to relevant information
for name, year in zip(df['full_name'], df['year']) :
    
    with open('text_files/'+name+'.txt') as f :
        text = f.readlines()
    f.close()

    topic_start = False
    data_list = []

    poss_topics = ["topic 1:", "topic 2:","topic 3:","topic 4:","topic 5:","topic 6:","topic 7:","topic 8:",
                  "topic 9:","topic 10:","topic 11:","topic 12:","topic 13:","topic 14:","topic 15:","topic 16:",
                  "topic 17:","topic 18:","topic 19:","topic 20:"]

    topics_dict = {}
    keep_lines = False
    topic_type = None
    priorline_topic = None

    for line in text :
        
        if "topic 1:" in line :
            topic_start = True

        #if are in topics section...
        if topic_start == True :
            
#             if topic == "20" :
#                 print name, topic_descrip, score, line 
            
            #get 2nd line of topic description
            if keep_lines == True :
                #append until hit closing "]" in the 2nd line
                topic_descrip = topic_descrip+" "+line[:line.find("]")]
                topics_dict[topic] = topic_descrip
                #append all the info from the prior topic to all_data_list
                #have to also append here to catch full topic_descrip                
                all_data_list.append((name, year, topic, topic_descrip, topic_type, score))


            #get points and topic type for the question 
            if "points on social scale" in line.lower() or "points on economic scale" in line.lower() :
                score = int(line[line.find("(")+1:line.find("points")-1])

                #get social vs economic type
                if "points on social scale" in line.lower() :
                    topic_type = "Social"
                    ####### IMPLEMENT AGAIN AFTER ONCE DEAL WITH REGULAR TOPIC TYPES!!!!!!!!! #######
                    #if social, multiple by -1 to also make more left-wing more negative like economic scale
#                     score = -1*score
                else :
                    topic_type = "Economic"            

            #if topic number in line...
            if any(t in line for t in poss_topics):
                
                #get the topic number
                topic = line[line.find("topic")+6:line.find(":")]

                #if "]" not in line than topic description is spaced over two lines...
                if "]" not in line :
                    #get the start of topic_descrip to combine later
                    topic_descrip = line[line.find(":")+2:].strip()
                    #keep a line for drawdown
                    keep_lines = True
                else:
                    #pull topic_description fully from current line
                    topic_descrip = line[line.find(":")+2:line.find("]")].strip()
                    topics_dict[topic] = topic_descrip
                    keep_lines = False
                    
                    #append info here if are on a NEW topic 
                    if not priorline_topic == topic :
                        #append all the info from the prior topic to all_data_list
                        all_data_list.append((name, year, topic, topic_descrip, topic_type, score))


            #reset: don't read two lines at once for topic decr any longer as long as not starting a new topic    
            else:
                keep_lines = False

#get all the info from final topic otherwise missed on final loop
#append all the info from the FINAL topic to all_data_list
all_data_list.append((name, year, topic, topic_descrip, topic_type, score))

#get rid of dupes
all_data_list = list(set(all_data_list))

**Display all topics in DataFrame to make simlarity determinations**

In [93]:
############ GET RID OF ALL TOPICS LIST AFTER ############


# print len(all_topics_list)
# print sorted(all_topics_list)

df_topics = pd.DataFrame(all_data_list)
df_topics.columns = ['full_name', 'year', 'topic', 'topic_descrip','topic_type','score']

#concatenate a column for name and year
df_topics['full_name_year'] = df_topics['full_name']+"_"+df_topics['year'].map(str)

def return_max(full_name_year) :
    return df_topics[df_topics['full_name_year']==full_name_year]['score'].max()
    
df_topics['max_score'] = df_topics['full_name_year'].apply(return_max)

display(df_topics)

# df_topics[(df_topics['full_name']=="Tim_Kaine")&(df_topics['year']==2012)]['score'].max()


# df_topics_groupby = df_topics.groupby(['topic', 'topic_descrip']).size().reset_index(name='counts')
# display(df_topics_groupby)

# df_topics_describe = df_topics.groupby(['topic', 'topic_descrip']).describe()
# display(df_topics_describe)

Unnamed: 0,full_name,year,topic,topic_descrip,topic_type,score,full_name_year,max_score
0,Paul_Fiorino,2016,1,Abortion is a woman's unrestricted right,,-5,Paul_Fiorino_2016,5
1,Ernest_Wooton,2010,19,Marijuana is a gateway drug,Economic,2,Ernest_Wooton_2010,5
2,Deborah_Ross,2016,16,Make voter registration easier,Social,-3,Deborah_Ross_2016,5
3,Mitch_McConnell,2014,5,Expand ObamaCare,Social,-3,Mitch_McConnell_2014,5
4,Sheldon_Whitehouse,2012,1,Abortion is a woman's unrestricted right,,2,Sheldon_Whitehouse_2012,5
5,Amy_Klobuchar,2012,17,Avoid foreign entanglements,Social,5,Amy_Klobuchar_2012,5
6,Russell_Feingold,2016,12,Pathway to citizenship for illegal aliens,Economic,-5,Russell_Feingold_2016,5
7,Connie_Mack,2012,17,Seek UN approval for military action,Economic,10,Connie_Mack_2012,10
8,Peter_Moss,2012,2,Legally require hiring women & minorities,Social,-5,Peter_Moss_2012,5
9,Rob_Taylor,2010,1,Abortion is a woman's unrestricted right,,2,Rob_Taylor_2010,5


In [94]:
display(df_topics[df_topics['max_score']==10])

Unnamed: 0,full_name,year,topic,topic_descrip,topic_type,score,full_name_year,max_score
7,Connie_Mack,2012,17,Seek UN approval for military action,Economic,10,Connie_Mack_2012,10
738,Raul_Acosta,2014,9,Mandatory Three Strikes sentencing laws,Social,10,Raul_Acosta_2014,10
739,Connie_Mack,2012,8,Death Penalty,Social,10,Connie_Mack_2012,10
746,Raul_Acosta,2014,12,Immigration helps our economy-encourage it,Economic,10,Raul_Acosta_2014,10
830,Connie_Mack,2012,11,Decrease overall taxation of the wealthy,Social,10,Connie_Mack_2012,10
939,Raul_Acosta,2014,20,Allow churches to provide welfare services,Social,10,Raul_Acosta_2014,10
1052,Raul_Acosta,2014,8,Death Penalty,Social,10,Raul_Acosta_2014,10
1086,Raul_Acosta,2014,4,Permit prayer in public schools,Social,0,Raul_Acosta_2014,10
1440,Raul_Acosta,2014,1,Abortion is a woman's right,,5,Raul_Acosta_2014,10
1789,Raul_Acosta,2014,6,Privatize Social Security,Economic,2,Raul_Acosta_2014,10


In [None]:
"""THROUGH 19"""

aligned_descriptions = [
"Abortion is a woman's right",
"Abortion is a woman's unrestricted right",
"Absolute right to gun ownership",
"Higher taxes on the wealthy",
"Make taxes more progressive",
"Illegal immigrants earn citizenship",
"Immigration helps our economy-encourage it",
"Pathway to citizenship for illegal aliens",
"Support & expand free trade",
"Maintain US sovereignty from UN",
"Support American Exceptionalism",
"Expand the armed forces",
"Expand the military",
"More spending on armed forces",
"Make voter registration easier",
"Avoid foreign entanglements",
"Seek UN approval for military action",
"Stay out of Iran",
"US out of Iraq",
"US out of Iraq & Afghanistan",
"Prioritize green energy",
"Reduce use of coal, oil, & nuclear energy",
"Replace coal & oil with alternatives",
"Drug use is immoral: enforce laws against it",
"Marijuana is a gateway drug",
"Never legalize marijuana",
"Legally require hiring women & minorities",
"Require companies to hire more women & minorities",
"Require hiring more women & minorities",
"Allow churches to provide welfare services",
"Stimulus better than market-led recovery",
"Comfortable with same-sex marriage",
"Same-sex domestic partnership benefits",
"Sexual orientation protected by civil rights laws",
"Keep God in the public sphere",
"Permit prayer in public schools",
"Teacher-led prayer in public schools",
"Expand ObamaCare",
"More federal funding for health coverage",
"Privatize Social Security",
"Parents choose schools via vouchers",
"Vouchers for school choice",
"Death Penalty",
"EPA regulations are too restrictive",
"Human needs over animal rights",
"Society bears cost of pollution",
"Mandatory Three Strikes sentencing laws",
"Stricter punishment reduces crime",
]

#same topic but asked in opposite direction
opposite_aligned_descriptions = [
"Decrease overall taxation of the wealthy",
    
]


#totally different topics; not relevant to main topic
unaligned_descriptions = [
"The Patriot Act harms civil liberties",
"Link human rights to trade with China",
"Reduce spending on missile defense (Star Wars)",
"Stricter limits on political campaign funds",

]

ten_pt_scale = [
    
]

for descrip in df_topics.groupby(['topic', 'topic_descrip']).size().reset_index(name='counts')['topic_descrip'] :
    print descrip

**ISSUES RUN INTO:**
- Any document with Doctype declared "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "" target="_blank"http://www.w3.org/TR/html4/loose.dtd"> would not render text with html2text. Have to html_content = unicode(html_content, errors='ignore')  
- Different questions per year; have to see which ones are applicable
- Improper *ordering* of questions as well within a table, for example http://www.ontheissues.org/Senate/Blanche_Lambert_Lincoln.htm
- Different point scales (see http://www.ontheissues.org/Senate/Connie_Mack.htm) 
