In [1]:
# necessary imports
from collections import Counter
from collections import defaultdict
import requests
from bs4 import BeautifulSoup 
from bs4.element import Comment
import re
import pandas as pd
from time import sleep
from Open_Secrets_API import api_key
from itertools import islice
import json
import ast

## Open Secrets: Analysis of Senators' Finances

My goals for the second data analysis project are two-fold: 1) to dig into information about members of congress that is available from Open Secrets, and 2) to become familiar with the Open Secrets API as I'm hoping to use it to generate ideas for potential donor leads for the Montana Environmental Information Center as part of my capstone project.

In part 1 of this project, I pull the names of contributing organizations and the amounts they contributed - both as individuals and via PACs - to each congress person in the US. Then I combine that data with information on each congress person's state, party, and whether they're a senator or representative (and which district they represent). 

In part 2, in the accompanying R Markdown file, I report my findings regarding which organizations contribute the most to each party, and which congress people are the most well-funded.

### Part 1: Data Collection

The Open Secrets API has assigned every congress person a unique ID (CID) that needs to be part of the search query for contributions to that congress person.  

The CID can be scraped from the URL of each member's home page on the Open Secrets website. The following code accomplishes that scrape. 

In [2]:
# Starting point for collecting every congressperson's CID and name.

# This members-list page links to each member's Open Secrets profile; the query
# string selects cong_no=117 and cycle=2020.
url = "https://www.opensecrets.org/members-of-congress/members-list?cong_no=117&cycle=2020"

In [3]:
# Download the members-list page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops here on an HTTP error
# instead of silently parsing an error page in the next cell.
reqs = requests.get(url, timeout=30)
reqs.raise_for_status()
soup = BeautifulSoup(reqs.text, 'html.parser')

In [4]:
# collect the href of every anchor tag on the page
# (anchors without an href contribute None, exactly as link.get('href') returns it)

member_urls = [anchor.get('href') for anchor in soup.find_all('a')]

The member_urls list contains every link on the webpage, many of which I did not need. Unfortunately, the links also did not pair each congress person's name with their CID. In a very blunt fashion, I copied/pasted the full list of alphabetically listed congress people into an Excel spreadsheet. Then I copied/pasted the full list of CIDs into a column next to those names, so I could be sure that the CIDs aligned with the correct person. 

This approach was far from elegant, but it got the job done. I can now read in a file that contains the names of all congress people, their state, party, the district they represent, and their CID. I will use the names and CID columns to create a list that can be used to scrape the contribution information for each congressperson using the Open Secrets API.

In [5]:
# load the hand-assembled Excel sheet of members into a pandas dataframe

df_members = pd.read_excel(io='congress_CID.xlsx')

In [6]:
# preview the first five member rows
df_members.head(n=5)

Unnamed: 0,politician,state,party,district,cid
0,Ralph Abraham,Louisiana,Republican,District 5,N00036633
1,Alma Adams,North Carolina,Democrat,District 12,N00035451
2,Robert B Aderholt,Alabama,Republican,District 4,N00003028
3,Pete Aguilar,California,Democrat,District 31,N00033997
4,Lamar Alexander,Tennessee,Republican,Senate,N00009888


In [7]:
# save the member table as a CSV (the dataframe index is written as the first column)
df_members.to_csv(path_or_buf='members.csv')

In [8]:
# map each CID (key) to the politician's name (value)
# selecting the single column first avoids converting every other column to a dict
cids = df_members.set_index('cid')['politician'].to_dict()

In [9]:
# the Open Secrets API only allows 200 requests per day, so I need to break up the
# cids dictionary into three chunks (because there are a total of 535 members of congress)
# I'll pull one chunk each day for three days in a row, writing each to a file that day

# This function splits the dictionary (taken from stack overflow, with slight edits)

def split_dict_equally(input_dict, chunks=3):
    """Deal the items of ``input_dict`` round-robin into ``chunks`` dictionaries.

    The first item goes to dictionary 0, the second to dictionary 1, and so
    on, wrapping back to dictionary 0 after the last one, so the resulting
    sizes differ by at most one.

    Parameters
    ----------
    input_dict : dict
        Mapping whose items are distributed (iteration order is preserved
        within each output dictionary).
    chunks : int, default 3
        Number of dictionaries to produce.

    Returns
    -------
    list of dict
        ``chunks`` dictionaries; some may be empty when ``input_dict`` has
        fewer items than ``chunks``.
    """
    # one independent, empty dictionary per chunk
    buckets = []
    for _ in range(chunks):
        buckets.append({})

    slot = 0
    for key, value in input_dict.items():
        buckets[slot][key] = value
        # advance to the next bucket, wrapping around after the last one
        slot = slot + 1 if slot + 1 < chunks else 0

    return buckets



In [10]:
# three round-robin chunks of the CID -> name mapping, one per day of pulling
split_cids = split_dict_equally(input_dict=cids, chunks=3)

In [11]:
# name the three chunks so each day's pull can refer to its own dictionary
first, second, third = (split_cids[position] for position in range(3))

The data pulls below use each congress person's name as the dictionary key. Each value is a list of tuples, where every tuple contains a contributing organization's name and the amounts it contributed via individuals, via PACs, and in total. 

In [13]:
# day 1 pull
# For every CID in the first chunk, request the candContrib summary and store one
# (org, individual amount, PAC amount, total amount) tuple per <contributor> element,
# keyed by the politician's name.

contribution_data_first = defaultdict(list)

for key, politician in first.items():
    try:
        # params= lets requests assemble the query string; the timeout keeps one
        # stalled connection from hanging the whole (quota-limited) pull.
        r = requests.get(
            'https://www.opensecrets.org/api/',
            params={
                'method': 'candContrib',
                'cid': key,
                'cycle': '2020',
                'apikey': api_key,
            },
            timeout=30,
        )
    except requests.RequestException as err:
        # Only the exception type is shown: the exception text can contain the
        # request URL, which embeds the API key.
        print(f"Request failed for cid {key} ({politician}): {type(err).__name__}")
        continue

    if r.status_code != 200:
        # Identify the failure by CID rather than by URL so the API key never
        # ends up in saved notebook output.
        print(f"We got code {r.status_code} for cid {key} ({politician})")
        continue

    # Parsed into its own name so the `soup` from the member-list scrape is not overwritten.
    contributors_xml = BeautifulSoup(r.text, 'xml')

    for item in contributors_xml.find_all('contributor'):
        contribution_data_first[politician].append(
            (item['org_name'], item['indivs'], item['pacs'], item['total'])
        )
       

In [12]:
# day 2 pull
# For every CID in the second chunk, request the candContrib summary and store one
# (org, individual amount, PAC amount, total amount) tuple per <contributor> element,
# keyed by the politician's name.

contribution_data_second = defaultdict(list)

for key, politician in second.items():
    try:
        # params= lets requests assemble the query string; the timeout keeps one
        # stalled connection from hanging the whole (quota-limited) pull.
        r = requests.get(
            'https://www.opensecrets.org/api/',
            params={
                'method': 'candContrib',
                'cid': key,
                'cycle': '2020',
                'apikey': api_key,
            },
            timeout=30,
        )
    except requests.RequestException as err:
        # Only the exception type is shown: the exception text can contain the
        # request URL, which embeds the API key.
        print(f"Request failed for cid {key} ({politician}): {type(err).__name__}")
        continue

    if r.status_code != 200:
        # Identify the failure by CID rather than by URL so the API key never
        # ends up in saved notebook output.
        print(f"We got code {r.status_code} for cid {key} ({politician})")
        continue

    # Parsed into its own name so the `soup` from the member-list scrape is not overwritten.
    contributors_xml = BeautifulSoup(r.text, 'xml')

    for item in contributors_xml.find_all('contributor'):
        contribution_data_second[politician].append(
            (item['org_name'], item['indivs'], item['pacs'], item['total'])
        )

In [12]:
# day 3 pull
# For every CID in the third chunk, request the candContrib summary and store one
# (org, individual amount, PAC amount, total amount) tuple per <contributor> element,
# keyed by the politician's name.

contribution_data_third = defaultdict(list)

for key, politician in third.items():
    try:
        # params= lets requests assemble the query string; the timeout keeps one
        # stalled connection from hanging the whole (quota-limited) pull.
        r = requests.get(
            'https://www.opensecrets.org/api/',
            params={
                'method': 'candContrib',
                'cid': key,
                'cycle': '2020',
                'apikey': api_key,
            },
            timeout=30,
        )
    except requests.RequestException as err:
        # Only the exception type is shown: the exception text can contain the
        # request URL, which embeds the API key.
        print(f"Request failed for cid {key} ({politician}): {type(err).__name__}")
        continue

    if r.status_code != 200:
        # Identify the failure by CID rather than by URL so the API key never
        # ends up in saved notebook output.
        print(f"We got code {r.status_code} for cid {key} ({politician})")
        continue

    # Parsed into its own name so the `soup` from the member-list scrape is not overwritten.
    contributors_xml = BeautifulSoup(r.text, 'xml')

    for item in contributors_xml.find_all('contributor'):
        contribution_data_third[politician].append(
            (item['org_name'], item['indivs'], item['pacs'], item['total'])
        )

In [13]:
# Write to file (after each pull)
# One tab-separated line per contribution: the politician's name followed by
# the fields of that contribution tuple.

with open('contribution_data_third.txt','w') as tsv_out:
    # NOTE(review): the header write below is disabled, yet the combined file is
    # later read with its first line as the header - presumably the header was
    # enabled only when writing the first day's file; confirm.
    #tsv_out.write('politician\tcontributor\tindividual_amount\tPAC_amount\ttotal_amount\n')

    for politician, contributions in contribution_data_third.items():
        for data_tuple in contributions:
            tsv_out.write('\t'.join([politician, *data_tuple]) + '\n')

In [14]:
# combine the three daily pulls into one tab-separated file with exactly one header row
filenames = ['contribution_data_first.txt', 'contribution_data_second.txt', 'contribution_data_third.txt']

# Column names for the combined file; the later read_csv call takes the first
# line of contribution_data.txt as the header.
header = 'politician\tcontributor\tindividual_amount\tPAC_amount\ttotal_amount\n'

with open('contribution_data.txt', 'w') as outfile:
    outfile.write(header)

    for names in filenames:
        with open(names) as infile:
            for line in infile:
                # A per-pull file may or may not already start with the header;
                # drop it so the combined file never carries a header as data.
                if line.rstrip('\n') == header.rstrip('\n'):
                    continue
                # Guard against a missing final newline fusing two records together.
                if not line.endswith('\n'):
                    line += '\n'
                outfile.write(line)

In [15]:
# load the combined, tab-delimited contribution file (first line is used as the header)
df = pd.read_csv('contribution_data.txt', delimiter='\t')

In [16]:
# reminder of the member columns that get attached to the contributions below
df_members.head(n=5)

Unnamed: 0,politician,state,party,district,cid
0,Ralph Abraham,Louisiana,Republican,District 5,N00036633
1,Alma Adams,North Carolina,Democrat,District 12,N00035451
2,Robert B Aderholt,Alabama,Republican,District 4,N00003028
3,Pete Aguilar,California,Democrat,District 31,N00033997
4,Lamar Alexander,Tennessee,Republican,Senate,N00009888


In [17]:
# attach each member's state, party, district and CID to their contribution rows
# The join key is spelled out: an implicit merge joins on every column name the two
# frames happen to share, so a stray shared column would silently change the result.
# how='inner' (the merge default) keeps only members with at least one contribution row.
contribution_data = df_members.merge(df, on='politician', how='inner')
contribution_data.head()

Unnamed: 0,politician,state,party,district,cid,contributor,individual_amount,PAC_amount,total_amount
0,Robert B Aderholt,Alabama,Republican,District 4,N00003028,McWane Inc,38700,0,38700
1,Robert B Aderholt,Alabama,Republican,District 4,N00003028,Collazo Enterprises,22400,0,22400
2,Robert B Aderholt,Alabama,Republican,District 4,N00003028,Southern Co,4309,10000,14309
3,Robert B Aderholt,Alabama,Republican,District 4,N00003028,Leidos Inc,800,13000,13800
4,Robert B Aderholt,Alabama,Republican,District 4,N00003028,"Maynard, Cooper & Gale",3200,10000,13200


In [18]:
# export the merged member + contribution table as a CSV for the R analysis

contribution_data.to_csv(path_or_buf='contribution_data.csv')

## Part 2: please see R Markdown Notebook "Data Engineering Project 2 - Campaign Contribution Analysis" 