In [1]:
import fecfile
import pandas as pd
import os
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from dateutil.parser import parse
from IPython.display import display
import geopandas as gpd
import us

Time the notebook

In [2]:
start = datetime.datetime.now()

Define logging function for writing output

In [3]:
def log(obj, description):
    print(description + ":")
    with open("../output/log.txt", "a") as f:
        f.write(f"{description}:\n\n{obj}\n\n---\n")
    return obj

Set some viewing options

In [4]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 500)

Read dataframe of filings

In [5]:
filings = (
    pd.read_csv("../data/filings.csv")
)

filings.head(3)

Unnamed: 0,committee_id,filing_id,cycle,form_type,date_filed,date_coverage_to,date_coverage_from,report_title,report_period,fec_uri,paper,amended,amended_uri,is_amendment,original_filing,original_uri,committee_type,contributions_total,cash_on_hand,disbursements_total,receipts_total
0,C00401224,1190211,2016,F3,2017-11-03,2016-11-28,2016-10-20,POST-GENERAL,PG,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1190211/,False,False,,True,1132265.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1132265/,W,69306148.22,24023158.64,70473625.32,70884967.84
1,C00401224,1167570,2016,F3,2017-06-27,2016-10-19,2016-10-01,PRE-GENERAL,PREG,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1167570/,False,False,,True,1118321.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1118321/,W,38988748.3,23611816.12,36653226.26,41086481.96
2,C00401224,1166534,2016,F3,2017-06-19,2016-12-31,2016-11-29,YEAR-END,YE,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1166534/,False,False,,True,1144458.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1144458/,W,14665549.11,24071137.23,15378810.13,15426788.72


Get only the monthlies

In [6]:
midyear = (
    filings
    .loc[
        lambda x: x["report_title"].str.contains("MID-YEAR", na = False)
    ]
    # remove filings that have been superceded by subsequent filings
    .loc[
        lambda x: x['amended'] == False
    ]
    [["filing_id",
        "date_filed",
        "date_coverage_to",
        "date_coverage_from",
        "report_title",
        "report_period",
        "fec_uri",
         "cycle"
     ]]
)

midyear.head(3)

Unnamed: 0,filing_id,date_filed,date_coverage_to,date_coverage_from,report_title,report_period,fec_uri,cycle
28,1034155,2015-11-23,2015-06-30,2015-06-23,MID-YEAR,,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1034155/,2016


Load candidates file

In [7]:
candidates = pd.read_csv(
    "../data/candidates.csv"
)

Read dataframe of all actblue contributions

In [8]:
actblue = pd.read_csv(
    "../output/merged_actblue.csv",
    dtype = {
        "filing_id": "object",
        "contributor_organization_name": "object"
    },
    parse_dates = ['contribution_date',]
)

actblue.head(3)

Unnamed: 0,entity_type,filer_committee_id_number,filing_id,transaction_id,contribution_date,contribution_amount,contribution_aggregate,contributor_organization_name,contributor_first_name,contributor_last_name,contributor_street_1,contributor_street_2,contributor_state,contributor_zip_code,contributor_state.1,contributor_employer,contributor_occupation,contribution_purpose_descrip,memo_text_description,donor_id,committee_id,candidate_name,latest_contribution_aggregate
0,IND,C00401224,1034155,SA11AI_28202360,2015-06-29,5.0,35.0,,HOMA,A. HASHEMI,22 CLINTON AVE 9,,CT,6901,CT,NOT EMPLOYED,NOT EMPLOYED,Earmark,Earmarked for DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE (C00000935),HOMA|A HASHEMI|06901,C00000935,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,35.0
1,IND,C00401224,1034155,SA11AI_28249728,2015-06-30,5.0,35.0,,HOMA,A. HASHEMI,22 CLINTON AVE 9,,CT,6901,CT,NOT EMPLOYED,NOT EMPLOYED,Earmark,Earmarked for DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE (C00000935),HOMA|A HASHEMI|06901,C00000935,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,35.0
2,IND,C00401224,1034155,SA11AI_28108486,2015-06-27,25.0,25.0,,SHERI,A. OLSON,4008 SW ARROYO CT,,WA,98146,WA,SELF,ARCHITECT,Earmark,Earmarked for DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE (C00000935),SHERI|A OLSON|98146,C00000935,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,25.0


Read zip code populations from census

In [9]:
zcta_pop = (
    pd
    .read_csv(
        "../data/census/ACS_17_5YR_B01003_COUNTY/ACS_17_5YR_B01003_with_ann.csv",
        encoding = "ISO-8859-1",
        skiprows = 1, 
        dtype = {
            "Id2": str,
            "Id": str,
            "Estimate; Total": int
        }
    )
)

zcta_pop.head(3)

Unnamed: 0,Id,Id2,Geography,Estimate; Total,Margin of Error; Total
0,0500000US01001,1001,"Autauga County, Alabama",55036,*****
1,0500000US01003,1003,"Baldwin County, Alabama",203360,*****
2,0500000US01005,1005,"Barbour County, Alabama",26201,*****


Read zcta to county crosswalk from census

In [10]:
zcta_counties = (
    pd
    .read_csv(
        "../data/census/zcta_county.csv", 
        dtype = { 
            "ZCTA5": str, 
            "STATE": str
        }
    )
)

zcta_counties.head(3)

Unnamed: 0,ZCTA5,STATE,COUNTY,GEOID,POPPT,HUPT,AREAPT,AREALANDPT,ZPOP,ZHU,ZAREA,ZAREALAND,COPOP,COHU,COAREA,COAREALAND,ZPOPPCT,ZHUPCT,ZAREAPCT,ZAREALANDPCT,COPOPPCT,COHUPCT,COAREAPCT,COAREALANDPCT
0,601,72,1,72001,18465,7695,165132671,164333375,18570,7744,167459085,166659789,19483,8125,173777444,172725651,99.43,99.37,98.61,98.6,94.77,94.71,95.03,95.14
1,601,72,141,72141,105,49,2326414,2326414,18570,7744,167459085,166659789,33149,14192,298027589,294039825,0.57,0.63,1.39,1.4,0.32,0.35,0.78,0.79
2,602,72,3,72003,41520,18073,83734431,79288158,41520,18073,83734431,79288158,41959,18258,117948080,79904246,100.0,100.0,100.0,100.0,98.95,98.99,70.99,99.23


Some ZIP codes cross state boundaries. The code below finds the main state, by population, associated with any given ZIP Code Tabulation Area.

In [40]:
zcta_states = (
    zcta_counties
    .groupby([ "ZCTA5", "STATE"])
    ["ZPOPPCT"]
    .sum()
    .sort_values("ZPOPPCT", ascending=False)
    .groupby("ZCTA5")["STATE"]
    .first()
#     .apply(lambda x: us.states.lookup(x).name )
#     .to_frame('state')
#     .reset_index()
#     .rename(
#         columns = { "ZCTA5": "contributor_zip_code"} # for easier merging later
#     )
)

zcta_states

ValueError: No axis named ZPOPPCT for object type <class 'pandas.core.series.Series'>

In [None]:
33223

Build a DataFrame of contributors, states and population

In [17]:
# Get number of donors per candidate per zip
zip_donors = (
    actblue
    .groupby(
        ["candidate_name", 
         "contributor_zip_code"]
    )
    .size()
    .to_frame('contributors')
    .reset_index()
)

# Get the state names and zip population using the census dfs
zip_totals = (
    zip_donors
    .merge(
        zcta_states,
        on = "contributor_zip_code",
        how = "left"
    )
    .merge(
        zcta_pop[["Id2", "Estimate; Total"]],
        left_on = "contributor_zip_code",
        right_on = "Id2"
    )
    .drop( columns = "Id2")
    .rename(
        columns = {"Estimate; Total": "population"}
    )
)

In [18]:
def calculate_per_capita(row):
    if int(row["population"]) == 0: return None
    return round(row["contributors"] * 1000.0 / float(row["population"]), 1)

In [19]:
zip_totals["donors_per_1000_people"] = (
    zip_totals
    .apply(
        calculate_per_capita, 
        axis = 1)
    )


Look at Bernie's top zipcodes

In [20]:
(
    zip_totals
    .loc[ lambda x: x["candidate_name"] == "BERNIE" ]
    .sort_values(
        "contributors",
        ascending = False
    )
    .head(10)
)

Unnamed: 0,candidate_name,contributor_zip_code,contributors,state,population,donors_per_1000_people
247,BERNIE,2130,122,Massachusetts,13745,8.9
135,BERNIE,48103,95,Michigan,4836,19.6
320,BERNIE,10003,73,New York,555036,0.1
15,BERNIE,20009,67,District of Columbia,27067,2.5
2732,BERNIE,20005,65,District of Columbia,16466,3.9
1905,BERNIE,5055,54,Vermont,44197,1.2
417,BERNIE,19119,49,Pennsylvania,11745,4.2
2700,BERNIE,19147,48,Pennsylvania,9110,5.3
695,BERNIE,20001,47,District of Columbia,12752,3.7
602,BERNIE,55105,42,Minnesota,161226,0.3


Top zipcodes per capita

In [None]:
(
    zip_totals
    .loc[ lambda x: x["candidate_name"] == "BERNIE" ]
    .loc[
        lambda x: (x["population"] >= 1000) & (x["state"] != "Vermont")
    ]
    .sort_values(
        "donors_per_1000_people", 
        ascending = False
    ).head(20)
)

Merge zip_totals with a GeoDataFrame of counties

In [None]:
county_geo = (
    gpd
    .read_file("../data/states-geojson.json")
    .assign(
        
    )
)

In [None]:
end = datetime.datetime.now()

d = (end - start)

f"The notebook ran for {round(d.total_seconds() / 60, 2) } minutes"

---

---

---