In [59]:
import fecfile
import pandas as pd
# import os
import datetime
# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker
from dateutil.parser import parse
from IPython.display import display
import geopandas as gpd
import us

Time the notebook

In [60]:
start = datetime.datetime.now()

Define logging function for writing output

In [61]:
def log(obj, description):
    print(description + ":")
    with open("../output/log.txt", "a") as f:
        f.write(f"{description}:\n\n{obj}\n\n---\n")
    return obj

Set some viewing options

In [62]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 500)

Read dataframe of filings

In [63]:
filings = (
    pd.read_csv("../data/filings.csv")
)

filings.head(3)

Unnamed: 0,committee_id,filing_id,cycle,form_type,date_filed,date_coverage_to,date_coverage_from,report_title,report_period,fec_uri,paper,amended,amended_uri,is_amendment,original_filing,original_uri,committee_type,contributions_total,cash_on_hand,disbursements_total,receipts_total
0,C00401224,1190211,2016,F3,2017-11-03,2016-11-28,2016-10-20,POST-GENERAL,PG,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1190211/,False,False,,True,1132265.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1132265/,W,69306148.22,24023158.64,70473625.32,70884967.84
1,C00401224,1167570,2016,F3,2017-06-27,2016-10-19,2016-10-01,PRE-GENERAL,PREG,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1167570/,False,False,,True,1118321.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1118321/,W,38988748.3,23611816.12,36653226.26,41086481.96
2,C00401224,1166534,2016,F3,2017-06-19,2016-12-31,2016-11-29,YEAR-END,YE,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1166534/,False,False,,True,1144458.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1144458/,W,14665549.11,24071137.23,15378810.13,15426788.72


Get only the midyear filing

In [64]:
midyear = (
    filings
    .loc[
        lambda x: x["report_title"].str.contains("MID-YEAR", na = False)
    ]
    # remove filings that have been superceded by subsequent filings
    .loc[
        lambda x: x['amended'] == False
    ]
    [["filing_id",
        "date_filed",
        "date_coverage_to",
        "date_coverage_from",
        "report_title",
        "report_period",
        "fec_uri",
         "cycle"
     ]]
)

midyear.head(3)

Unnamed: 0,filing_id,date_filed,date_coverage_to,date_coverage_from,report_title,report_period,fec_uri,cycle
28,1034155,2015-11-23,2015-06-30,2015-06-23,MID-YEAR,,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1034155/,2016


Load candidates file

In [65]:
candidates = pd.read_csv(
    "../data/candidates.csv"
)

Read dataframe of all actblue contributions

In [69]:
actblue = pd.read_csv(
    "../output/merged_actblue.csv",
    dtype = {
        "filing_id": "object",
        "contributor_organization_name": "object"
    },
    parse_dates = ['contribution_date',]
)

actblue.head(3)

Unnamed: 0,entity_type,filer_committee_id_number,filing_id,transaction_id,contribution_date,contribution_amount,contribution_aggregate,contributor_organization_name,contributor_first_name,contributor_last_name,contributor_street_1,contributor_street_2,contributor_state,contributor_zip_code,contributor_state.1,contributor_employer,contributor_occupation,contribution_purpose_descrip,memo_text_description,donor_id,committee_id,candidate_name,latest_contribution_aggregate
0,IND,C00401224,1034155,SA11AI_28202360,2015-06-29,5.0,35.0,,HOMA,A. HASHEMI,22 CLINTON AVE 9,,CT,6901,CT,NOT EMPLOYED,NOT EMPLOYED,Earmark,Earmarked for DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE (C00000935),HOMA|A HASHEMI|06901,C00000935,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,35.0
1,IND,C00401224,1034155,SA11AI_28249728,2015-06-30,5.0,35.0,,HOMA,A. HASHEMI,22 CLINTON AVE 9,,CT,6901,CT,NOT EMPLOYED,NOT EMPLOYED,Earmark,Earmarked for DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE (C00000935),HOMA|A HASHEMI|06901,C00000935,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,35.0
2,IND,C00401224,1034155,SA11AI_28108486,2015-06-27,25.0,25.0,,SHERI,A. OLSON,4008 SW ARROYO CT,,WA,98146,WA,SELF,ARCHITECT,Earmark,Earmarked for DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE (C00000935),SHERI|A OLSON|98146,C00000935,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,25.0


Read county populations from census

In [70]:
county_pop = (
    pd
    .read_csv(
        "../data/census/ACS_17_5YR_B01003_COUNTY/ACS_17_5YR_B01003_with_ann.csv",
        encoding = "ISO-8859-1",
        skiprows = 1, 
        dtype = {
            "Id2": str,
            "Id": str,
            "Estimate; Total": int
        }
    )
)

place_names = county_pop['Geography'].str.split(",", expand = True)
county_pop['county_name'] = place_names[0]

county_pop.head(3)

Unnamed: 0,Id,Id2,Geography,Estimate; Total,Margin of Error; Total,county_name
0,0500000US01001,1001,"Autauga County, Alabama",55036,*****,Autauga County
1,0500000US01003,1003,"Baldwin County, Alabama",203360,*****,Baldwin County
2,0500000US01005,1005,"Barbour County, Alabama",26201,*****,Barbour County


Make a lookup table of fips to county

In [71]:
# County FIPS code to county name
to_county = dict(
    zip(
        county_pop['Id2'], 
        county_pop["county_name"])
)

# County FIPS code to state name
to_state = dict(
    zip(
        county_pop["Id2"],
        county_pop["Geography"]
    )
)

to_state

{'01001': 'Autauga County, Alabama',
 '01003': 'Baldwin County, Alabama',
 '01005': 'Barbour County, Alabama',
 '01007': 'Bibb County, Alabama',
 '01009': 'Blount County, Alabama',
 '01011': 'Bullock County, Alabama',
 '01013': 'Butler County, Alabama',
 '01015': 'Calhoun County, Alabama',
 '01017': 'Chambers County, Alabama',
 '01019': 'Cherokee County, Alabama',
 '01021': 'Chilton County, Alabama',
 '01023': 'Choctaw County, Alabama',
 '01025': 'Clarke County, Alabama',
 '01027': 'Clay County, Alabama',
 '01029': 'Cleburne County, Alabama',
 '01031': 'Coffee County, Alabama',
 '01033': 'Colbert County, Alabama',
 '01035': 'Conecuh County, Alabama',
 '01037': 'Coosa County, Alabama',
 '01039': 'Covington County, Alabama',
 '01041': 'Crenshaw County, Alabama',
 '01043': 'Cullman County, Alabama',
 '01045': 'Dale County, Alabama',
 '01047': 'Dallas County, Alabama',
 '01049': 'DeKalb County, Alabama',
 '01051': 'Elmore County, Alabama',
 '01053': 'Escambia County, Alabama',
 '01055': 'E

Read zcta to county crosswalk from census

In [72]:
zcta_census = (
    pd
    .read_csv(
        "../data/census/zcta_county.csv", 
        dtype = { 
            "ZCTA5": str, 
            "STATE": str,
            "GEOID": str
        }
    )
)

zcta_census.head()

Unnamed: 0,ZCTA5,STATE,COUNTY,GEOID,POPPT,HUPT,AREAPT,AREALANDPT,ZPOP,ZHU,ZAREA,ZAREALAND,COPOP,COHU,COAREA,COAREALAND,ZPOPPCT,ZHUPCT,ZAREAPCT,ZAREALANDPCT,COPOPPCT,COHUPCT,COAREAPCT,COAREALANDPCT
0,601,72,1,72001,18465,7695,165132671,164333375,18570,7744,167459085,166659789,19483,8125,173777444,172725651,99.43,99.37,98.61,98.6,94.77,94.71,95.03,95.14
1,601,72,141,72141,105,49,2326414,2326414,18570,7744,167459085,166659789,33149,14192,298027589,294039825,0.57,0.63,1.39,1.4,0.32,0.35,0.78,0.79
2,602,72,3,72003,41520,18073,83734431,79288158,41520,18073,83734431,79288158,41959,18258,117948080,79904246,100.0,100.0,100.0,100.0,98.95,98.99,70.99,99.23
3,603,72,5,72005,54689,25653,82063867,81880442,54689,25653,82063867,81880442,60949,28430,195741178,94608641,100.0,100.0,100.0,100.0,89.73,90.23,41.92,86.55
4,606,72,93,72093,6276,2740,94864349,94851862,6615,2877,109592548,109580061,6276,2740,94864349,94851862,94.88,95.24,86.56,86.56,100.0,100.0,100.0,100.0


Some ZIP codes cross state boundaries. The code below finds the main state, by population, associated with any given ZIP Code Tabulation Area.

In [73]:
zcta_states = (
    zcta_census
    .groupby([ "ZCTA5", "STATE"])
    ["ZPOPPCT"]
    .sum()
    .reset_index()
    .sort_values("ZPOPPCT", ascending=False)
    .groupby("ZCTA5")
    ["STATE"]
    .first() # Get the top sorted (largest)
    .apply(
        lambda x: us.states.lookup(x).name
    )
    .to_frame("state")
    .reset_index()
    .rename(
        columns = { "ZCTA5": "contributor_zip_code"} # for easier merging later
    )
)

zcta_states.head()

Unnamed: 0,contributor_zip_code,state
0,601,Puerto Rico
1,602,Puerto Rico
2,603,Puerto Rico
3,606,Puerto Rico
4,610,Puerto Rico


ZIP codes also cross state boundaries. The code below finds the main state, by population, associated with any given ZIP Code Tabulation Area.

In [75]:
def find_county_name(geoId):
    try:
        return county_lookup[geoId]
    except:
        return ""

zcta_counties = (
    zcta_census
    .groupby([ "ZCTA5", "GEOID"]) # County fips code
    ["COPOPPCT"]
    .sum()
    .reset_index()
    .sort_values("COPOPPCT", ascending=False)
    .groupby("ZCTA5")
    ["GEOID"]
    .first() 
    .apply(
        find_county_name
    )
    .to_frame("county_name")
    .reset_index()
    .rename(
        columns = { "ZCTA5": "contributor_zip_code"} # for easier merging later
    )
)

zcta_counties.head()

Unnamed: 0,contributor_zip_code,county_name
0,601,
1,602,
2,603,
3,606,
4,610,


Build a DataFrame of contributors, states and population

In [10]:
# Get number of donors per candidate per zip
zip_donors = (
    actblue
    .groupby(
        ["candidate_name", 
         "contributor_zip_code"]
    )
    .size()
    .to_frame('contributors')
    .reset_index()
)

# Get the state names and zip population using the census dfs
zip_totals = (
    zip_donors
    .merge(
        zcta_states,
        on = "contributor_zip_code",
        how = "left"
    )
    .merge(
        zcta_census[["ZCTA5", "ZPOP"]],
        left_on = "contributor_zip_code",
        right_on = "ZCTA5",
        how = "left"
    )
    .drop( columns = "ZCTA5" )
    .rename(
        columns = {"GEOID": "county_fip", "ZPOP": "zip_population"}
    )
)

zip_totals

Unnamed: 0,candidate_name,contributor_zip_code,contributors,state,county_name,zip_population,county_fip
0,ADAM CLAYTON POWELL FOR CONGRESS,08901,1,New Jersey,,55223,34023
1,BERNIE,08901,21,New Jersey,,55223,34023
2,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,08901,3,New Jersey,,55223,34023
3,DEMOCRATIC SENATORIAL CAMPAIGN COMMITTEE,08901,1,New Jersey,,55223,34023
4,END CITIZENS UNITED PAC,08901,2,New Jersey,,55223,34023
5,ADAM CLAYTON POWELL FOR CONGRESS,10018,1,New York,,5229,36061
6,BERNIE,10018,10,New York,,5229,36061
7,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,10018,3,New York,,5229,36061
8,DEMOCRATIC SENATORIAL CAMPAIGN COMMITTEE,10018,2,New York,,5229,36061
9,END CITIZENS UNITED PAC,10018,1,New York,,5229,36061


In [11]:
def calculate_per_capita(row):
    if int(row["zip_population"]) == 0: return None
    return round(row["contributors"] * 1000.0 / float(row["zip_population"]), 1)

In [12]:
zip_totals["donors_per_1000_people"] = (
    zip_totals
    .apply(
        calculate_per_capita, 
        axis = 1)
    )


Look at Bernie's top zipcodes

In [13]:
(
    zip_totals
    .loc[ lambda x: x["candidate_name"] == "BERNIE" ]
    .sort_values(
        "contributors",
        ascending = False
    )
    .head(10)
)

Unnamed: 0,candidate_name,contributor_zip_code,contributors,state,county_name,zip_population,county_fip,donors_per_1000_people
42473,BERNIE,5401,404,Vermont,,28185,50007,14.3
42629,BERNIE,5602,168,Vermont,,11916,50023,14.1
42427,BERNIE,5301,168,Vermont,,16820,50025,10.0
12448,BERNIE,98103,148,Washington,,45911,53033,3.2
42483,BERNIE,5403,138,Vermont,,17593,50007,7.8
11016,BERNIE,94110,136,California,,69333,6075,2.0
4276,BERNIE,14850,134,New York,,63886,36109,2.1
12532,BERNIE,98115,128,Washington,,46206,53033,2.8
78,BERNIE,10025,127,New York,,94600,36061,1.3
662,BERNIE,11215,123,New York,,63488,36047,1.9


Top zipcodes per capita, excluding Vermont

In [14]:
(
    zip_totals
    .loc[ lambda x: x["candidate_name"] == "BERNIE" ]
    .loc[
        lambda x: (x["zip_population"] >= 1000) & (x["state"] != "Vermont")
    ]
    .sort_values(
        "donors_per_1000_people", 
        ascending = False
    ).head(20)
)

Unnamed: 0,candidate_name,contributor_zip_code,contributors,state,county_name,zip_population,county_fip,donors_per_1000_people
88514,BERNIE,81131,13,Colorado,,1189,8109,10.9
38498,BERNIE,1330,15,Massachusetts,,1506,25011,10.0
40190,BERNIE,2535,11,Massachusetts,,1177,25007,9.3
38373,BERNIE,1098,10,Massachusetts,,1101,25015,9.1
90030,BERNIE,87010,10,New Mexico,,1139,35049,8.8
38298,BERNIE,1072,13,Massachusetts,,1478,25011,8.8
41123,BERNIE,3280,9,New Hampshire,,1123,33019,8.0
96583,BERNIE,95468,10,California,,1258,6045,7.9
95839,BERNIE,94973,10,California,,1429,6041,7.0
41204,BERNIE,3466,9,New Hampshire,,1332,33005,6.8


Merge zip_totals with a GeoDataFrame of counties

In [15]:
zip_totals

Unnamed: 0,candidate_name,contributor_zip_code,contributors,state,county_name,zip_population,county_fip,donors_per_1000_people
0,ADAM CLAYTON POWELL FOR CONGRESS,08901,1,New Jersey,,55223,34023,0.0
1,BERNIE,08901,21,New Jersey,,55223,34023,0.4
2,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,08901,3,New Jersey,,55223,34023,0.1
3,DEMOCRATIC SENATORIAL CAMPAIGN COMMITTEE,08901,1,New Jersey,,55223,34023,0.0
4,END CITIZENS UNITED PAC,08901,2,New Jersey,,55223,34023,0.0
5,ADAM CLAYTON POWELL FOR CONGRESS,10018,1,New York,,5229,36061,0.2
6,BERNIE,10018,10,New York,,5229,36061,1.9
7,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,10018,3,New York,,5229,36061,0.6
8,DEMOCRATIC SENATORIAL CAMPAIGN COMMITTEE,10018,2,New York,,5229,36061,0.4
9,END CITIZENS UNITED PAC,10018,1,New York,,5229,36061,0.2


Why is this happening...

In [16]:
zip_totals.groupby('county_fip')['county_name'].nunique().max()

1

Select for a candidate and counties

In [17]:
bernie = (
    zip_totals
    .loc[
        lambda x: x['candidate_name'] == "BERNIE"
    ]
    .groupby(['county_fip', 'county_name'])
    .agg({
        "contributors": "sum"
    })
    .reset_index()
)

bernie.head()

Unnamed: 0,county_fip,county_name,contributors
0,1001,,3
1,1003,,31
2,1007,,12
3,1009,,11
4,1015,,11


In [18]:
county_geo = (
    gpd
    .read_file("../data/counties-geojson.json")
).merge(
    zip_totals.groupby('',
    left_on = "",
    right_on = "id"
)

county_geo

SyntaxError: invalid syntax (<ipython-input-18-73a5a9e8fbbb>, line 10)

In [19]:
end = datetime.datetime.now()

d = (end - start)

f"The notebook ran for {round(d.total_seconds() / 60, 2) } minutes"

NameError: name 'datetime' is not defined

---

---

---