In [1]:
import fecfile
import pandas as pd
import datetime
import re

Time the notebook

In [2]:
# Wall-clock timestamp taken before any work is done; the last cell
# subtracts it from a second timestamp to report the total run time.
start = datetime.datetime.now()

Set some viewing options

In [3]:
# Widen pandas' display limits so long memo strings and wide/long frames
# are shown in full in the cell outputs below.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 40)
pd.set_option("display.max_colwidth", 200)

## Parse all the raw FEC numbers we wrote to CSV

### But first, some convenience functions...

This function, when given a filing ID, loads that filing's Schedule A CSV and returns only the earmarked contributions from individual donors (memo lines are excluded):

In [4]:
def extract_contributions(filing_id):
    """
    Load one filing's Schedule A CSV and keep the earmarked individual gifts.

    Reads `../data/filings/{filing_id}-schedule-a.csv`, parses
    `contribution_date`, and returns the rows where:

    - `entity_type` is "IND",
    - `memo_code` is null, and
    - `contribution_purpose_descrip` is "Earmark".

    A `filing_id` column holding `str(filing_id)` is added to the result.
    """
    # Every column read as text; the two amount columns are added as floats below.
    text_columns = [
        "entity_type",
        "filer_committee_id_number",
        "transaction_id",
        "contributor_organization_name",
        "contributor_first_name",
        "contributor_last_name",
        "contributor_street_1",
        "contributor_street_2",
        "contributor_city",
        "contributor_zip_code",
        "contributor_state",
        "contributor_employer",
        "contributor_occupation",
        "contribution_purpose_descrip",
        "memo_code",
        "memo_text_description",
    ]
    column_types = {name: str for name in text_columns}
    column_types["contribution_amount"] = float
    column_types["contribution_aggregate"] = float

    sched_a = pd.read_csv(
        f"../data/filings/{filing_id}-schedule-a.csv",
        parse_dates=["contribution_date"],
        dtype=column_types,
    )

    # Drop any time zone information from the parsed dates
    sched_a["contribution_date"] = sched_a["contribution_date"].dt.tz_localize(None)

    is_individual = sched_a["entity_type"] == "IND"
    is_not_memo = sched_a["memo_code"].isnull()
    is_earmark = sched_a["contribution_purpose_descrip"] == "Earmark"

    kept = sched_a.loc[is_individual & is_not_memo & is_earmark]
    return kept.assign(filing_id=str(filing_id))

Create a unique ID out of first name, last name and 5-digit ZIP code

In [5]:
def make_donor_ids(df):
    """
    Return a copy of `df` with a `donor_id` column added.

    The ID is `FIRST|LAST|ZIP5`, built from `contributor_first_name`,
    `contributor_last_name` and the first five characters of
    `contributor_zip_code` (a missing ZIP becomes "-----"). Each part has
    runs of periods, commas and whitespace collapsed to a single space, is
    stripped, and is upper-cased. Missing names become empty strings.

    No helper column (such as the 5-character ZIP) is left on the result.
    """
    def _normalize(column):
        # Clean one ID component: text, collapsed punctuation/whitespace, upper-case.
        return (
            column
            .fillna("")
            .astype(str)
            # regex=True must be explicit: since pandas 2.0 the default is
            # regex=False, which would treat the pattern as literal text and
            # silently skip the clean-up.
            .str.replace(r"[\.,\s]+", " ", regex=True)
            .str.strip()
            .str.upper()
        )

    def _build_ids(frame):
        # Assemble the pipe-delimited ID with vectorised string operations.
        zip5 = (
            frame["contributor_zip_code"]
            .fillna("-----")
            .str.slice(0, 5)
        )
        # Column-wise concatenation avoids a Python-level call per row and
        # still yields a Series when the frame is empty.
        return (
            _normalize(frame["contributor_first_name"])
            + "|"
            + _normalize(frame["contributor_last_name"])
            + "|"
            + _normalize(zip5)
        )

    return df.assign(donor_id=_build_ids)

Extract the recipient committee's ID and name from the earmark memo text

In [6]:
def extract_committee_id(df):
    """
    Return a copy of `df` with two columns parsed from `memo_text_description`.

    - `committee_id`: the `C` + 8 digits found inside parentheses.
    - `actblue_committee_name`: the text between a leading "Earmarked for "
      and that parenthesised ID, stripped of surrounding whitespace.

    Rows whose memo does not match a pattern get a null in that column.
    """
    memo = df["memo_text_description"]

    parsed_id = memo.str.extract(r"\((C\d{8})\)", expand=False)
    parsed_name = memo.str.extract(r"^Earmarked for (.*)?\(C\d{8}\)", expand=False).str.strip()

    return df.assign(
        committee_id=parsed_id,
        actblue_committee_name=parsed_name,
    )

Load and make ids

In [7]:
# Load the filing, then attach donor IDs and the recipient committee fields.
all_actblue = extract_committee_id(
    make_donor_ids(
        extract_contributions("1344765")
    )
)

all_actblue.head()

Unnamed: 0,entity_type,filer_committee_id_number,transaction_id,contribution_date,contribution_amount,contribution_aggregate,contributor_organization_name,contributor_first_name,contributor_last_name,contributor_street_1,contributor_street_2,contributor_city,contributor_zip_code,contributor_state,contributor_employer,contributor_occupation,contribution_purpose_descrip,memo_code,memo_text_description,filing_id,donor_id,committee_id,actblue_committee_name
256,IND,C00401224,SA11AI_146307598,2019-01-31,4.0,24.0,,SUSAN,A - A (MYSELF) SUSAN WEBB,195 LARPENTEUR AVE W MARION ST APA,,ROSEVILLE,55113,MN,APPLE,RETAIL,Earmark,,Earmarked for DCCC (C00000935),1344765,SUSAN|A - A (MYSELF) SUSAN WEBB|55113,C00000935,DCCC
257,IND,C00401224,SA11AI_148902257,2019-02-28,4.0,24.0,,SUSAN,A - A (MYSELF) SUSAN WEBB,195 LARPENTEUR AVE W MARION ST APA,,ROSEVILLE,55113,MN,APPLE,RETAIL,Earmark,,Earmarked for DCCC (C00000935),1344765,SUSAN|A - A (MYSELF) SUSAN WEBB|55113,C00000935,DCCC
260,IND,C00401224,SA11AI_152480924,2019-03-31,4.0,24.0,,SUSAN,A - A (MYSELF) SUSAN WEBB,195 LARPENTEUR AVE W MARION ST APA,,ROSEVILLE,55113,MN,APPLE,RETAIL,Earmark,,Earmarked for DCCC (C00000935),1344765,SUSAN|A - A (MYSELF) SUSAN WEBB|55113,C00000935,DCCC
261,IND,C00401224,SA11AI_156075790,2019-04-30,4.0,24.0,,SUSAN,A - A (MYSELF) SUSAN WEBB,195 LARPENTEUR AVE W MARION ST APA,,ROSEVILLE,55113,MN,APPLE,RETAIL,Earmark,,Earmarked for DCCC (C00000935),1344765,SUSAN|A - A (MYSELF) SUSAN WEBB|55113,C00000935,DCCC
264,IND,C00401224,SA11AI_159499460,2019-05-31,4.0,24.0,,SUSAN,A - A (MYSELF) SUSAN WEBB,195 LARPENTEUR AVE W MARION ST APA,,ROSEVILLE,55113,MN,APPLE,RETAIL,Earmark,,Earmarked for DCCC (C00000935),1344765,SUSAN|A - A (MYSELF) SUSAN WEBB|55113,C00000935,DCCC


In [8]:
# Number of earmarked individual contributions loaded
all_actblue.shape[0]

11470132

Note: Some committee IDs are either missing or cut off.

In [9]:
# Memo texts from which no committee name could be parsed, most frequent first
all_actblue.loc[
    all_actblue["actblue_committee_name"].isnull(),
    "memo_text_description",
].value_counts()

Earmarked for CHC BOLD PAC/COMMITTEE FOR HISPANIC CAUSES BUILDING OUR LEADERSHIP DIVERSITY (C0036553    105743
Earmarked for SARA GIDEON FOR MAINE ()                                                                   18141
Earmarked for MJ FOR TEXAS ()                                                                            14450
Earmarked for Democratic Nominee for -. Held pursuant to AOs 1977-16 and 1982-23                         13378
Earmarked for SINEMA FOR ARIZONA ()                                                                      11769
Earmarked for Democratic Nominee for KY-. Held pursuant to AOs 1977-16 and 1982-23                       11069
Earmarked for Democratic Nominee for ME-. Held pursuant to AOs 1977-16 and 1982-23                        9896
Earmarked for Democratic Nominee for CO-. Held pursuant to AOs 1977-16 and 1982-23                        8703
Earmarked for Democratic Nominee for NC-. Held pursuant to AOs 1977-16 and 1982-23                        8087
E

Note: In some cases, the same `committee_id` is associated with more than one committee name in the earmarks.

In [10]:
# Largest number of distinct earmark names seen for a single committee ID
all_actblue.groupby("committee_id")["actblue_committee_name"].nunique().max()

4

In [11]:
# List every (committee_id, name) pair for committees that appear under
# more than one name in the earmark memos.
(
    all_actblue[["committee_id", "actblue_committee_name"]]
    .drop_duplicates()
    .loc[lambda pairs: pairs["committee_id"].isin(
        pairs
        .groupby("committee_id")["actblue_committee_name"]
        .nunique()
        .loc[lambda name_count: name_count > 1]
        .index
    )]
    .sort_values(["committee_id", "actblue_committee_name"])
)

Unnamed: 0,committee_id,actblue_committee_name
256,C00000935,DCCC
1727802,C00000935,DCCC RECOUNT FUND
6014,C00005561,DEMOCRATIC EXECUTIVE COMMITTEE OF FLORIDA - FEDERAL ACCOUNT
327836,C00005561,FLORIDA DEMOCRATIC PARTY RECOUNT FUND - FEDERAL
6279,C00010033,MONTANA DEMOCRATIC LEGISLATIVE CAMPAIGN COMMITTEE
45295,C00010033,MONTANA STATE DEMOCRATIC CENTRAL COMMITTEE - FEDERAL ACCOUNT
8615742,C00010603,COLLEGE DEMOCRATS OF AMERICA / DNC
312,C00010603,DEMOCRATIC NATIONAL COMMITTEE
252279,C00010603,GAY & LESBIAN LEADERSHIP COUNCIL/DNC
30515,C00019331,DEMOCRATIC PARTY OF WISCONSIN - FEDERAL ACCOUNT


In [12]:
# Each parsed committee name must map to exactly one committee ID
assert all_actblue.groupby("actblue_committee_name")["committee_id"].nunique().max() == 1

Get the candidate names using committee IDs

In [14]:
# Candidate lookup table, with the join/display columns renamed to snake_case
candidates = (
    pd.read_csv("../data/candidates.csv")
    .rename(columns = {
        "Committee ID": "committee_id",
        "Committee Name": "committee_name",
        "Candidate Name": "candidate_name",
    })
)

candidates.head(3)

Unnamed: 0,candidate_name,committee_name,Candidate ID,committee_id
0,Cory Booker,Cory 2020,P00009795,C00695510
1,Kamala Harris,Kamala Harris for the People,P00009423,C00694455
2,Joe Biden,Biden for President,P80000722,C00703975


Merge with candidates to get names. First, list the candidates whose committee IDs never appear in the ActBlue earmarks:

In [15]:
# Candidates whose committee ID is absent from the earmarked contributions
candidates[~candidates["committee_id"].isin(all_actblue["committee_id"].unique())]

Unnamed: 0,candidate_name,committee_name,Candidate ID,committee_id
23,Donald Trump,Donald J. Trump for President,P80001571,C00580100
25,Tom Steyer,Tom Steyer 2020,P00012716,C00711614


In [16]:
# Attach every earmarked contribution to its candidate. Only candidates whose
# committee appears in the earmarks are kept; validate guards against a
# committee ID being listed for more than one candidate row.
with_candidates = pd.merge(
    candidates[candidates["committee_id"].isin(all_actblue["committee_id"].unique())],
    all_actblue,
    on = "committee_id",
    how = "left",
    validate = "1:m",
)

# Visually inspect to make sure committee names and candidate names match
with_candidates[["committee_name", "actblue_committee_name", "committee_id"]].drop_duplicates()

Unnamed: 0,committee_name,actblue_committee_name,committee_id
0,Cory 2020,CORY 2020,C00695510
140043,Kamala Harris for the People,KAMALA HARRIS FOR THE PEOPLE,C00694455
644793,Biden for President,BIDEN FOR PRESIDENT,C00703975
1067513,Amy for America,AMY FOR AMERICA,C00696419
1204475,Bernie 2020,BERNIE 2020,C00696948
3025194,Warren for President,"WARREN FOR PRESIDENT, INC.",C00693234
3837822,Pete for America,"PETE FOR AMERICA, INC",C00697441
4462026,Beto for America,BETO FOR AMERICA,C00699090
4780573,Julián for the Future,JULIAN FOR THE FUTURE,C00693044
4950853,Friends of John Delaney,FRIENDS OF JOHN DELANEY,C00508416


Get the aggregate amount a donor has given to a particular candidate

In [17]:
# For each donor/candidate pair, take the aggregate reported on the donor's
# most recent contribution.
latest_contribs = (
    with_candidates
    # A single-column sort defaults to quicksort, which is not stable: rows
    # sharing a contribution_date could come out in arbitrary order, making
    # "last" ambiguous. mergesort keeps same-day rows in their existing order.
    .sort_values("contribution_date", kind = "mergesort")
    .groupby(["donor_id", "candidate_name"])
    ["contribution_aggregate"]
    # GroupBy.last() yields the last non-null value within each group
    .last()
    .rename("latest_contribution_aggregate")
    .reset_index()
)

latest_contribs.sort_values(
    "latest_contribution_aggregate",
    ascending = False
).head(3)


Unnamed: 0,donor_id,candidate_name,latest_contribution_aggregate
1784359,LARRY|ROCKEFELLER|10020,Joe Biden,28000.0
1625558,KATHARINE|RAYNER|10021,Pete Buttigieg,20800.0
2184389,MICHAEL|LEWIS|90210,Joe Biden,19600.0


Write out

In [18]:
# Save the per-donor, per-candidate latest aggregates (row index omitted)
latest_contribs.to_csv("../output/latest_contribs.csv", index = False)

Merge with `latest_contribs` to attach each donor's latest aggregate for the candidate to every contribution row

In [19]:
# Add latest_contribution_aggregate to every contribution row.
# validate="m:1" makes the merge fail loudly if a (donor_id, candidate_name)
# key is ever duplicated in latest_contribs, instead of silently multiplying
# contribution rows in the output.
clean_actblue = with_candidates.merge(
    latest_contribs,
    on = ["donor_id", "candidate_name"],
    how = "left",
    validate = "m:1",
)

Write out

In [20]:
# Save the full contribution-level table (row index omitted)
clean_actblue.to_csv("../output/clean_actblue.csv", index = False)

Time out

In [21]:
# Stop the clock and report the elapsed wall-clock time in minutes
end = datetime.datetime.now()
d = end - start

f"The notebook ran for {round(d.total_seconds() / 60, 2) } minutes"

'The notebook ran for 20.18 minutes'

---

---

---