In [1]:
import fecfile
import pandas as pd
import datetime
import re

Time the notebook

In [2]:
start = datetime.datetime.now()

Set some viewing options

In [3]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 500)

Read dataframe of filings

In [4]:
filings = (
    pd.read_csv("../data/filings.csv")
)

filings.head(3)

Unnamed: 0,committee_id,filing_id,cycle,form_type,date_filed,date_coverage_to,date_coverage_from,report_title,report_period,fec_uri,paper,amended,amended_uri,is_amendment,original_filing,original_uri,committee_type,contributions_total,cash_on_hand,disbursements_total,receipts_total
0,C00401224,1190211,2016,F3,2017-11-03,2016-11-28,2016-10-20,POST-GENERAL,PG,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1190211/,False,False,,True,1132265.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1132265/,W,69306148.22,24023158.64,70473625.32,70884967.84
1,C00401224,1167570,2016,F3,2017-06-27,2016-10-19,2016-10-01,PRE-GENERAL,PREG,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1167570/,False,False,,True,1118321.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1118321/,W,38988748.3,23611816.12,36653226.26,41086481.96
2,C00401224,1166534,2016,F3,2017-06-19,2016-12-31,2016-11-29,YEAR-END,YE,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1166534/,False,False,,True,1144458.0,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1144458/,W,14665549.11,24071137.23,15378810.13,15426788.72


Get only the midyear

In [5]:
midyear = (
    filings
#     .loc[
#         lambda x: x['report_period'].str.contains("M", na=False) # if I need monthlies
#     ]
    .loc[
        lambda x: x["report_title"].str.contains("MID-YEAR", na = False)
    ]
    # remove filings that have been superceded by subsequent filings
    .loc[
        lambda x: x['amended'] == False
    ]
    [["filing_id",
        "date_filed",
        "date_coverage_to",
        "date_coverage_from",
        "report_title",
        "report_period",
        "fec_uri",
         "cycle"
     ]]
)

midyear.head(3)

Unnamed: 0,filing_id,date_filed,date_coverage_to,date_coverage_from,report_title,report_period,fec_uri,cycle
28,1034155,2015-11-23,2015-06-30,2015-06-23,MID-YEAR,,http://docquery.fec.gov/cgi-bin/dcdev/forms/C00401224/1034155/,2016


## Parse all the fec files with the filing list

### But first, more convenience functions..

This function, when given a filing ID, returns only the earmarked contributions from individual donors to candidates:

In [6]:
def extract_contributions(filing_id):
    filing = fecfile.from_file(f"../data/filings/{filing_id}.fec")
    meta = filing['filing']
    
    # get only schedule A
    schedule_a = pd.DataFrame(filing["itemizations"]["Schedule A"])
    # If I need to access the summary data
    #summary = pd.DataFrame(filing['summary'])
    
    # remove time zone
    schedule_a["contribution_date"] = schedule_a["contribution_date"].dt.tz_localize(None)
        
    return (
        schedule_a
        # Extract only individual contributions
        .loc[lambda df: df["entity_type"] == "IND"]
        # Remove memo lines
        .loc[lambda df: df["memo_code"] == ""]
        # Remove donations that are not for a specific candidate
        .loc[ lambda df: df["contribution_purpose_descrip"] == "Earmark" ]
        .assign(
            filing_id = str(filing_id),
        )
        [[
            "entity_type",
            "filer_committee_id_number",
            "filing_id",
            "transaction_id",
            "contribution_date",
            "contribution_amount",
            "contribution_aggregate",
            "contributor_organization_name",
            "contributor_first_name",
            "contributor_last_name",
            "contributor_street_1",
            "contributor_street_2",
            "contributor_state",
            "contributor_zip_code",
            "contributor_state",
            "contributor_employer",
            "contributor_occupation",
            "contribution_purpose_descrip",
            "memo_text_description"
        ]]
    )

Create a unique ID out of first name, last name and 5-digit ZIP code

In [7]:
def make_donor_ids(df):
    return (
        df
        .assign(
            donor_id = lambda df: (
                df
                .assign(
                    
                    zip5 = lambda df: (
                        df["contributor_zip_code"]
                        .fillna("-----")
                        .str.slice(0, 5)
                    )
                )
                [[
                    "contributor_first_name",
                    "contributor_last_name",
                    "zip5",
                ]]
                .apply(lambda x: (
                    x
                    .fillna("")
                    .astype(str)
                    # Remove periods, commas, extra whitespace
                    .str.replace(r"[\.,\s]+", " ")
                    .str.strip()
                    # Convert everything to upper-case
                    .str.upper()
                ))
                .apply("|".join, axis = 1)
            )
        )
    )

Add a donor name column

In [8]:
# regex for the commitee ID listed in the memo text
reg = re.compile("\w+(?=\))")

def get_donor_name(df):
    
    return (
        df
        .assign(
            committee_id = lambda frame: frame.apply(
                lambda row: row["memo_text_description"].split("(")[1].split(")")[0],
                axis = 1
            )
        )
    )

Concatenate all the filings data into one big DataFrame, and get individual contributions only

In [9]:
all_actblue = (
    pd
    .concat(
        [ extract_contributions(e) for e in midyear['filing_id'].tolist() ]
    )
    .pipe( make_donor_ids )
    .pipe( get_donor_name )
    
)

In [10]:
len(all_actblue)

277198

Write out before we merge with more candidate data

In [11]:
all_actblue.to_csv("../output/all_actblue.csv", index = False)

Get the candidate names using commitee IDs

In [12]:
candidates = pd.read_csv("../data/candidates.csv").rename(
    columns = {
        "Committee ID": "committee_id",
        "Committee Name": "committee_name",
        "Candidate Name": "candidate_name"
    }
)

candidates.head(3)

Unnamed: 0,candidate_name,committee_name,Candidate ID,committee_id
0,Cory Booker,Cory 2020,P00009795,C00695510
1,Kamala Harris,Kamala Harris for the People,P00009423,C00694455
2,Joe Biden,Biden for President,P80000722,C00703975


Make dummy candidate DataFrame

In [13]:
#regex for getting a name
name_reg = re.compile(r"(?<=for)[A-Z0-9\s]+")
# regex for getting a committee ID
id_reg = re.compile("\w+(?=\))")

def get_dummies(text):    
    # get the name
    committee_name = name_reg.findall(text)[0].strip()
    # get the commitee ID
    match = id_reg.search(text)
    
    if match: # if not None
        cid = match.group()
    else:
        return {}
    
    return { "committee_name": committee_name,
            "candidate_name": committee_name, # This is the same as committee name b/c dummy data
            "committee_id": cid 
           }

memo_strings = all_actblue["memo_text_description"].unique().tolist()
# For every unique memo string, extract the candidate IDs and drop everything else
dummy_candidates = pd.DataFrame(
    [ get_dummies(each) for each in memo_strings]
).dropna()

dummy_candidates.head(3)

Unnamed: 0,committee_name,candidate_name,committee_id
0,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,C00000935
1,DEMOCRATIC SENATORIAL CAMPAIGN COMMITTEE,DEMOCRATIC SENATORIAL CAMPAIGN COMMITTEE,C00042366
2,BERNIE 2016,BERNIE 2016,C00577130


Merge with candidates to get names

In [14]:
with_candidates = (
    all_actblue
    .merge(
        dummy_candidates, # CHANGE TO `candidates`
        on = "committee_id",
        how = "inner" #CHANGE TO LEFT
    )
)


# Visually inspect to make sure committee names and candidate names match
(
    with_candidates
    .groupby('candidate_name')
    .first()
    [['committee_name', 'memo_text_description', 'committee_id']]
)

Unnamed: 0_level_0,committee_name,memo_text_description,committee_id
candidate_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21ST CENTURY DEMOCRATS,21ST CENTURY DEMOCRATS,Earmarked for 21ST CENTURY DEMOCRATS (C00230342),C00230342
314 PAC,314 PAC,Earmarked for 314 PAC (C00567800),C00567800
ADAM CLAYTON POWELL FOR CONGRESS,ADAM CLAYTON POWELL FOR CONGRESS,Earmarked for ADAM CLAYTON POWELL FOR CONGRESS (C00542761),C00542761
ADVANTAGE 2020,ADVANTAGE 2020,Earmarked for ADVANTAGE 2020 (C00572966),C00572966
AKINYEMI AGBEDE FOR U,AKINYEMI AGBEDE FOR U,Earmarked for AKINYEMI AGBEDE FOR U.S. SENATE 2016 (C00573931),C00573931
AL FRANKEN FOR SENATE,AL FRANKEN FOR SENATE,Earmarked for AL FRANKEN FOR SENATE (C00480384),C00480384
ALAN LOWENTHAL FOR CONGRESS,ALAN LOWENTHAL FOR CONGRESS,Earmarked for ALAN LOWENTHAL FOR CONGRESS (C00498212),C00498212
ALEX LAW FOR CONGRESS,ALEX LAW FOR CONGRESS,Earmarked for ALEX LAW FOR CONGRESS (C00569335),C00569335
AMERICA,AMERICA,Earmarked for AMERICA'S YOUTH PAC (C00574475),C00574475
AMERICA WORKS PAC,AMERICA WORKS PAC,Earmarked for AMERICA WORKS PAC (C00331694),C00331694


Get the aggregate amount a donor has given to a particular candidate

In [15]:
latest_contribs = (
    with_candidates
    .sort_values('contribution_date')
    .groupby(['donor_id','candidate_name'])
    .pipe(lambda grp: pd.DataFrame({
        "latest_contribution_aggregate": grp["contribution_aggregate"].last(),
        })
    )
    .reset_index()
)

latest_contribs.sort_values(
    'latest_contribution_aggregate',
    ascending = False
).head(3)


Unnamed: 0,donor_id,candidate_name,latest_contribution_aggregate
122590,KAREN|LAWRENCE|22102,DEMOCRATIC CONGRESSIONAL CAMPAIGN COMMITTEE,60604.22
113739,JONATHAN|LAVINE|02421,JOE KENNEDY VICTORY FUND 2016,38800.0
99112,JEANNE|LAVINE|02421,JOE KENNEDY VICTORY FUND 2016,38800.0


Write out

In [16]:
latest_contribs.to_csv(
    "../output/latest_contribs.csv", 
    index = False
)

Merge with latest_contrib to get latest

In [17]:
clean_actblue = with_candidates.merge(
    latest_contribs,
    on = ["donor_id", 'candidate_name'],
    how = "left"
)

Write out

In [18]:
clean_actblue.to_csv(
    "../output/clean_actblue.csv", 
    index = False)

Time out

In [19]:
end = datetime.datetime.now()

d = (end - start)

f"The notebook ran for {round(d.total_seconds() / 60, 2) } minutes"

'The notebook ran for 2.29 minutes'

---

---

---