<center>
    <h1>Campaign Finance Capstone Project</h1>
    <h3>Exploratory Data Analysis</h3><br>
    <h5>Encompasses all federal races from 1990-2016</h5>
</center>

## Problem Statement:

To predict the amount of money that needs to be raised to win a congressional or senate race in the next election cycle, based on location and incumbency status.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
file_list = []
root_folder = '../data'

for file in os.listdir(root_folder):
    file_list.append(file.split('.csv')[0]) #import the filenames of all spreadsheets in dataset
    
file_list

['fec_api_committees',
 'pac_to_pacs',
 'individual_contributions',
 'committees',
 'pacs',
 'candidates',
 'backers',
 'industry_codes',
 'politicians',
 'pac_records']

In [3]:
# Read in all .csv files to their own DataFrames
fec_api_committees = pd.read_csv(f"{root_folder}/{file_list[0]}.csv")
pac_to_pacs = pd.read_csv(f"{root_folder}/{file_list[1]}.csv", low_memory=False)
individual_contributions = pd.DataFrame()
committees = pd.read_csv(f"{root_folder}/{file_list[3]}.csv")
pacs = pd.read_csv(f"{root_folder}/{file_list[4]}.csv", low_memory=False)
candidates = pd.read_csv(f"{root_folder}/{file_list[5]}.csv")
backers = pd.read_csv(f"{root_folder}/{file_list[6]}.csv")
industry_codes = pd.read_csv(f"{root_folder}/{file_list[7]}.csv")
politicians = pd.read_csv(f"{root_folder}/{file_list[8]}.csv")
pac_records = pd.read_csv(f"{root_folder}/{file_list[9]}.csv")

In [4]:
# List of DataFrames
DB_list = [fec_api_committees, pac_to_pacs, individual_contributions, committees, pacs, candidates,
           backers, industry_codes, politicians, pac_records]

In [5]:
def add_commas(number): # For DataFrame.shape formatting
    n = str(number)
    
    if len(n)>6:
        return n[:-6]+','+n[-6:-3]+','+n[-3:]
    elif len(n)>3:
        return n[:-3]+','+n[-3:]
    else:
        return number

---
# Overview of all spreadsheets in dataset
#### Contains a printout of the shapes of all DataFrames, as well as column headers. The individual contributions file is processed in a separate notebook.

In [6]:
for i, DB in enumerate(DB_list):
    print(str(i+1)+'. '+file_list[i])
    print(f"[{add_commas(DB.shape[0])} rows - {DB.shape[1]} cols]")
    print('--------')
    print(list(DB.columns))
    print()

1. fec_api_committees
[45,507 rows - 13 cols]
--------
['id', 'cid', 'created_at', 'updated_at', 'cycle', 'individual_unitemized_contributions', 'individual_itemized_contributions', 'individual_contributions', 'designation', 'organization_type', 'name', 'committee_id', 'committee_type']

2. pac_to_pacs
[1,083,525 rows - 25 cols]
--------
['id', 'cycle', 'fec_rec_no', 'filer_id', 'donor_committee', 'contrib_lend_trans', 'city', 'state', 'zip', 'fec_occ_emp', 'prim_code', 'date', 'amount', 'recipient_id', 'party', 'other_id', 'recip_code', 'recip_prim_code', 'amend', 'report', 'pg', 'microfilm', 'type', 'real_code', 'source']

3. individual_contributions
[0 rows - 0 cols]
--------
[]

4. committees
[157,542 rows - 15 cols]
--------
['id', 'cycle', 'committee_id', 'pac_short', 'affiliate', 'ultorg', 'recip_id', 'recip_code', 'fec_cand_id', 'party', 'prim_code', 'source', 'sensitive', 'foreign_owned', 'active']

5. pacs
[3,539,657 rows - 11 cols]
--------
['id', 'cycle', 'fec_rec_no', 'pac

---
# The Candidates
#### Quick data cleaning and filtering by key variables -- political party, win/lose, campaign level (Congressional, Senate, Presidential), and total money raised

In [7]:
candidates['first_last_party'] = [str(name)[0:-4] for name in candidates['first_last_party']]

In [8]:
set([code[-1] for code in candidates['recip_code']])

{' ', 'C', 'I', 'L', 'N', 'O', 'W', 'l', 'w'}

In [9]:
set([code for code in candidates['crpico']])

{' ', '3', 'C', 'I', 'N', 'O', 'R', 'U', 'Y'}

In [10]:
incumbent = []

for code in candidates['crpico']:
    letter = str(code).upper()[-1]
    if letter == 'I': #incumbent
        incumbent.append('y')
    elif letter in ['C', 'O', 'N']: #challenger, open-seat, or non-incumbent
        incumbent.append('n')
    else:
        incumbent.append(np.nan)
        
candidates['incumbent'] = incumbent

In [11]:
win = []

for code in candidates['recip_code']:
    letter = str(code).upper()[-1]
    if letter == 'W': #won
        win.append(1)
    elif letter == 'L': #lost
        win.append(0)
    else:
        win.append(np.nan)
        
candidates['win'] = win

In [12]:
campaign = []

for dist in candidates['dist_id_run_for']:
    if str(dist) == 'PRES':
        campaign.append('PRES')
    elif str(dist)[-2] == 'S':
        campaign.append('SEN')
    else:
        campaign.append('REP')
        
candidates['campaign'] = campaign

In [13]:
# Percent of Congressional, Senate, and Presidential Races
candidates['campaign'].value_counts(normalize=True)

REP     0.752438
SEN     0.157841
PRES    0.089722
Name: campaign, dtype: float64

In [14]:
cols_to_filter = [
    'cycle', 'campaign', 'first_last_party', 'party', 'incumbent', 'dist_id_run_for', 'win', 'raised_total']

In [15]:
# Winning Republican Congressional Candidatess
candidates[
    (candidates['campaign']=='REP')&(candidates['win']==1)&(candidates['party']=='R')
][cols_to_filter].sort_values(by='cycle', ascending=False)

Unnamed: 0,cycle,campaign,first_last_party,party,incumbent,dist_id_run_for,win,raised_total
7623,2014,REP,John J. Duncan Jr,R,y,TN02,1.0,459931
22437,2014,REP,Bruce Westerman,R,n,AR04,1.0,1104034
22381,2014,REP,Bob Gibbs,R,y,OH07,1.0,1074371
22386,2014,REP,Jim Jordan,R,y,OH04,1.0,933080
22393,2014,REP,Thomas Massie,R,y,KY04,1.0,629627
...,...,...,...,...,...,...,...,...
44991,1990,REP,Gary A Franks,R,n,CT05,1.0,445848
44996,1990,REP,Richard D Nichols,R,n,KS05,1.0,310341
45035,1990,REP,Norman F Lent,R,y,NY04,1.0,458103
19714,1990,REP,Andy Ireland,R,y,FL10,1.0,477706


In [16]:
# Losing Republican Congressional Candidatess
candidates[
    (candidates['campaign']=='REP')&(candidates['win']==0)&(candidates['party']=='R')
][cols_to_filter].sort_values(by='cycle', ascending=False)

Unnamed: 0,cycle,campaign,first_last_party,party,incumbent,dist_id_run_for,win,raised_total
575,2016,REP,Daniel Sparks,R,n,MS01,0.0,18133
41439,2016,REP,Mike Tagert,R,n,MS01,0.0,448785
9364,2016,REP,Ed Holliday,R,n,MS01,0.0,122627
13750,2016,REP,Starner Jones,R,n,MS01,0.0,140858
10875,2016,REP,Quentin Whitwell,R,n,MS01,0.0,213814
...,...,...,...,...,...,...,...,...
16196,1990,REP,Karl N Snow Jr,R,n,UT03,0.0,362177
6884,1990,REP,William Joseph (Dub) Lawrence,R,n,UT01,0.0,410
16238,1990,REP,J Kenneth Blackwell,R,n,OH01,0.0,1052797
6840,1990,REP,Robert F Rivard Jr,R,n,GA07,0.0,5005


In [17]:
# Winning Democratic Congressional Candidatess
candidates[
    (candidates['campaign']=='REP')&(candidates['win']==1)&(candidates['party']=='D')
][cols_to_filter].sort_values(by='cycle', ascending=False)

Unnamed: 0,cycle,campaign,first_last_party,party,incumbent,dist_id_run_for,win,raised_total
13767,2014,REP,Elizabeth Esty,D,y,CT05,1.0,2992818
43229,2014,REP,Dutch Ruppersberger,D,y,MD02,1.0,906371
43248,2014,REP,Alma Adams,D,n,NC12,1.0,975536
13813,2014,REP,Debbie Wasserman Schultz,D,y,FL23,1.0,2543524
13821,2014,REP,Norma Torres,D,n,CA35,1.0,476700
...,...,...,...,...,...,...,...,...
8514,1990,REP,Gary A Condit,D,y,CA15,1.0,1027198
19712,1990,REP,JJ Pickle,D,y,TX10,1.0,623918
19672,1990,REP,Ron de Lugo,D,y,VI00,1.0,23650
19670,1990,REP,Tom Lantos,D,y,CA11,1.0,875093


In [18]:
# Winning Republican Senate Candidatess
candidates[
    (candidates['campaign']=='SEN')&(candidates['win']==1)&(candidates['party']=='R')
][cols_to_filter].sort_values(by='cycle', ascending=False)

Unnamed: 0,cycle,campaign,first_last_party,party,incumbent,dist_id_run_for,win,raised_total
51060,2014,SEN,Steven Daines,R,n,MTS2,1.0,5303023
19113,2014,SEN,Joni Ernst,R,n,IAS2,1.0,16206720
26384,2014,SEN,Lamar Alexander,R,y,TNS2,1.0,6229500
43243,2014,SEN,Shelley Moore Capito,R,n,WVS2,1.0,6868714
6070,2014,SEN,Lindsey Graham,R,y,SCS2,1.0,7995142
...,...,...,...,...,...,...,...,...
23574,1990,SEN,Nancy Landon Kassebaum,R,y,KSS1,1.0,678346
46993,1990,SEN,Bob Smith,R,n,NHS2,1.0,1847753
31578,1990,SEN,Larry Pressler,R,y,SDS2,1.0,2673415
38266,1990,SEN,Strom Thurmond,R,y,SCS2,1.0,1458327


In [19]:
# Winning Democratic Senate Candidatess
candidates[
    (candidates['campaign']=='SEN')&(candidates['win']==1)&(candidates['party']=='D')
][cols_to_filter].sort_values(by='cycle', ascending=False)

Unnamed: 0,cycle,campaign,first_last_party,party,incumbent,dist_id_run_for,win,raised_total
767,2014,SEN,Mark Warner,D,y,VAS2,1.0,14894451
17182,2014,SEN,Al Franken,D,y,MNS1,1.0,6884966
31483,2014,SEN,Brian Schatz,D,y,HIS1,1.0,6558313
28589,2014,SEN,Jack Reed,D,y,RIS2,1.0,3528287
13983,2014,SEN,Chris Coons,D,y,DES2,1.0,5042428
...,...,...,...,...,...,...,...,...
31583,1990,SEN,Max Baucus,D,y,MTS2,1.0,2422886
43305,1990,SEN,J Bennett Johnston,D,y,LAS1,1.0,5515776
43306,1990,SEN,Joseph R Biden Jr,D,y,DES2,1.0,1506946
4053,1990,SEN,Carl Levin,D,y,MIS1,1.0,9923387


In [20]:
# If Win is Null, Dropouts ?
candidates[candidates['win'].isna()].sort_values(by='cycle', ascending=False)[cols_to_filter]

Unnamed: 0,cycle,campaign,first_last_party,party,incumbent,dist_id_run_for,win,raised_total
67998,2016,REP,Mario Diaz-Balart,R,y,FL25,,455427
65218,2016,PRES,Timothy Lee Bearson,R,,PRES,,0
65198,2016,REP,Terry Bowman,R,,MI12,,0
65199,2016,REP,Horace Sheffield,D,,MI13,,0
65200,2016,REP,Steve Dunwoody,D,,MI14,,0
...,...,...,...,...,...,...,...,...
52294,1990,REP,Blyth William Daylong,D,,Pres,,0
52295,1990,REP,Rickey Lynn Swinney,U,,Pres,,0
52296,1990,REP,James Mercer Beasley,I,,Pres,,0
52297,1990,REP,Robert Derwood (Buck) Ladner,D,,Pres,,0


In [21]:
candidates['raised_mil'] = candidates['raised_total'] / 1_000_000

In [22]:
# The Incumbency Boost is Extremely Powerful
candidates[
    (candidates['party']=='D') | (candidates['party']=='R')].groupby(by=[
        'campaign', 'party', 'incumbent'])[['win', 'raised_mil']].mean().sort_values(by='incumbent', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,win,raised_mil
campaign,party,incumbent,Unnamed: 3_level_1,Unnamed: 4_level_1
PRES,D,y,1.0,54.048979
PRES,R,y,0.25,18.85991
REP,D,y,0.929304,0.983787
REP,R,y,0.94586,1.083461
SEN,D,y,0.90625,2.770919
SEN,R,y,0.868545,2.367938
PRES,D,n,0.111111,7.7644
PRES,R,n,0.018182,6.042874
REP,D,n,0.05515,0.201037
REP,R,n,0.061595,0.179065


---
# Political Action Committees

In [23]:
# Top corporate donors by millions of dollars
pac_to_pacs['amount_mil'] = pac_to_pacs['amount']/1_000_000
pac_to_pacs.groupby(by='donor_committee')[['amount_mil']].mean().sort_values(by='amount_mil', ascending=False).head(20)

Unnamed: 0_level_0,amount_mil
donor_committee,Unnamed: 1_level_1
"BANK OF AMERICA, NA",12.0
BANK OF GEORGETOWN,6.666667
AMALGAMATED BANK OF NEW YORK,6.166667
WELLS FARGO,6.0
SUMMIT BANK,5.781316
"SMITH, THOMAS",4.225
"SOROS, GEORGE",3.275
"Feinstein, Dianne",3.18
"Watts, Mikal",2.797334
"BING, STEVE",2.746194
