<center><h1>Campaign Finance Capstone: PACs</h1></center>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

import warnings
warnings.filterwarnings('ignore')

### Import Files

In [2]:
# Read one PAC-donation table per election cycle, newest cycle first.
# `map` is consumed by the tuple unpacking, so the files are read in the
# order listed.
(pacs_2014, pacs_2012, pacs_2010, pacs_2008,
 pacs_2006, pacs_2004, pacs_2002, pacs_2000) = map(pd.read_csv, [
    '../data/pacs/2014_pac_donations.csv',
    '../data/pacs/2012_pac_donations.csv',
    '../data/pacs/2010_pac_donations.csv',
    '../data/pacs/2008_pac_donations.csv',
    '../data/pacs/2006_pac_donations.csv',
    '../data/pacs/2004_pac_donations.csv',
    '../data/pacs/2002_pac_donations.csv',
    '../data/pacs/2000_pac_donations.csv',
])

In [3]:
# Stack the per-cycle donation tables into one frame. `DataFrame.append` was
# deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is the supported
# equivalent and, like `append`, keeps each frame's original row labels.
# NOTE(review): pacs_2002 and pacs_2000 are loaded but not combined here —
# confirm that excluding those cycles is intentional.
pacs = pd.concat([pacs_2014, pacs_2012, pacs_2010, pacs_2008, pacs_2006, pacs_2004])

# Display donations from largest to smallest (the sorted frame is not stored).
pacs.sort_values(by='amount', ascending=False)

Unnamed: 0,cycle,date_donated,pac_id,pac_name,amount,candidate_id,industry_code,type,recip_code,direct
281022,2012,8/14/12,C00490045,Restore Our Future PAC,9856374,N00009638,J1100,24A,OI,I
281023,2012,8/21/12,C00490045,Restore Our Future PAC,9856374,N00009638,J1100,24A,OI,I
266349,2012,10/26/12,C00487363,American Crossroads,9806703,N00009638,J1100,24A,OI,I
268499,2012,10/2/12,C00487363,American Crossroads,9071481,N00009638,J1100,24A,OI,I
267759,2008,10/23/08,C00003418,Republican National Cmte,9006927,N00006424,Z5100,24C,RP,I
...,...,...,...,...,...,...,...,...,...,...
266182,2010,9/1/09,C00003418,Republican National Cmte,-140000,N00006424,Z5100,24C,RP,I
76616,2004,10/20/04,C00075820,National Republican Congressional Cmte,-150925,N00005233,Z5100,24A,RP,I
9995,2004,9/21/04,C00010603,DNC Services Corp,-152029,N00000245,Z5200,24C,DP,I
263560,2012,12/21/11,C00030718,National Assn of Realtors,-207732,N00007099,F4200,24E,PB,I


Negative numbers are just refunds.

In [4]:
# Load the raw candidate table and display its (rows, columns) shape.
raw_data = pd.read_csv('../data/RAW_candidates.csv')
raw_data.shape

(67999, 17)

In [5]:
# Strip the trailing 4 characters from every name (presumably a party tag such
# as " (D)" — TODO confirm against the raw file). The `.str` accessor is
# vectorised and leaves a missing name as NaN instead of raising.
# NOTE: this rewrites the column in place, so re-running the cell strips a
# further 4 characters; reload `raw_data` before running it again.
raw_data['first_last_party'] = raw_data['first_last_party'].str[:-4]

# Save a copy of just the candidate names and corresponding id numbers,
# keeping the first row seen for each candidate id.
cid_lookup = (
    raw_data[['cid', 'first_last_party']]
    .drop_duplicates('cid')
    .rename(columns={'cid': 'id', 'first_last_party': 'candidate_name'})
)
cid_lookup.head()

Unnamed: 0,id,candidate_name
0,N00005009,Richmond A Soluade Sr
1,N00000948,Lenora B Fulani
2,N00004126,Mark Alan Behnke
3,N00004451,Stephen Bonsal Young
4,N00001670,Raymond J Clatworthy


In [6]:
# Number of unique candidate ids (rows) and lookup columns.
cid_lookup.shape

(26047, 2)

In [7]:
# Persist the id -> name lookup table; the row index is not written.
cid_lookup.to_csv('../data/cid_lookup.csv', index=False)

### Separate party committees & candidate committees from PACs

In [8]:
def candidate_cmte(recip_code):
    """Return 1 if ``recip_code`` marks a candidate committee, otherwise 0.

    The value is converted with ``str()`` first, so non-string inputs such as
    NaN are accepted and yield 0. A code counts as a candidate committee when
    it starts with 'D' or 'R' and does not end with 'P'. The comparison is
    case-sensitive, so callers should upper-case codes beforehand.
    """
    code = str(recip_code)
    # Logical `and` (not bitwise `&`) short-circuits and avoids precedence traps.
    is_candidate = code.startswith(('D', 'R')) and not code.endswith('P')
    return int(is_candidate)

In [9]:
# Upper-case the recipient codes so the prefix/suffix checks below see a
# single, consistent casing.
pacs['recip_code'] = pacs['recip_code'].str.upper()

# Create new column to tag party committees: 1 when the code ends in 'P',
# 0 otherwise (missing codes count as 0).
pacs['party_cmte'] = pacs['recip_code'].str.endswith('P', na=False).astype(int)

# Create new column to tag candidate committees
pacs['candidate_cmte'] = pacs['recip_code'].map(candidate_cmte)

In [10]:
# Display the rows tagged as party committees, largest amount first.
pacs.loc[pacs['party_cmte'].eq(1)].sort_values('amount', ascending=False)

Unnamed: 0,cycle,date_donated,pac_id,pac_name,amount,candidate_id,industry_code,type,recip_code,direct,party_cmte,candidate_cmte
267759,2008,10/23/08,C00003418,Republican National Cmte,9006927,N00006424,Z5100,24C,RP,I,1,0
14249,2004,10/8/04,C00010603,DNC Services Corp,8100000,N00008072,Z5200,24A,DP,I,1,0
13427,2004,7/28/04,C00010603,DNC Services Corp,8000000,N00000245,Z5200,24E,DP,I,1,0
10933,2004,8/26/04,C00010603,DNC Services Corp,7904589,N00008072,Z5200,24A,DP,I,1,0
11865,2004,10/1/04,C00010603,DNC Services Corp,7900000,N00008072,Z5200,24A,DP,I,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
12385,2004,12/6/04,C00010603,DNC Services Corp,-112165,N00000245,Z5200,24E,DP,I,1,0
266182,2010,9/1/09,C00003418,Republican National Cmte,-140000,N00006424,Z5100,24C,RP,I,1,0
76616,2004,10/20/04,C00075820,National Republican Congressional Cmte,-150925,N00005233,Z5100,24A,RP,I,1,0
9995,2004,9/21/04,C00010603,DNC Services Corp,-152029,N00000245,Z5200,24C,DP,I,1,0


In [11]:
# Display the rows tagged as candidate committees, largest amount first.
pacs.loc[pacs['candidate_cmte'].eq(1)].sort_values('amount', ascending=False)

Unnamed: 0,cycle,date_donated,pac_id,pac_name,amount,candidate_id,industry_code,type,recip_code,direct,party_cmte,candidate_cmte
99696,2008,8/28/08,C00431569,Hillary Clinton for President,6466214,N00000019,Z9999,24K,DN,D,0,1
162613,2008,3/30/07,C00358895,Friends of Hillary,4750000,N00000019,Z9999,24K,DO,D,0,1
53239,2008,3/30/07,C00347310,Friends of Chris Dodd,4739005,N00000581,Z9999,24K,DN,D,0,1
158498,2008,3/30/07,C00358895,Friends of Hillary,4000000,N00000019,Z9999,24K,DO,D,0,1
246646,2010,3/31/10,C00419291,Sestak for Congress,3027534,N00028049,Z9999,24K,DL,D,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
53968,2008,6/30/07,C00243428,Andrews for Congress Cmte,-5000,N00025450,Z1200,24K,DW,D,0,1
7686,2008,7/12/07,C00265389,Judd Gregg Cmte,-5000,N00007815,Z1100,24K,RI,D,0,1
5589,2014,11/13/13,C00197152,Citizens for Bunning,-5000,N00000286,Z1100,24K,RW,D,0,1
10326,2010,1/15/09,C00443762,Carraro for Congress,-17200,N00015616,Z9999,24K,RN,D,0,1


In [12]:
# Keep only rows that are neither party committees nor candidate committees.
# `.copy()` gives an independent frame, so the column assignment below does
# not write into a slice of `pacs` (SettingWithCopy hazard).
all_pacs = pacs[(pacs['party_cmte'] == 0) & (pacs['candidate_cmte'] == 0)].copy()

# Parse donation dates into datetimes.
all_pacs['date_donated'] = pd.to_datetime(all_pacs['date_donated'])

# Renumber rows from zero (discarding the old labels) and drop the helper
# flags plus columns that are not used further on.
all_pacs = all_pacs.reset_index(drop=True)
all_pacs = all_pacs.drop(columns=['party_cmte', 'candidate_cmte', 'industry_code', 'type', 'direct'])

In [13]:
# Add columns for PAC issue set
all_pacs['business'] = [1 if str(code).endswith('B') else 0 for code in all_pacs['recip_code']]
all_pacs['labor'] = [1 if str(code).endswith('L') else 0 for code in all_pacs['recip_code']]
all_pacs['issues'] = [1 if str(code).endswith('I') else 0 for code in all_pacs['recip_code']]
all_pacs = all_pacs.drop(columns='recip_code')

In [14]:
# Display the cleaned PAC donations table.
all_pacs

Unnamed: 0,cycle,date_donated,pac_id,pac_name,amount,candidate_id,business,labor,issues
0,2014,2014-09-05,C00485250,America's Natural Gas Alliance,1000,N00031647,1,0,0
1,2014,2013-12-18,C00030718,National Assn of Realtors,1000,N00013323,1,0,0
2,2014,2013-04-04,C00158576,International Longshoremens Assn,1500,N00031011,0,1,0
3,2014,2013-09-06,C00095869,Edison Electric Institute,2000,N00029835,1,0,0
4,2014,2014-05-28,C00412122,Students for a New American Politics,26,N00035660,0,0,1
...,...,...,...,...,...,...,...,...,...
1710476,2004,2003-12-12,C00035774,Magazine Publishers of America,1000,N00002045,1,0,0
1710477,2004,2004-03-11,C00035774,Magazine Publishers of America,2500,N00005892,1,0,0
1710478,2004,2004-09-14,C00035774,Magazine Publishers of America,2000,N00000010,1,0,0
1710479,2004,2004-11-03,C00035774,Magazine Publishers of America,500,N00000491,1,0,0


### Add Candidate Name Lookup Function

In [15]:
# Reload the candidate id -> name lookup table from disk and display it.
cid_lookup = pd.read_csv('../data/cid_lookup.csv')
cid_lookup

Unnamed: 0,id,candidate_name
0,N00005009,Richmond A Soluade Sr
1,N00000948,Lenora B Fulani
2,N00004126,Mark Alan Behnke
3,N00004451,Stephen Bonsal Young
4,N00001670,Raymond J Clatworthy
...,...,...
26042,N00011680,Helen Ann Mrs Garrels
26043,N00012405,James B Hunt Jr
26044,N00010938,Victor Scott Tiffany
26045,N00037568,Sean Guthrie


In [16]:
# Convert candidate id lookup table to a dictionary
def DF_to_dict(df):
    """Build a dict from the first two columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column (by position) holds the keys and whose
        second column holds the values.

    Returns
    -------
    dict
        Mapping of first-column entries to second-column entries. When a key
        occurs more than once, the value from the last such row wins. A frame
        with no rows yields an empty dict.
    """
    # Guard first: a frame with no rows may also have no columns to index.
    if len(df) == 0:
        return {}
    return dict(zip(df.iloc[:, 0], df.iloc[:, 1]))

# Return corresponding candidate name per ID reference
def id_lookup(cid):
    """Return the candidate name stored for ``cid`` in ``cid_dict``.

    ``cid_dict`` is a module-level dict that must be defined before this
    function is called. Ids that are not present yield ``np.nan``.
    """
    # A single `.get` replaces the separate membership test and indexing.
    return cid_dict.get(cid, np.nan)

In [17]:
# Build the id -> candidate name dictionary that `id_lookup` reads.
cid_dict = DF_to_dict(cid_lookup)

In [18]:
# Resolve every donation's candidate id to a candidate name; ids with no
# entry in the lookup come back as NaN.
all_pacs['candidate_name'] = all_pacs['candidate_id'].map(id_lookup)

In [19]:
# Display the donations whose candidate name could not be resolved.
all_pacs.loc[all_pacs['candidate_name'].isna()]

Unnamed: 0,cycle,date_donated,pac_id,pac_name,amount,candidate_id,business,labor,issues,candidate_name
7853,2014,2014-02-26,C00015933,Arizona Public Service Co,25000,,1,0,0,
38714,2014,2014-09-08,C00004275,American Bankers Assn,5000,N00024660,1,0,0,
148505,2014,2013-05-06,C00390575,WellCare Group,13500,,1,0,0,
148598,2014,2014-06-03,C00331694,America Works,25000,,0,0,1,
171731,2014,2014-10-02,C00001198,American Hotel & Motel Assn,1000,N00024660,1,0,0,
...,...,...,...,...,...,...,...,...,...,...
1682399,2004,2003-01-22,C00362624,California Rice Industry Assn Fund,1000,,1,0,0,
1695877,2004,2003-04-12,C00327577,Gateway 2000,500,,1,0,0,
1705446,2004,2003-06-23,C00363879,Entergy Corp,1000,,1,0,0,
1706857,2004,2003-04-10,C00035535,Consumer Bankers Assn,1000,,1,0,0,


In [20]:
# Drop any donations with missing candidate ids
all_pacs = all_pacs.dropna()
all_pacs = all_pacs.reset_index()
all_pacs.drop(columns='index', inplace=True)

In [21]:
# Rows and columns remaining after incomplete rows were dropped.
all_pacs.shape

(1708982, 10)

---
# Recommender System

In [22]:
# Keep strictly positive donation amounts and only the three columns needed
# to relate PACs to candidates.
pac_donations = all_pacs.loc[all_pacs['amount'] > 0, ['pac_name', 'candidate_name', 'amount']]
pac_donations

Unnamed: 0,pac_name,candidate_name,amount
0,America's Natural Gas Alliance,Mike Kelly,1000
1,National Assn of Realtors,Sam Graves,1000
2,International Longshoremens Assn,Bill Owens,1500
3,Edison Electric Institute,Martin Heinrich,2000
4,Students for a New American Politics,Shenna Bellows,26
...,...,...,...
1708977,Magazine Publishers of America,Tom Davis,1000
1708978,Magazine Publishers of America,Tom DeLay,2500
1708979,Magazine Publishers of America,Dan Burton,2000
1708980,Magazine Publishers of America,Susan Collins,500


In [23]:
# One row per PAC, one column per candidate. No `aggfunc` is given, so
# pandas' default aggregation combines multiple donations for the same
# PAC/candidate pair; pairs with no donation are NaN.
pivot = pac_donations.pivot_table(values='amount', index='pac_name', columns='candidate_name')
pivot

candidate_name,A J Sekhon,Aaron Miller,Aaron Russo,Aaron Schock,Aaron Woolf,Abel Maldonado,Abel Tapia,Ada M Fisher,Adam Charles Kokesh,Adam Clayton Powell IV,...,Yolly Roberson,York Jay Kleinhandler,Yvette D. Clarke,Yvonne Rayford Brown,Zach Dasher,Zach Wamp,Zachary T. Space,Zack Matheny,Zelma A Blakes,Zoe Lofgren
pac_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'96 Leadership PAC,,,,,,,,,,,...,,,,,,,,,,
1199 SEIU United Healthcare Workers East,,,,,,,,,,,...,,,,,,,,,,
1199SEIU United Healthcare Workers East,,,,,,,,,,,...,,,,,,,,,,
13th Colony Leadership Cmte,,,,,,,,,,,...,,,,,,,,,,
1776 PAC,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
athenahealth Inc,,,,,,,,,,,...,,,,,,,,,,
eBay Inc,,,,,,,,,,,...,,,,,,,,,,1737.65
eHealth Inc,,,,,,,,,,,...,,,,,,,,,,
icPurple,,,,,,,,,,,...,,,,,,,,,,


In [24]:
# Treat missing PAC/candidate pairs as 0 and store the matrix in CSR sparse
# format.
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))

In [25]:
# A distance of 1 is a similarity of 0.
dists = pairwise_distances(sparse_pivot, metric='cosine')
dists

array([[0.        , 1.        , 1.        , ..., 0.9611833 , 1.        ,
        0.93562625],
       [1.        , 0.        , 0.978394  , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.978394  , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.9611833 , 1.        , 1.        , ..., 0.        , 1.        ,
        0.98641729],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.93562625, 1.        , 1.        , ..., 0.98641729, 1.        ,
        0.        ]])

In [26]:
# Similarity is 1 - distance.
similarities = cosine_similarity(sparse_pivot)

In [27]:
# Label both axes of the distance matrix with PAC names so distances can be
# looked up by name.
recommender_df = pd.DataFrame(data=dists, index=pivot.index, columns=pivot.index)
recommender_df

pac_name,'96 Leadership PAC,1199 SEIU United Healthcare Workers East,1199SEIU United Healthcare Workers East,13th Colony Leadership Cmte,1776 PAC,1789 Project PAC,18-29 PAC,1911 United,19th Star PAC,1Point Administrative Services,...,Zia PAC,Zimmer Inc,Zions Bancorp,Zoetis Inc,Zurich Insurance,athenahealth Inc,eBay Inc,eHealth Inc,icPurple,vRide Inc
pac_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'96 Leadership PAC,0.000000,1.000000,1.000000,0.616604,0.988748,0.914169,1.0,1.000000,0.971039,1.000000,...,0.670028,0.877876,0.971536,0.966724,0.734847,1.0,0.832980,0.961183,1.0,0.935626
1199 SEIU United Healthcare Workers East,1.000000,0.000000,0.978394,1.000000,1.000000,1.000000,1.0,0.725877,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,0.977995,1.0,0.971609,1.000000,1.0,1.000000
1199SEIU United Healthcare Workers East,1.000000,0.978394,0.000000,1.000000,1.000000,1.000000,1.0,0.921181,1.000000,1.000000,...,1.000000,0.904119,1.000000,1.000000,1.000000,1.0,0.890197,1.000000,1.0,1.000000
13th Colony Leadership Cmte,0.616604,1.000000,1.000000,0.000000,0.927444,1.000000,1.0,1.000000,0.984600,1.000000,...,0.731552,0.904018,0.949394,0.976052,0.837335,1.0,0.870634,0.983705,1.0,0.944208
1776 PAC,0.988748,1.000000,1.000000,0.927444,0.000000,1.000000,1.0,1.000000,0.955766,1.000000,...,1.000000,0.938795,0.851505,1.000000,0.929293,1.0,0.878848,0.965107,1.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
athenahealth Inc,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.0,1.000000,1.000000,1.0,1.000000
eBay Inc,0.832980,0.971609,0.890197,0.870634,0.878848,1.000000,1.0,0.955133,0.885475,0.955133,...,0.891767,0.802134,0.785071,0.871774,0.546528,1.0,0.000000,0.767555,1.0,0.923945
eHealth Inc,0.961183,1.000000,1.000000,0.983705,0.965107,1.000000,1.0,1.000000,0.953694,1.000000,...,1.000000,0.821072,0.945918,0.956469,0.839399,1.0,0.767555,0.000000,1.0,0.986417
icPurple,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,0.978179,1.0,1.000000,1.000000,0.0,1.000000


In [30]:
# Text to search for in PAC names (`str.contains` treats it as a regular
# expression by default).
q = 'Electric'

# Use a dedicated name for the matches. Rebinding `pacs` here would replace
# the raw donations DataFrame and break earlier cells if they are re-run.
matching_pacs = pivot.index[pivot.index.str.contains(q)]

for pac in matching_pacs[:5]:
    # Row lookup avoids transposing the whole pivot on every iteration.
    pac_row = pivot.loc[pac]
    print(pac)
    # Mean and count are taken over the non-missing candidate cells of the row.
    print('Average amount donated $', round(pac_row.mean(), 2))
    print('Number of donations', pac_row.count())
    print('')
    print('10 most similar orgs:\n')
    # Smallest cosine distance first. Drop the PAC itself explicitly rather
    # than assuming it sorts into position 0 (other PACs can tie at 0).
    similar_orgs = recommender_df[pac].drop(pac).sort_values().index.tolist()
    for i, org in enumerate(similar_orgs[:10]):
        print(f"{i+1}. {org}")
    print('')
    print('*******************************************************************************************')
    print('')

Aiken Electric Cooperative
Average amount donated $ 1363.89
Number of donations 6

10 most similar orgs:

1. First Citizens Bancorporation
2. Colonial Life & Accident Insurance
3. Force Protection Inc
4. Inman Mills
5. BNFL Inc
6. Nuclear Fuel Services
7. Sc Peanut Growers Assoc Pac
8. South Carolina Credit Union League
9. West Main Street Values
10. NBSC Corp

*******************************************************************************************

American Electric Power
Average amount donated $ 1302.65
Number of donations 415

10 most similar orgs:

1. Dominion Resources
2. Edison Electric Institute
3. Nuclear Energy Institute
4. Entergy Corp
5. American Gas Assn
6. DTE Energy
7. SBC Communications
8. General Electric
9. National Beer Wholesalers Assn
10. Energy Future Holdings Corp

*******************************************************************************************

Baltimore Gas & Electric
Average amount donated $ 1325.87
Number of donations 267

10 most similar orgs:

