In [1]:
%%time

import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime

PRES_CMTES = ['BENNET FOR AMERICA','BIDEN FOR PRESIDENT','BULLOCK FOR PRESIDENT','PETE FOR AMERICA, INC.','KAMALA HARRIS FOR THE PEOPLE','AMY KLOBUCHAR VICTORY COMMITTEE','WAYNE MESSAM FOR AMERICA, INC.','TIM RYAN FOR AMERICA','WARREN FOR PRESIDENT, INC.','MARIANNE WILLIAMSON FOR PRESIDENT','FRIENDS OF ANDREW YANG','CORY 2020','JULIAN FOR THE FUTURE','FRIENDS OF JOHN DELANEY','TULSI NOW','BETO FOR AMERICA','BERNIE 2020','JOE SESTAK FOR PRESIDENT','TOM STEYER 2020']
CORE_COLUMNS = ['CMTE_NM','RPT_TP','NAME','CITY','STATE','ZIP_CODE','TRANSACTION_DT','TRANSACTION_AMT','PURPOSE','CATEGORY']


# Read in data
df_header = pd.read_csv("~/Documents/Insight/Data - Insight/oppexp_header_file.csv")
df = pd.read_csv("~/Documents/Insight/Data - Insight/oppexp20/oppexp.txt", delimiter="|", low_memory=False, names=df_header.columns, index_col=False)
cmte_header = pd.read_csv("~/Documents/Insight/Data - Insight/cm20/cm_header_file.csv") # pull in header for committee names file
cmte_names = pd.read_csv("~/Documents/Insight/Data - Insight/cm20/cm.txt", delimiter="|", names=cmte_header.columns) # pull in committee names file

# Clean data
df = ( df.set_index('CMTE_ID') ).join(cmte_names.set_index('CMTE_ID')) 
df = df.reset_index()
df['TRANSACTION_DT']=pd.to_datetime(df['TRANSACTION_DT'],errors = 'coerce')
df = df[df['TRANSACTION_AMT']>0]

print('timed')

timed
Wall time: 8.84 s


In [2]:
%%time
df = df[df.apply(lambda x: x.CMTE_NM in PRES_CMTES, axis=1)]

print('timed')

timed
Wall time: 8.04 s


In [21]:
# %%time
# df = df[df.apply(lambda x: 'ACTBLUE' not in x.CMTE_NM, axis=1)]

# print('timed')

timed
Wall time: 885 ms


In [22]:
len(df)

41223

In [4]:
candidates = ['trump','cruz','kasich','rubio','carson','bush','paul','christie','huckabee','fiorina','gilmore','santorum','perry','walker','jindal','graham','pataki']
cmte_names = df.CMTE_NM.unique()
    
for candidate in candidates:
    for name in cmte_names:
        if candidate.upper() in str(name):
            print(name)

In [5]:
candidates = ['bennet','biden','booker','bullock','buttigieg','pete','castro','delaney','gabbard','harris','klobuchar','messam','rourke','ryan','sanders','sestak','steyer','warren','williamson','yang','president','for america','2020','julian for the future']
cmte_names = df.CMTE_NM.unique()
    
for candidate in candidates:
    for name in cmte_names:
        if candidate.upper() in str(name):
            print(name)

BENNET FOR AMERICA
BIDEN FOR PRESIDENT
BULLOCK FOR PRESIDENT
PETE FOR AMERICA, INC.
FRIENDS OF JOHN DELANEY
KAMALA HARRIS FOR THE PEOPLE
AMY KLOBUCHAR VICTORY COMMITTEE
WAYNE MESSAM FOR AMERICA, INC.
TIM RYAN FOR AMERICA
WARREN FOR PRESIDENT, INC.
MARIANNE WILLIAMSON FOR PRESIDENT
FRIENDS OF ANDREW YANG
WARREN FOR PRESIDENT, INC.
MARIANNE WILLIAMSON FOR PRESIDENT
BIDEN FOR PRESIDENT
BULLOCK FOR PRESIDENT
PETE FOR AMERICA, INC.
BETO FOR AMERICA
WAYNE MESSAM FOR AMERICA, INC.
TIM RYAN FOR AMERICA
BENNET FOR AMERICA
CORY 2020
BERNIE 2020
JULIAN FOR THE FUTURE


In [6]:
cmte_names = df.CMTE_NM.unique()

list = []
for candidate in PRES_CMTES:
    for name in cmte_names:
        if candidate.upper() in str(name):
            print(name)
            list.append(candidate)


BENNET FOR AMERICA
BIDEN FOR PRESIDENT
BULLOCK FOR PRESIDENT
PETE FOR AMERICA, INC.
KAMALA HARRIS FOR THE PEOPLE
AMY KLOBUCHAR VICTORY COMMITTEE
WAYNE MESSAM FOR AMERICA, INC.
TIM RYAN FOR AMERICA
WARREN FOR PRESIDENT, INC.
MARIANNE WILLIAMSON FOR PRESIDENT
FRIENDS OF ANDREW YANG
CORY 2020
JULIAN FOR THE FUTURE
FRIENDS OF JOHN DELANEY
TULSI NOW
BETO FOR AMERICA
BERNIE 2020


In [7]:
len(list)
len(PRES_CMTES)
for candidate in PRES_CMTES:
    if candidate not in list:
        print(candidate)

JOE SESTAK FOR PRESIDENT
TOM STEYER 2020


In [23]:
%%time

from sklearn.cluster import DBSCAN
import numpy as np

clustering = DBSCAN(eps=30, min_samples=100).fit_predict(df[['TRANSACTION_AMT']])
#clustering.labels_

print(clustering) 

[0 0 0 ... 0 0 0]
Wall time: 2.5 s


In [24]:
max(clustering)
sum(clustering==-1)
sum(clustering==-1)/42000

0.08023809523809523

In [25]:
outliers = df[clustering==-1]

In [26]:
outliers = outliers.sort_values('TRANSACTION_DT')
outliers.head()

Unnamed: 0,CMTE_ID,AMNDT_IND,RPT_YR,RPT_TP,IMAGE_NUM,LINE_NUM,FORM_TP_CD,SCHED_TP_CD,NAME,CITY,...,CMTE_CITY,CMTE_ST,CMTE_ZIP,CMTE_DSGN,CMTE_TP,CMTE_PTY_AFFILIATION,CMTE_FILING_FREQ,ORG_TP,CONNECTED_ORG_NM,CAND_ID
282478,C00659938,A,2019,Q1,201907159151157829,23,F3P,SB,FINANCIAL INNOVATIONS INC,CRANSTON,...,NEW YORK,NY,10018,P,P,DEM,Q,,,P00006486
331632,C00696054,A,2019,Q1,201907229151624482,23,F3P,SB,ACTBLUE TECHNICAL SERVICES,SOMERVILLE,...,SACRAMENTO,CA,95815,P,P,DEM,Q,,,P00009910
331646,C00696054,A,2019,Q1,201907229151624596,23,F3P,SB,"NGP VAN, INC.",PITTSBURGH,...,SACRAMENTO,CA,95815,P,P,DEM,Q,,,P00009910
331642,C00696054,A,2019,Q1,201907229151624563,23,F3P,SB,"HE LIKES IT, LLC",HILLSBOROUGH,...,SACRAMENTO,CA,95815,P,P,DEM,Q,,,P00009910
331647,C00696054,A,2019,Q1,201907229151624596,23,F3P,SB,"NGP VAN, INC.",PITTSBURGH,...,SACRAMENTO,CA,95815,P,P,DEM,Q,,,P00009910


In [27]:
outliers[CORE_COLUMNS].head()

Unnamed: 0,CMTE_NM,RPT_TP,NAME,CITY,STATE,ZIP_CODE,TRANSACTION_DT,TRANSACTION_AMT,PURPOSE,CATEGORY
282478,FRIENDS OF ANDREW YANG,Q1,FINANCIAL INNOVATIONS INC,CRANSTON,RI,29104009,2018-11-16,8825.86,MERCHANDISE,
331632,MARIANNE WILLIAMSON FOR PRESIDENT,Q1,ACTBLUE TECHNICAL SERVICES,SOMERVILLE,MA,2144,2018-11-18,8340.75,MERCHANT FEES,1.0
331646,MARIANNE WILLIAMSON FOR PRESIDENT,Q1,"NGP VAN, INC.",PITTSBURGH,PA,15251,2018-12-04,22425.0,ONLINE SOFTWARE,1.0
331642,MARIANNE WILLIAMSON FOR PRESIDENT,Q1,"HE LIKES IT, LLC",HILLSBOROUGH,NY,8844,2018-12-05,6800.0,WEBSITE,1.0
331647,MARIANNE WILLIAMSON FOR PRESIDENT,Q1,"NGP VAN, INC.",PITTSBURGH,PA,15251,2018-12-05,3750.0,ONLINE SOFTWARE,1.0


In [20]:
grouped = outliers[['TRANSACTION_AMT','CMTE_NM']].groupby('CMTE_NM')
grouped.count().sort_values('TRANSACTION_AMT')
grouped.sum().sort_values('TRANSACTION_AMT')

Unnamed: 0_level_0,TRANSACTION_AMT
CMTE_NM,Unnamed: 1_level_1
"WAYNE MESSAM FOR AMERICA, INC.",19047.66
TIM RYAN FOR AMERICA,411595.26
BULLOCK FOR PRESIDENT,463871.18
BENNET FOR AMERICA,1178916.61
MARIANNE WILLIAMSON FOR PRESIDENT,2191716.71
TULSI NOW,2338993.19
FRIENDS OF JOHN DELANEY,2736297.27
JULIAN FOR THE FUTURE,2779841.57
FRIENDS OF ANDREW YANG,3966807.18
BETO FOR AMERICA,7685071.33
