In [1]:
import sys

sys.path.append("../..")
import pandas as pd
from src.data_loaders.rollcall import (
    get_raw_individual_votes, 
    get_raw_party_membership, 
    get_individual_votes_with_party_enriched, 
    get_individual_votes_with_party_enriched,
    get_trainig_data_v4,
    get_trainig_data_v3
)

In [15]:
df = get_raw_party_membership()
df_dedupe = df[["icpsr", "congress"]].drop_duplicates().sort_values(["icpsr", "congress"])
df["terms_served"] = (df_dedupe.groupby("icpsr")["congress"].cumcount() + 1).to_frame("terms_served")
df["terms_served"].value_counts().head()

terms_served
1.0    12667
2.0     8967
3.0     6452
4.0     4789
5.0     3799
Name: count, dtype: int64

In [16]:
df.columns

Index(['congress', 'chamber', 'icpsr', 'state_icpsr', 'district_code',
       'state_abbrev', 'party_code', 'occupancy', 'last_means', 'bioname',
       'bioguide_id', 'born', 'died', 'nominate_dim1', 'nominate_dim2',
       'nominate_log_likelihood', 'nominate_geo_mean_probability',
       'nominate_number_of_votes', 'nominate_number_of_errors', 'conditional',
       'nokken_poole_dim1', 'nokken_poole_dim2', 'terms_served'],
      dtype='object')

In [3]:
# Confirmed that John Dingell was the longest serving congresssman in the house
df.loc[df["terms_served"] == df["terms_served"].max()].T.loc["bioname"]

47387    DINGELL, John David, Jr.
Name: bioname, dtype: object

In [4]:
house_df = df.loc[df["chamber"] == "House"]
district_count = house_df.groupby(["state_abbrev", "state_icpsr", "congress"])["district_code"].nunique().to_frame('num_reps')
total_district_count = district_count.groupby("congress")["num_reps"].sum().to_frame('num_reps_total')
count_df = district_count.join(total_district_count)

count_df["pct_pop"] = count_df["num_reps"] / count_df["num_reps_total"]
count_df.xs(118, level="congress").sort_values("pct_pop", ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_reps,num_reps_total,pct_pop
state_abbrev,state_icpsr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,71,52,441,0.117914
TX,49,38,441,0.086168
FL,43,28,441,0.063492
NY,13,26,441,0.058957
IL,21,17,441,0.038549


In [5]:
df = df.set_index(["state_icpsr", "congress"]).join(count_df.droplevel('state_abbrev')).reset_index()


In [10]:
df = get_individual_votes_with_party_enriched()
df.head()

Loading cached data from /Users/declannelson/Desktop/columbia/stat5241/stat5241_team_7/src/data/individual_votes_8.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,date,bill_number,vote_passed,vote_type_amend,vote_type_cloture,vote_type_concur,vote_type_conference,vote_type_pass,vote_type_recommit,vote_type_suspend,...,crs_policy_area_taxation,crs_policy_area_transportation_and_public_works,crs_policy_area_water_resources_development,vote_for,d,r,state_icpsr,state_abbrev,terms_served,pct_pop
congress,session,chamber,rollnumber,icpsr,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
101,2.0,0,371,633.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,0,1,True,False,43,FL,21.0,
101,2.0,0,371,1077.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,0,1,True,False,49,TX,19.0,
101,2.0,0,371,1087.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,0,1,False,True,23,MI,17.0,
101,2.0,0,371,2009.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,0,1,False,True,3,MA,16.0,
101,2.0,0,371,2605.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,0,1,True,False,23,MI,18.0,


In [13]:
prior_votes = df.sort_index().groupby(["congress", "rollnumber", "icpsr"])["vote_for"].cumcount().to_frame("prior_votes_for_bill")
prior_votes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,prior_votes_for_bill
congress,session,chamber,rollnumber,icpsr,Unnamed: 5_level_1
101,1.0,1,11,660.0,0
101,1.0,1,11,1252.0,0
101,1.0,1,11,1366.0,0
101,1.0,1,11,4812.0,0
101,1.0,1,11,9369.0,0


In [14]:
df.join(prior_votes)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,date,bill_number,vote_passed,vote_type_amend,vote_type_cloture,vote_type_concur,vote_type_conference,vote_type_pass,vote_type_recommit,vote_type_suspend,...,crs_policy_area_transportation_and_public_works,crs_policy_area_water_resources_development,vote_for,d,r,state_icpsr,state_abbrev,terms_served,pct_pop,prior_votes_for_bill
congress,session,chamber,rollnumber,icpsr,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
101,2.0,0,371,633.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,1,True,False,43,FL,21.0,,0
101,2.0,0,371,1077.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,1,True,False,49,TX,19.0,,0
101,2.0,0,371,1087.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,1,False,True,23,MI,17.0,,0
101,2.0,0,371,2009.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,1,False,True,3,MA,16.0,,0
101,2.0,0,371,2605.0,1990-01-24,hr2712,1,0,0,0,0,0,0,0,...,0,0,1,True,False,23,MI,18.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2.0,1,350,49308.0,2018-01-29,s2311,0,0,1,0,0,0,0,0,...,0,0,0,True,False,73,WA,13.0,,0
115,2.0,1,350,49703.0,2018-01-29,s2311,0,0,1,0,0,0,0,0,...,0,0,0,False,True,2,ME,11.0,,0
115,2.0,1,350,49706.0,2018-01-29,s2311,0,0,1,0,0,0,0,0,...,0,0,1,False,True,68,WY,11.0,,0
115,2.0,1,350,94659.0,2018-01-29,s2311,0,0,1,0,0,0,0,0,...,0,0,1,False,True,41,AL,12.0,,0


In [2]:
features, target_col, df = get_trainig_data_v3()

Loading cached data from /Users/declannelson/Desktop/columbia/stat5241/stat5241_team_7/src/data/individual_votes_9.parquet


In [3]:
df.isna().sum()

vote_for                                                      0
icpsr                                                         0
congress                                                      0
chamber                                                       0
session                                                       0
d                                                             0
r                                                             0
terms_served                                                  0
pct_pop                                                       0
prior_votes_for_bill                                          0
vote_type_amend                                               0
vote_type_cloture                                             0
vote_type_concur                                              0
vote_type_conference                                          0
vote_type_pass                                                0
vote_type_recommit                      

In [4]:
df = pd.read_csv("../data/dime_recipients_1979_2024.csv")

  df = pd.read_csv("../data/dime_recipients_1979_2024.csv")


In [5]:
df.shape

(479502, 64)

In [6]:
df.columns

Index(['election', 'cycle', 'fecyear', 'bonica.rid', 'bonica.cid', 'name',
       'lname', 'ffname', 'fname', 'mname', 'title', 'suffix', 'party',
       'state', 'seat', 'district', 'distcyc', 'ico.status', 'cand.gender',
       'recipient.cfscore', 'recipient.cfscore.dyn', 'contributor.cfscore',
       'dwdime', 'dwnom1', 'dwnom2', 'ps.dwnom1', 'ps.dwnom2', 'irt.cfscore',
       'composite.score', 'num.givers', 'num.givers.total', 'total.receipts',
       'total.disbursements', 'total.indiv.contribs', 'total.unitemized',
       'total.pac.contribs', 'total.party.contribs',
       'total.contribs.from.candidate', 'ind.exp.support', 'ind.exp.oppose',
       'prim.vote.pct', 'pwinner', 'gen.vote.pct', 'gwinner', 's.elec.stat',
       'r.elec.stat', 'district.pres.vs', 'fec.cand.status', 'recipient.type',
       'igcat', 'comtype', 'ICPSR', 'ICPSR2', 'Cand.ID', 'FEC.ID', 'NID',
       'before.switch.ICPSR', 'after.switch.ICPSR', 'party.orig',
       'nimsp.party', 'nimsp.candidate.ICO.co

In [21]:
df.sort_values("cycle").tail(1000).to_csv("../data/dime_example.csv")

In [18]:
df

Unnamed: 0,election,cycle,fecyear,bonica.rid,bonica.cid,name,lname,ffname,fname,mname,...,FEC.ID,NID,before.switch.ICPSR,after.switch.ICPSR,party.orig,nimsp.party,nimsp.candidate.ICO.code,nimsp.district,nimsp.office,nimsp.candidate.status
0,fd1980,1980,1980.0,cand2,2.262643e+09,"whitten, jamie lloyd",whitten,jamie lloyd,jamie,lloyd,...,C00094912,N00003287,,,100.0,,,,,
1,fd1982,1982,1982.0,cand2,2.262643e+09,"whitten, jamie lloyd",whitten,jamie lloyd,jamie,lloyd,...,C00094912,N00003287,,,100.0,,,,,
2,fd1984,1984,1984.0,cand2,2.262643e+09,"whitten, jamie lloyd",whitten,jamie lloyd,jamie,lloyd,...,C00094912,N00003287,,,100.0,,,,,
3,fd1986,1986,1986.0,cand2,2.262643e+09,"whitten, jamie lloyd",whitten,jamie lloyd,jamie,lloyd,...,C00094912,N00003287,,,100.0,,,,,
4,fd1988,1988,1988.0,cand2,2.262643e+09,"whitten, jamie lloyd",whitten,jamie lloyd,jamie,lloyd,...,C00094912,N00003287,,,100.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479497,OH1998,1998,1998.0,cand95076,1.000342e+08,"petro, jim",petro,jim,jim,,...,,,,,200,REPUBLICAN,I,SW,auditor,Won
479498,OH2006,2006,2006.0,cand95348,5.000004e+15,"sykes, barbara",sykes,barbara,barbara,,...,,,,,100,DEMOCRAT,IO,SW,auditor,Lost - General Election
479499,OH2006,2006,2006.0,cand95395,5.000004e+15,"taylor, mary",taylor,mary,mary,,...,,,,,200,REPUBLICAN,IO,SW,auditor,Won
479500,OH2008,2008,2008.0,cand95395,5.000004e+15,"taylor, mary",taylor,mary,mary,,...,,,,,200,REPUBLICAN,I,SW,auditor,Not Up For Election


In [22]:
df["recipient.type"].unique()

array(['cand', 'comm'], dtype=object)