In [24]:
import os
import json
import pandas as pd
import numpy as np
from multiprocessing import Pool
import time
from legiscan import LegiScan
from legiscan import LegiScanError
from datetime import datetime
from tqdm import tqdm
tqdm.pandas(desc="Processing rows")

import gzip


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Path to the alignments CSV that the chunked read_csv loops below consume
# (columns used: left_id, right_id, score, adjusted_alignment_score).
file_path = "./Data/alignments_notext.csv"

In [17]:
chunk_size = 10000000  # rows per chunk passed to pd.read_csv(chunksize=...); NOTE: reassigned to 1000000 in the sampling cell further down



def process_chunk(chunk):
    '''Aggregate one chunk of alignment rows by (left state, right state).

    The state of each bill is taken to be the first two characters of its
    'left_id' / 'right_id'. The input chunk is NOT modified: the state keys
    are derived as separate Series instead of being written back as columns.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain 'left_id', 'right_id', 'score' and
        'adjusted_alignment_score'.

    Returns
    -------
    pandas.DataFrame
        One row per state pair present in the chunk, with columns
        'left_state', 'right_state', and '<col>_mean', '<col>_sum',
        '<col>_median' for both 'score' and 'adjusted_alignment_score'.
    '''
    # Derive the grouping keys without adding columns to the caller's frame.
    left_state = chunk['left_id'].str[:2].rename('left_state')
    right_state = chunk['right_id'].str[:2].rename('right_state')

    grouped = (
        chunk.groupby([left_state, right_state])
        .agg({
            'score': ['mean', 'sum', 'median'],
            'adjusted_alignment_score': ['mean', 'sum', 'median'],
        })
        .reset_index()
    )

    # Flatten the (column, statistic) MultiIndex into 'column_statistic';
    # the key columns have an empty second level and keep their plain name.
    grouped.columns = ['_'.join(col).strip() if col[1] else col[0] for col in grouped.columns.values]

    return grouped

# Aggregate the alignments CSV chunk by chunk. Per-chunk results are collected
# in a list and concatenated once, instead of re-copying a growing DataFrame
# on every iteration.
chunk_aggregates = []

# begin processing chunks; i ends up as the number of chunks read
i = 0
for i, chunk in enumerate(
    pd.read_csv(
        file_path,
        usecols=['left_id', 'right_id', 'score', 'adjusted_alignment_score'],
        chunksize=chunk_size,
    ),
    start=1,
):
    chunk_aggregates.append(process_chunk(chunk))
    print(i)

aggregated_df = pd.concat(chunk_aggregates, ignore_index=True) if chunk_aggregates else pd.DataFrame()

# Final aggregation across chunks.
# NOTE(review): only the *_sum columns are exact. *_mean is an unweighted mean
# of per-chunk means (chunks contributing few rows for a state pair count as
# much as chunks contributing many) and *_median is a median of per-chunk
# medians. Exact values would need per-chunk group counts (for the mean) or
# the full data (for the median) -- confirm the approximation is acceptable.
final_results = aggregated_df.groupby(['left_state', 'right_state']).agg({
    'score_mean': 'mean',
    'score_sum': 'sum',
    'score_median': 'median',
    'adjusted_alignment_score_mean': 'mean',
    'adjusted_alignment_score_sum': 'sum',
    'adjusted_alignment_score_median': 'median'
}).reset_index()


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


In [20]:
# final_results.to_excel('./Data/aggregate_alignments.xlsx', index = False)

In [123]:
final_results.head(2)

Unnamed: 0,left_state,right_state,score_mean,score_sum,score_median,adjusted_alignment_score_mean,adjusted_alignment_score_sum,adjusted_alignment_score_median
0,ak,al,22.410536,108367.0,21.0,12.867313,62151.8949,10.4475
1,ak,ar,22.692757,63621.0,22.0,10.757733,30178.1415,8.888


In [29]:
def read_json(file_path, progress_every=10000):
    '''Build a (state, year) table from a JSON-lines file of bills.

    The file is read line by line, one JSON document per line. A bill is kept
    only if at least one of 'bill_document_first' / 'bill_document_last' is
    not None. For each kept bill the 'state' value and the year of
    'date_created' (format '%Y-%m-%d %H:%M:%S') are recorded; a missing or
    empty value is recorded as NaN.

    Lines that cannot be used are reported and skipped instead of aborting
    the whole read: blank lines are skipped silently, lines that are not
    valid JSON or not a JSON object print a message, and a 'date_created'
    that does not match the expected format prints a message and yields a
    NaN year for that bill.

    Parameters
    ----------
    file_path : str
        Path to the JSON-lines file.
    progress_every : int, optional
        Print a progress/timing message each time this many bills have been
        kept (default 10000). A falsy value disables progress output.

    Returns
    -------
    pandas.DataFrame
        Columns 'state' and 'year', one row per kept bill, in file order.
    '''
    year_col = []
    state_col = []
    line_count = 0  # number of bills kept so far (not raw lines read)
    start = time.time()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank separators / trailing newline are not data errors.
                continue

            try:
                bill = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue

            if not isinstance(bill, dict):
                # Valid JSON but not an object, so there are no fields to read.
                print(f"Skipping non-object JSON record: {line[:80]!r}")
                continue

            if (bill.get('bill_document_first') is None) and (bill.get('bill_document_last') is None):
                continue

            # Year of creation; NaN when the date is missing or unparseable.
            year = np.nan
            date_created_str = bill.get('date_created')
            if date_created_str:
                try:
                    year = datetime.strptime(date_created_str, '%Y-%m-%d %H:%M:%S').year
                except (ValueError, TypeError) as e:
                    print(f"Error parsing date_created {date_created_str!r}: {e}")

            state = bill.get('state')
            state_name = state if state else np.nan

            year_col.append(year)
            state_col.append(state_name)
            line_count += 1

            if progress_every and line_count % progress_every == 0:
                print(line_count)
                end = time.time()
                time_taken = (end - start) / 60
                print('Time taken to process %s lines %s minutes' % (progress_every, time_taken))
                start = time.time()

    state_year = pd.DataFrame({'state': state_col, 'year': year_col})

    return state_year

In [30]:
# Parse the raw bills file (read line by line by read_json) into a state/year frame.
state_year = read_json('./Data/state_bills.json')

10000
Time taken to process 10000 lines 0.013874638080596923 minutes
20000
Time taken to process 10000 lines 0.015050045649210612 minutes
30000
Time taken to process 10000 lines 0.0157191793123881 minutes
40000
Time taken to process 10000 lines 0.014189803600311279 minutes
50000
Time taken to process 10000 lines 0.013550794124603272 minutes
60000
Time taken to process 10000 lines 0.01651686429977417 minutes
70000
Time taken to process 10000 lines 0.02893095016479492 minutes
80000
Time taken to process 10000 lines 0.01797702709833781 minutes
90000
Time taken to process 10000 lines 0.015256174405415853 minutes
100000
Time taken to process 10000 lines 0.017199369271596272 minutes
110000
Time taken to process 10000 lines 0.01769241491953532 minutes
120000
Time taken to process 10000 lines 0.01642011006673177 minutes
130000
Time taken to process 10000 lines 0.0163647452990214 minutes
140000
Time taken to process 10000 lines 0.01787092685699463 minutes
150000
Time taken to process 10000 line

In [31]:
# Count the rows of state_year for each (state, year) pair; the resulting 'count'
# column is what this notebook uses as policy coverage.
grouped_count = state_year.groupby(['state', 'year']).size().reset_index(name='count')


In [124]:
grouped_count.head(2)

Unnamed: 0,state,year,count
0,ak,2011,403
1,ak,2012,232


In [33]:
# Persist the per-(state, year) counts; index=False keeps the row index out of the sheet.
grouped_count.to_excel('./Data/grouped_state_year.xlsx', index = False)

In [83]:
# Two-letter lower-case codes of every jurisdiction to sample from.
state_abbreviations = [
    'tn', 'nh', 'wi', 'md', 'sc', 'dc', 'ak', 'nm', 'ar', 'mt', 'ut',
    'sd', 'la', 'il', 'hi', 'or', 'vt', 'in', 'ok', 'ks', 'oh', 'nc',
    'ct', 'fl', 'az', 'mi', 'de', 'nd', 'pr', 'mn', 'ga', 'va', 'me',
    'wa', 'ne', 'wv', 'ny', 'al', 'nj', 'tx', 'ca', 'ky', 'wy', 'pa',
    'nv', 'ia', 'ma', 'id', 'ms', 'ri', 'mo',
]

# years = range(2008, 2016)
desired_samples_per_state_year = 10

# Tally of rows sampled so far, keyed by (state, year-as-string).
# Bill data is available for different years in different states, so only the
# (state, year) pairs that actually occur in grouped_count are tracked. With
# pairs that never occur, the targets could never all be met and the sampling
# loop would have to read the whole file; tracking real pairs lets it stop early.
samples_count = {}
for state in state_abbreviations:
    years_for_state = grouped_count.loc[grouped_count['state'] == state, 'year'].unique()
    for year in years_for_state:
        samples_count[(state, str(year))] = 0

# Accumulator for the sampled alignment rows.
samples_df = pd.DataFrame()


# Function to update our samples
def update_samples(chunk, samples_count, samples_df, samples_per_key=None):
    '''Top up the per-(state, year) samples from one chunk of alignment rows.

    For every key in `samples_count` that is still short of its target, rows
    of `chunk` whose 'left_id' starts with "<state>_<year>" are sampled (fixed
    random_state=1) until the target is met or the chunk has no more matches.
    Keys that stay short are topped up again when the next chunk is passed in.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Alignment rows; must contain a 'left_id' column.
    samples_count : dict
        Maps (state, year_string) -> number of rows sampled so far. Updated
        in place and also returned. Its keys define which pairs are sampled.
    samples_df : pandas.DataFrame
        Rows sampled from earlier chunks (may be empty).
    samples_per_key : int, optional
        Target number of rows per key. Defaults to the notebook-level
        `desired_samples_per_state_year`.

    Returns
    -------
    (dict, pandas.DataFrame)
        The updated counts and the accumulated samples (index reset whenever
        new rows were added).
    '''
    if samples_per_key is None:
        samples_per_key = desired_samples_per_state_year

    new_pieces = []
    for key, collected in samples_count.items():
        remaining_samples = samples_per_key - collected
        if remaining_samples <= 0:
            continue  # this pair already has enough rows

        state, year = key
        # na=False: rows with a missing left_id simply do not match, instead of
        # producing a NaN in the boolean mask.
        matches = chunk[chunk['left_id'].str.startswith(f"{state}_{year}", na=False)]
        if matches.empty:
            continue

        sampled_rows = matches.sample(min(len(matches), remaining_samples), random_state=1)  # Change seed if needed
        new_pieces.append(sampled_rows)
        # Only the value of an existing key changes, which is safe while iterating.
        samples_count[key] += len(sampled_rows)

    if new_pieces:
        # Single concat per chunk; an empty accumulator is left out of the list.
        frames = new_pieces if samples_df.empty else [samples_df, *new_pieces]
        samples_df = pd.concat(frames, ignore_index=True)

    return samples_count, samples_df

# File reading and sampling: stream the alignments CSV and keep topping up the
# per-(state, year) samples until every tracked pair has reached its target.
chunk_size = 1000000  # Adjust based on your memory constraints
i = 0  # number of chunks read so far
for i, chunk in enumerate(
    pd.read_csv(
        file_path,
        usecols=['left_id', 'right_id', 'score', 'adjusted_alignment_score'],
        chunksize=chunk_size,
    ),
    start=1,
):
    samples_count, samples_df = update_samples(chunk, samples_count, samples_df)
    print(i)
    print(samples_count)

    # Pairs still below the target; uses the configured target rather than a
    # hard-coded 10 so the report and the stop condition cannot disagree.
    jj = sum(1 for count in samples_count.values() if count < desired_samples_per_state_year)
    print('Less then required sample for state unique pairs remaining ' + str(jj))
    if jj == 0:
        break  # Exit loop if we've collected enough samples for each state-year combination

In [87]:
## extract bill ids
# For each side of the alignment, the state is the first two characters of the
# id and the bill id is whatever follows the last underscore.
for side in ('left', 'right'):
    raw_ids = samples_df[f'{side}_id']
    samples_df[f'{side}_bill_state'] = raw_ids.str.slice(0, 2)
    samples_df[f'{side}_bill_id'] = raw_ids.str.rsplit('_', n=1).str[-1]

In [122]:
samples_df.shape

(1480, 8)

In [115]:
# Save the sampled alignment rows.
# NOTE(review): unlike the grouped_count export, index=False is not passed here,
# so the row index is written as an extra column -- confirm this is intended.
samples_df.to_excel('./Data/sampled.xlsx')

In [110]:
## Build {bill_id: state} lookups for the unique left and right bills in the sample.
## The output is used to read only these bills from the raw bills data when
## finding the bills whose sponsor names are needed.
# NOTE(review): the lookups are keyed by bill id alone; if the same bill id can
# occur in more than one state, the last state seen wins -- confirm ids are unique.
dictionary_df = (
    samples_df[['left_bill_id', 'left_bill_state']]
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
left_bill_dictionary = dict(zip(dictionary_df['left_bill_id'], dictionary_df['left_bill_state']))

dictionary_df2 = (
    samples_df[['right_bill_id', 'right_bill_state']]
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
# Built from dictionary_df2's own rows. The previous loop iterated over
# len(dictionary_df) (the LEFT frame), which dropped right bills or raised
# KeyError whenever the two frames differed in length.
right_bill_dictionary = dict(zip(dictionary_df2['right_bill_id'], dictionary_df2['right_bill_state']))

In [113]:
import pickle

# Write each bill-id -> state lookup to its own pickle file under ./Data.
for pickle_path, bill_lookup in (
    ('./Data/left_dict.pkl', left_bill_dictionary),
    ('./Data/right_dict.pkl', right_bill_dictionary),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(bill_lookup, pickle_file)