# Challenge 8 Engine file

Ensure this file is in the same directory as Openstates_API_Autocaller.ipynb!

In [27]:
#Setup packages
import requests
import pandas as pd
from pandas import json_normalize
import json
import os
import time
from datetime import datetime
from dateutil.parser import parse 

In [28]:
'''
This is the main API call for getting legislation data from openstates. See link 
'https://v3.openstates.org/docs#/bills/bills_search_bills_get' for more information on adding parameters.
The function takes in a state abbreviation, a page number, an apikey from the first code block, and optionally a sponsor/people id, a session year, and a URL.

Notice: Votes and sponsorships are included params. Related_bills seems to be empty for most states.
These are placed in row level data as dictionaries that can be sorted further.
'''
def get_legislation_data(state, page_num, api, people_id = '', session_yr = '', url="https://v3.openstates.org/bills"):
    """
    Request one page of bills for a jurisdiction from the Open States bills endpoint.

    :param state: jurisdiction to query (sent as the ``jurisdiction`` parameter)
    :param page_num: page of results to request
    :param api: Open States API key
    :param people_id: optional sponsor id; sent as the ``sponsor`` filter only when non-empty
    :param session_yr: optional session identifier; sent as the ``session`` filter only when non-empty
    :param url: bills search endpoint
    :return: tuple ``(results, pagination)`` taken from the JSON payload, or ``None``
             when the payload lacks those keys (callers treat ``None`` as "no data")
    """
    params = {
        'apikey': api,
        'jurisdiction': state,
        'sort': 'latest_action_desc',
        'page': page_num,
        'per_page': 20,  # limit to 20 per page
        'include': ['votes', 'sponsorships', 'related_bills'],
    }
    # Optional filters are only sent when the caller supplied them, so the
    # default call is the same request as before.
    if people_id:
        params['sponsor'] = people_id
    if session_yr:
        params['session'] = session_yr

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, params=params, timeout=30)
    payload = response.json()

    try:
        return payload['results'], payload['pagination']
    except KeyError:
        # Payload without results/pagination (e.g. an error body): report and
        # hand back None explicitly.
        print("KeyError: Key results not found in dictionary")
        return None
    
        
    

In [29]:
'''
This simple function takes in a state abbreviation and returns a hard-coded value for 
the state's political party, based on 2021 data. Would recommend finding an API source
for this to keep the info up to date.

Purpose: to make a column with state party info to assist the model in determining
what bill subjects are likely or less likely to pass.
'''
# Hard-coded party label ("R" / "D") per two-letter state abbreviation.
# Built once at import time instead of on every lookup.
_STATE_PARTIES = {
    "AL": "R",
    "AK": "R",  # original note: works well but 124 pages
    "AZ": "D",
    "AR": "R",
    "CA": "D",
    "CO": "D",
    "CT": "D",
    "DE": "D",
    "FL": "R",
    "GA": "R",
    "HI": "D",
    "ID": "R",
    "IL": "D",
    "IN": "R",
    "IA": "R",
    "KS": "R",
    "KY": "R",
    "LA": "R",
    "ME": "D",
    "MD": "D",
    "MA": "D",
    "MI": "D",
    "MN": "D",
    "MS": "R",
    "MO": "R",
    "MT": "R",
    "NE": "R",
    "NV": "D",
    "NH": "D",
    "NJ": "D",
    "NM": "D",
    "NY": "D",
    "NC": "R",
    "ND": "R",
    "OH": "R",
    "OK": "R",
    "OR": "D",
    "PA": "D",
    "RI": "D",
    "SC": "R",
    "SD": "R",
    "TN": "R",
    "TX": "R",
    "UT": "R",
    "VT": "D",
    "VA": "D",
    "WA": "D",
    "WV": "R",
    "WI": "D",
    "WY": "R",
}


def state_party_finder(state):
    """
    Look up the hard-coded party label for a state abbreviation.

    :param state: two-letter state abbreviation, in any letter case
    :return: "R" or "D"; ``None`` (after printing a hint) when the value is
             not a known abbreviation or is not a string at all
    """
    try:
        return _STATE_PARTIES[state.upper()]
    except (KeyError, AttributeError):
        # KeyError: unknown abbreviation; AttributeError: non-string input.
        # Only these are handled so unrelated errors are no longer hidden.
        print('Please correct state abbreviation!')
        return None
        
    

In [30]:
'''
This function takes in an API key, a list of state abbreviations, a page range
(page_start up to, but not including, page_end) and an optional sponsor (person_id).
It returns a concatenated dataframe of the bills found on those pages for each
state/jurisdiction, with a column holding the state's party affiliation.
'''
def data_framer(api, state_list, page_start=1, page_end=10, people_id=''):
    """
    Download bill pages for every state and combine them into one DataFrame.

    :param api: Open States API key handed to get_legislation_data
    :param state_list: iterable of state abbreviations
    :param page_start: first page to request
    :param page_end: stop page (exclusive); pages ``range(page_start, page_end)`` are requested
    :param people_id: optional sponsor id forwarded to get_legislation_data
    :return: DataFrame of normalized bills with a 'state_party_affiliation'
             column, or an empty DataFrame when nothing could be fetched
    """
    frames = []

    for state in state_list:
        print('Grabbing data for state: ' + str(state))
        state_party = state_party_finder(state)
        announced_max = False

        for page in range(page_start, page_end):
            print('Page results: ' + str(page))

            response = get_legislation_data(state, page, api, people_id=people_id)
            if response is None:
                # get_legislation_data returns None when the payload carried no results.
                print('bill results empty, type error')
                continue
            bills_json, page_info = response

            try:
                max_page = page_info['max_page']
            except (KeyError, TypeError):
                # print when api limit is reached @250 calls per day
                print("Key results not found in dictionary, last page: " + str(page))
                continue

            # Reported from the first successful page, so no separate probe
            # request is spent per state.
            if not announced_max:
                print('Max pages for state ' + str(state) + ': ' + str(max_page))
                announced_max = True

            bills_df = pd.json_normalize(bills_json)
            if not bills_df.empty:
                # Tag this page's rows only; assigning on the combined frame
                # would overwrite the party of states collected earlier.
                bills_df['state_party_affiliation'] = state_party
                frames.append(bills_df)

            # Stop after keeping the final page rather than before it, so the
            # last page of results is not thrown away.
            if page >= max_page:
                print('max page reached! ending call')
                break

    if not frames:
        return pd.DataFrame()
    # One concat at the end avoids re-copying the accumulated rows per page.
    return pd.concat(frames, ignore_index=True)

In [31]:
'''
Due to the nature of the assignment, 
we want to remove any rows that have blank voting info as well as blank bill subject.

'''
def filter_df(data, col_name):
    """
    Return a copy of ``data`` without the rows whose ``col_name`` cell is blank.

    A cell is blank when it has length zero or has no length at all
    (``None`` / NaN), so missing values are dropped instead of raising.

    :param data: DataFrame to filter (left unmodified)
    :param col_name: column whose cells are checked
    :return: filtered copy with the original index labels kept; an empty
             DataFrame (after printing a notice) when the column cannot be read
    """
    def _is_blank(cell):
        try:
            return len(cell) == 0
        except TypeError:
            # None / NaN and other unsized values carry no vote or subject info.
            return True

    try:
        blank = data[col_name].map(_is_blank).astype(bool)
    except (KeyError, AttributeError, TypeError):
        # Column missing or input not a DataFrame; message kept from the
        # original because this usually means the API returned nothing.
        print('API Call limits Exceeded!')
        return pd.DataFrame()

    # Boolean mask instead of drop-by-label: rows that share an index label
    # with a blank row are no longer removed along with it.
    return data.loc[~blank].copy()


In [32]:
def lists_to_dataframe(*lists, columns=None):
    """
    Build a pandas DataFrame with one column per positional list.

    Columns are named ``list_1``, ``list_2``, ... in argument order unless
    replacement names are supplied.
    :param lists: one or more lists, each becoming a column
    :param columns: optional list of column names that replaces the defaults
    :return: a pandas DataFrame
    """
    frame = pd.DataFrame(
        {f"list_{position}": values for position, values in enumerate(lists, start=1)}
    )

    if columns is None:
        return frame

    frame.columns = columns
    return frame

In [33]:
from dateutil.parser import parse
'''
This function takes in a bill index and 
returns the most recent vote instance that took place for the bill.

It takes in the main legi_data dataframe and a specific bill index 
and returns the index of the most recent vote instance together with that instance's result.
'''
def vote_inst_sorter(data, bill_int):
    """
    Find the most recent vote instance of one bill.

    :param data: DataFrame with a 'votes' column; each cell is a list of vote
                 instances (dicts with at least 'start_date' and 'result')
    :param bill_int: positional row number of the bill
    :return: tuple ``(vote_instance_index, result)`` for the instance with the
             latest 'start_date'; on equal dates the earliest-listed instance wins
    :raises IndexError: when the bill has no vote instances
    """
    # Read-only access, so the frame is not copied on every call.
    votes = data.iloc[bill_int]['votes']
    if len(votes) == 0:
        raise IndexError('bill at position ' + str(bill_int) + ' has no vote instances')

    def _comparable_start(vote):
        # Timezone-aware values are shifted to naive UTC so they can be
        # ordered against date-only (naive) values without a TypeError.
        moment = parse(vote['start_date'])
        offset = moment.utcoffset()
        if offset is not None:
            moment = moment.replace(tzinfo=None) - offset
        return moment

    # Choose by the parsed datetime itself. Comparing a truncated
    # 'YYYY-MM-DD' string against the raw value never matched timestamps that
    # include a time part, which made the function return None.
    latest_inst = max(range(len(votes)), key=lambda idx: _comparable_start(votes[idx]))

    return latest_inst, votes[latest_inst]['result']
        

In [34]:
'''
Creates columns: 'vote_info', 'vote_counts', 'result'
1. gets a list of legislators that support and list that oppose
2. uses set() to get name list to avoid duplicate legislators
3. grabs only most recent vote instance of the bill to determine pass/fail result from 
- see vote_inst_sorter for sorting mechanism.     
'''
def unpack_votes(data):
    """
    Add 'vote_info', 'vote_counts' and 'result' columns built from each bill's
    most recent vote instance (chosen by vote_inst_sorter).

    :param data: DataFrame with a 'votes' column of vote-instance lists
                 (left unmodified; a copy is returned)
    :return: copy of ``data`` with three extra columns:
             'vote_info'   - dict of voter-name sets under 'support', 'oppose', 'other'
             'vote_counts' - dict of totals under 'yes', 'no', 'other'
             'result'      - 1 for 'pass', 0 for 'fail', NaN for anything else
    """
    df = data.copy()

    result_codes = {'pass': 1, 'fail': 0}
    dict_list = []
    cnt_list = []
    result_list = []

    for bill, bill_votes in enumerate(df['votes']):
        # position of the latest vote instance and its textual result
        vote_inst_ind, latest_vote_result = vote_inst_sorter(df, bill)
        latest_vote = bill_votes[vote_inst_ind]

        # float('nan') replaces np.nan: numpy is never imported in this file.
        result_list.append(result_codes.get(latest_vote_result, float('nan')))

        # Total the counts by their option label rather than by list position;
        # every label other than yes/no is folded into 'other'.
        cnt_dict = {'yes': 0, 'no': 0, 'other': 0}
        for count in latest_vote['counts']:
            option = count.get('option')
            bucket = option if option in ('yes', 'no') else 'other'
            cnt_dict[bucket] += count['value']
        cnt_list.append(cnt_dict)

        # Sets avoid listing the same legislator twice. Iterating the voter
        # list itself covers every voter (the old loop was bounded by the
        # number of keys in the vote-instance dict); an empty list simply
        # leaves the sets empty.
        vote_dict = {'support': set(), 'oppose': set(), 'other': set()}
        for ballot in latest_vote['votes']:
            option = ballot['option']
            voter_name = ballot['voter_name']
            if option == 'yes':
                vote_dict['support'].add(voter_name)
            elif option == 'no':
                vote_dict['oppose'].add(voter_name)
            else:
                vote_dict['other'].add(voter_name)
        dict_list.append(vote_dict)

    df['vote_info'] = dict_list
    df['vote_counts'] = cnt_list
    df['result'] = result_list

    return df


In [35]:
# Define a function to extract information from the lists and return a new dataframe
def extract_info(row):
    """
    Spread one row's 'subject' list over numbered columns.

    :param row: mapping/Series with an 'id' value and a 'subject' sequence
    :return: one-row DataFrame (index 0) holding 'id' followed by
             'bill_subject_1', 'bill_subject_2', ... one per subject entry
    """
    subjects = row['subject']

    # one numbered column per subject, in list order
    subject_columns = {
        f'bill_subject_{slot + 1}': subjects[slot] for slot in range(len(subjects))
    }

    return pd.DataFrame({'id': row['id'], **subject_columns}, index=[0])

In [36]:
'''
Creates column: bill_party_affiliation
This function takes in the json-to-pandas converted dataframe and searches the 'sponsorships' column for bill sponsors. 
The sponsors are then counted by party affiliation based on keyword, and one label per bill is returned: 
'R' = more Republican sponsors, 'D' = more Democratic sponsors, 'U' = tie, 'NA' = sponsor party data missing.

make this loop through entire df each row and return party affiliation
'''
def _majority_party(sponsors):
    """Return 'R', 'D', 'U' (tie) or 'NA' (unreadable sponsor data) for one bill."""
    rep = 0
    dem = 0
    try:
        for sponsor in sponsors:
            party = sponsor['person']['party']
            if party == 'Republican':
                rep += 1
            elif isinstance(party, str) and 'Democrat' in party:
                dem += 1
            # Any other party is counted for neither side instead of being
            # lumped in with the Democrats.
    except (KeyError, TypeError):
        # KeyError: 'person'/'party' missing; TypeError: sponsor list or
        # 'person' is None/NaN. Either way the bill cannot be classified.
        return 'NA'

    if rep > dem:
        return 'R'
    if dem > rep:
        return 'D'
    return 'U'  # tie situation, uni support


def party_counter(df):
    """
    Add a 'bill_party_affiliation' column derived from each bill's sponsors.

    :param df: DataFrame with a 'sponsorships' column; each cell is a list of
               sponsor dicts carrying ``['person']['party']``
    :return: the same DataFrame object (modified in place) with the new column:
             'R' / 'D' for the party with more sponsors, 'U' on a tie,
             'NA' when the sponsor data of a bill cannot be read
    """
    if 'sponsorships' in df.columns:
        sponsor_lists = df['sponsorships'].tolist()
    else:
        # Without the column every bill is unclassifiable.
        sponsor_lists = [None] * len(df)

    df['bill_party_affiliation'] = [_majority_party(sponsors) for sponsors in sponsor_lists]

    return df

In [37]:
#exporting as json to avoid string character errors that come up when exporting to csv
def export_json(data, state_list, p_start, p_end):
    """
    Write ``data`` as a records-oriented JSON file inside the 'data' directory.

    The file is named ``legi_data_<state_list>_sample_<p_start>-<p_end>.json``.

    :param data: DataFrame to export
    :param state_list: states covered by the export; its ``str()`` form is used in the name
    :param p_start: first page of the exported window (used in the name)
    :param p_end: end page of the exported window (used in the name)
    :return: None
    """
    file_name = 'legi_data_' + str(state_list) + '_sample'
    out_dir = 'data'

    # Create the target directory on first use rather than failing on write.
    os.makedirs(out_dir, exist_ok=True)

    # os.path.join gives the same 'data\...' path on Windows and a real
    # sub-directory path elsewhere (a literal backslash is part of the file
    # name on POSIX systems).
    path = os.path.join(out_dir, file_name + '_' + str(p_start) + '-' + str(p_end) + '.json')

    data.to_json(path, orient='records')

In [38]:
'''
This function combines the above functions into one for ease of api calling. 
However, this approach will use many api calls. 

Provide this function your apikey from openstates, a state name,
a start page starting at 10 minimum, and a number of times you want to 
have this function grab the next 10 pages. The time pause between calls
is mandatory by openstates api free license calls and is necessary for the program to run.

Files are exported as json and not csv due to formatting issues between columns that break csv export.
I used dataiku to read in the json files easily and combine into one dataset to test and export a csv from.
'''
def run_daily_api_limit(api, state_list, p_start, p_end, call_count=5, pause_seconds=65):
    """
    Repeatedly download, clean, enrich and export consecutive windows of bill pages.

    Each run pauses, fetches pages ``range(p_start, p_end)`` for every state,
    drops bills without vote or subject info, unpacks votes and subjects, adds
    the sponsor party label and exports the result as JSON. The page window is
    then moved forward by its own size for the next run.

    :param api: Open States API key
    :param state_list: list of state abbreviations
    :param p_start: first page of the first window
    :param p_end: end page (exclusive) of the first window
    :param call_count: number of windows to process
    :param pause_seconds: seconds to wait before each run (default 65, as before)
    :return: None
    """
    # Advance by the window size so no page is skipped or fetched twice;
    # fall back to the historical step of 10 for a degenerate window.
    window = p_end - p_start if p_end > p_start else 10

    for i in range(call_count):
        print('pausing for API required 60 seconds to avoid restrictions')
        time.sleep(pause_seconds)
        print('Run: ' + str(i + 1) + ' of ' + str(call_count))

        df = data_framer(api, state_list, page_start=p_start, page_end=p_end)
        print('Bills before filtering for vote/subject info: ' + str(len(df)))
        # filter out blank vote rows
        df_filtered_votes = filter_df(df, 'votes')
        print('results after filtering votes: ' + str(len(df_filtered_votes)))
        # filter out blank bill subject rows
        df_filtered_all = filter_df(df_filtered_votes, 'subject')
        print('Bills remaining after filtering for vote/subject info: ' + str(len(df_filtered_all)))

        if len(df_filtered_all) == 0:
            print('No vote or subject information after filtering bills, moving to next '
                  + str(window) + ' pages...')
        else:
            try:
                print('Unpacking Vote Information')
                unpacked_votes_df = unpack_votes(df_filtered_all)
                print('Unpacking Subject Information')
                # one single-row frame per bill, stacked into one frame
                unpacked_subject_df = pd.concat(
                    [extract_info(row) for _, row in unpacked_votes_df.iterrows()],
                    ignore_index=True,
                )
                print('Merging dataframes')
                merged_df = unpacked_votes_df.merge(unpacked_subject_df, on='id')
                print('Determining Bill Party Affiliations')
                final_legi_data = party_counter(merged_df)
                print('Exporting Data')

                export_json(final_legi_data, state_list, p_start, p_end)
                print('Exported file: ' + str(i + 1) + ' of ' + str(call_count))
            except (TypeError, KeyError, IndexError, ValueError) as err:
                # Report the failed window and keep going. The old handler
                # re-ran the same failing steps and read a 'results' column
                # that does not exist, so it always raised again.
                print('Could not process pages ' + str(p_start) + '-' + str(p_end)
                      + ': ' + repr(err))

        # Move to the next window whatever happened with this one.
        p_start += window
        p_end += window