### FIPS Data Extraction

Extracts the FIPS data from the United States Department of Transportation open-data API (data.transportation.gov).

In [5]:
# Data Extraction - FIPS
# Version 8
## 1/26/25
## 19:10

In [6]:
# Import System Module 
import sys

# Import OS Module
import os

# Add the root directory to sys.path
sys.path.append(os.path.abspath(".."))

# Now you can use absolute imports
from d497_helpers import checkpoint_helper as CheckPoint, config

# Pandas
import pandas as pd

# Socrata Sodapy
from sodapy import Socrata


In [7]:
# Main Function
# According to the API's documentation, there is a limit of 2000 items that can be captured during each request.
# The main function requests the API in a loop, 2000 items per iteration, and saves all results to a single DataFrame.


def main(client=None, save_checkpoint=None, dataset_id="eek5-pv8d", page_size=2000):
    """Download every row of a Socrata dataset and checkpoint it as one DataFrame.

    The dataset is read page by page (``page_size`` rows per request) until the
    row count reported by the API has been covered, then all rows are combined
    into a single DataFrame and handed to the checkpoint writer.

    Args:
        client: Optional Socrata-like client exposing ``get``, ``get_metadata``
            and ``close``. When omitted, a client for ``data.transportation.gov``
            is created with the app token from ``config.global_dot_api_key`` and
            closed again before returning.
        save_checkpoint: Optional callable ``(folder, name, dataframe)``.
            Defaults to ``CheckPoint.create_checkpoint``.
        dataset_id: Socrata dataset identifier to download.
        page_size: Number of rows requested per API call; must be positive.

    Returns:
        None. The combined DataFrame is passed to ``save_checkpoint``.

    Raises:
        ValueError: If ``page_size`` is not a positive integer.
    """
    if page_size <= 0:
        raise ValueError("page_size must be a positive integer")

    # Only a client created here is closed here; an injected one belongs to the caller.
    owns_client = client is None
    if owns_client:
        # Credentials come from the project config - never inline tokens or passwords.
        client = Socrata("data.transportation.gov", config.global_dot_api_key)
    if save_checkpoint is None:
        save_checkpoint = CheckPoint.create_checkpoint

    try:
        # The count query returns a one-element list such as [{"count": "1234"}].
        count_result = client.get(dataset_id, select="count(*)")
        total_row_count = int(count_result[0]["count"])

        # Collect raw records first and build the DataFrame once; concatenating
        # inside the loop copies all previous rows on every iteration.
        records = []
        for offset in range(0, total_row_count, page_size):
            # A fixed sort order keeps page boundaries stable between requests.
            page = client.get(dataset_id, limit=page_size, offset=offset, order=":id")
            if not page:
                # The dataset shrank after the count was taken; nothing more to read.
                break
            records.extend(page)

        if records:
            main_results_df = pd.DataFrame.from_records(records)
        else:
            # No rows: still produce a frame carrying the dataset's column names.
            metadata = client.get_metadata(dataset_id)
            column_names = [column["fieldName"] for column in metadata["columns"]]
            main_results_df = pd.DataFrame(columns=column_names)
    finally:
        if owns_client:
            client.close()

    save_checkpoint("raw_fips_data", "raw_fips_master_data", main_results_df)


In [8]:
# Run the extraction when this cell is executed. `__name__` is "__main__" in a
# notebook kernel and when the code is run as a script, but not when imported.
if __name__ == "__main__":
    main()

## [Next Step: UFO Data Extraction](data_extraction_ufo.ipynb)
---
#### [Return To Landing Page](order_of_operations_landing.ipynb)