# Taxi Data Processing
## Overview
This notebook outlines the process of extracting and processing taxi data from JSON files using Python. The data covers taxi groups, driver profiles, driver metrics, and other related fields such as identifiers and status flags. The notebook demonstrates how to extract data from JSON files, process it to derive meaningful insights, and create a structured DataFrame for further analysis.

## Description
The notebook includes functions to extract taxi groups from JSON files, process the extracted data to extract relevant information, and create a pandas DataFrame for easy manipulation and analysis. It covers various aspects such as data extraction, data processing, and DataFrame creation, providing a comprehensive guide to working with taxi data in Python.

### Imports

In [1]:
import json
import pandas as pd

### Functions

In [9]:
def extract_data(input, taxi_group_name):
    """
    Extracts one row per driver from a depot node and, recursively, its sub-nodes.

    Parameters:
        input (dict): A node of the nested JSON structure. It must contain
            ``metadata.rank`` and ``metadata.employee_data.driver_details``
            and may contain a ``nodes`` list of child nodes of the same shape.
            (The name shadows the ``input`` builtin; it is kept unchanged so
            that existing keyword callers keep working.)
        taxi_group_name (str): Name of the taxi group, copied into every row.

    Returns:
        list: List of rows, parent drivers first and then each child node in
        order. Every row is a list holding, in this order: the taxi group
        name, the node rank, the profile fields, the metric fields and the
        extra fields listed in the tuples below.

    Raises:
        KeyError: If a node or driver record lacks one of the required keys.

    Notes:
        - A node without ``nodes`` (or with an empty/``None`` value) is a
          leaf; this is the normal end of the recursion, so nothing is
          printed for it.
        - ``driver_profitabilty_score`` is spelled exactly as the key appears
          in the data.

    """
    # Source keys, in output-column order.
    profile_fields = ("firstName", "lastName")
    metric_fields = (
        "vehicle_brand",
        "driver_experience_group",
        "special_achievements_awarded",
        "driver_endurance_score",
        "driver_profitabilty_score",
        "driver_safety_adherence_score",
        "driving_efficiency_score",
        "Number_of_1_star_ratings",
        "Number_of_2_star_ratings",
        "Number_of_3_star_ratings",
        "Number_of_4_star_ratings",
        "Number_of_5_star_ratings",
    )
    extra_fields = ("driver_id", "disabled", "deleted")

    metadata = input["metadata"]

    # taxi division rank, shared by every driver of this node
    rank = metadata["rank"]

    result = []
    for driver in metadata["employee_data"]["driver_details"]:
        profile = driver["driver_profile"]
        metrics = driver["driver_metrics"]

        row = [taxi_group_name, rank]
        row.extend(profile[key] for key in profile_fields)
        row.extend(metrics[key] for key in metric_fields)
        row.extend(driver[key] for key in extra_fields)
        result.append(row)

    # Recursively process child nodes; extend keeps the result a flat list.
    for node in input.get("nodes") or ():
        result.extend(extract_data(node, taxi_group_name))

    return result

In [10]:
def extract_taxi_groups(file_path_copy):
    """
    Extracts taxi groups from a JSON file.

    Parameters:
        file_path_copy (str): The path to the JSON file.

    Returns:
        list: The top-level elements of the parsed JSON document, in order
            (for a top-level JSON object these are its keys).
        None: If the file does not exist or does not contain valid JSON; a
            message is printed in either case.

    """
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_path_copy, 'r', encoding='utf-8') as f:
            data = json.load(f)

    except FileNotFoundError:
        print(f"File '{file_path_copy}' not found.")
        return None

    except json.JSONDecodeError:
        print(f"Error decoding JSON from file '{file_path_copy}'.")
        return None

    # Copy into a fresh list so callers always receive a list object.
    return list(data)

In [11]:
def process_taxi_groups(taxi_groups):
    """
    Flatten every taxi group into driver rows.

    Parameters:
        taxi_groups (list): Taxi group dicts, each providing
            "taxi_group_name" and "taxi_org_data" -> "depot_data" -> "root".

    Returns:
        list: The rows produced by extract_data for every group, concatenated
        in group order.

    """
    # Resolve (name, depot data) for all groups first, so a group missing
    # one of these keys fails before any extraction begins.
    pairs = [
        (group["taxi_group_name"], group["taxi_org_data"]["depot_data"])
        for group in taxi_groups
    ]

    collected = []
    for group_name, depot_data in pairs:
        group_rows = extract_data(depot_data["root"], group_name)

        if group_rows is None:
            print(f"Error processing taxi group '{group_name}'")
            continue

        collected.extend(group_rows)

    return collected

In [12]:
def create_dataframe(result, column_names):
    """
    Build a pandas DataFrame from row data and matching column labels.

    Parameters:
        result (list): Rows to load, one inner list per DataFrame row.
        column_names (list): Column labels, one per value in each row.

    Returns:
        pandas.DataFrame: Frame holding the rows under the given labels.

    """
    frame = pd.DataFrame(data=result, columns=column_names)
    return frame

### Execution

In [17]:
# JSON file that holds the taxi group data
file_path_copy = "data_copy.json"

# Load the top-level taxi groups from that file
taxi_groups = extract_taxi_groups(file_path_copy)

if not taxi_groups:
    print("Error extracting taxi groups.")
else:
    # Flatten all groups into one row per driver
    result = process_taxi_groups(taxi_groups)

    # Labels follow the order in which extract_data fills each row
    column_names = [
        'taxi_group_name',
        'rank',
        'first_name',
        'last_name',
        'vehicle_brand',
        'driver_experience_group',
        'special_achievements_awarded',
        'driver_endurance_score',
        'driver_profitabilty_score',
        'driver_safety_adherence_score',
        'driving_efficiency_score',
        'number_of_1_star_ratings',
        'number_of_2_star_ratings',
        'number_of_3_star_ratings',
        'number_of_4_star_ratings',
        'number_of_5_star_ratings',
        'driver_id',
        'disabled',
        'deleted'
    ]

    # Assemble the rows into a DataFrame for the cells that follow
    df = create_dataframe(result, column_names)

In [18]:
# Preview the first rows; `df` is only defined when the extraction cell found taxi groups.
df.head()

Unnamed: 0,taxi_group_name,rank,first_name,last_name,vehicle_brand,driver_experience_group,special_achievements_awarded,driver_endurance_score,driver_profitabilty_score,driver_safety_adherence_score,driving_efficiency_score,number_of_1_star_ratings,number_of_2_star_ratings,number_of_3_star_ratings,number_of_4_star_ratings,number_of_5_star_ratings,driver_id,disabled,deleted
0,"Walsh, Hammond and Craig",0,Bruce,Marshall,7,6,False,0.666398,-0.252875,0.279009,1.89221,9,38,40,20,22,f6e4b169-b46a-42e2-a867-aeb5a3504606,False,True
1,"Walsh, Hammond and Craig",1,Carol,Holmes,9,4,False,-1.17785,-1.56046,-1.21152,-0.510155,8,16,28,43,39,0dc2f8f6-cfc5-4b33-be82-9dbe1971186a,False,False
2,"Walsh, Hammond and Craig",1,Jay,Bailey,16,6,False,-0.557925,0.0833,0.596134,0.257793,5,12,25,39,26,88ce78cc-5843-48b7-8b2e-9b003060dbdd,False,False
3,"Walsh, Hammond and Craig",1,Aimee,Smith,3,4,False,-0.787186,-0.484124,0.966867,0.54889,8,8,16,52,34,968b5638-cea1-4f92-b926-2b19da8edbbf,False,True
4,"Walsh, Hammond and Craig",1,Hayley,Hall,14,1,False,0.518161,-0.108167,-0.025077,-0.133834,22,32,53,16,13,c0b889e0-faba-44dd-bf64-296b273e1e77,False,False


In [19]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44061 entries, 0 to 44060
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   taxi_group_name                44061 non-null  object 
 1   rank                           44061 non-null  int64  
 2   first_name                     44061 non-null  object 
 3   last_name                      44061 non-null  object 
 4   vehicle_brand                  44061 non-null  object 
 5   driver_experience_group        44061 non-null  object 
 6   special_achievements_awarded   44061 non-null  bool   
 7   driver_endurance_score         44061 non-null  float64
 8   driver_profitabilty_score      44061 non-null  float64
 9   driver_safety_adherence_score  44061 non-null  float64
 10  driving_efficiency_score       44061 non-null  float64
 11  number_of_1_star_ratings       44061 non-null  int64  
 12  number_of_2_star_ratings       44061 non-null 