In [1]:
import json
import pandas as pd

In [2]:
def read_json(path):
    """
    Read a file containing one or more JSON documents and return them as a list.

    The file may hold several top-level JSON values written back to back
    (e.g. ``{...}{...}``), separated by whitespace/newlines, or separated by
    commas. Values are decoded one after another with
    ``json.JSONDecoder.raw_decode`` instead of textually rewriting ``}{`` into
    ``},{``, so string values that happen to contain ``}{`` are left untouched.

    Parameters:
    - path (str): The path to the JSON file.

    Returns:
    - list: The decoded top-level values in file order. An empty or
      whitespace-only file yields an empty list.

    Raises:
    - FileNotFoundError: If the specified file path is not found.
    - UnicodeDecodeError: If the file is not valid UTF-8.
    - json.JSONDecodeError: If the content is not a sequence of valid JSON values.
    """
    try:
        # JSON interchange files are UTF-8 (RFC 8259); do not rely on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"File not found: {path}") from exc

    decoder = json.JSONDecoder()
    documents = []
    end = len(text)
    pos = 0

    def skip_whitespace(index):
        # raw_decode does not accept leading whitespace, so step over it manually.
        while index < end and text[index] in ' \t\r\n':
            index += 1
        return index

    while True:
        pos = skip_whitespace(pos)
        if pos >= end:
            break
        # A single comma is tolerated *between* values (never before the first),
        # which keeps comma-separated input working as it did before.
        if documents and text[pos] == ',':
            pos = skip_whitespace(pos + 1)
        value, pos = decoder.raw_decode(text, pos)
        documents.append(value)

    return documents


In [3]:
def extract_employees(node):
    """
    Collect every driver record found in a node and in all of its descendants.

    A node contributes the entries under
    ``node['metadata']['employee_data']['driver_details']`` when it has a
    'metadata' key; its children are read from the optional 'nodes' key.
    Records are returned in pre-order: a node's own drivers first, then each
    child's subtree in listed order.

    Parameters:
    - node (dict): A dictionary representing a node in the hierarchical structure.

    Returns:
    - list: A list of dictionaries containing employee details.

    """
    def _walk(current):
        # Own drivers first, then descend into each child in order.
        if 'metadata' in current:
            yield from current['metadata']['employee_data']['driver_details']
        for child in current.get('nodes', []):
            yield from _walk(child)

    return list(_walk(node))

In [4]:
def extract_driver_profile(data):
    """
    Build a driver's display name from the 'driver_profile' entry of a record.

    Parameters:
    - data (dict): A dictionary containing driver profile information; its
      'driver_profile' entry must provide 'firstName' and 'lastName'.

    Returns:
    - str: First and last name joined by a single space.
    """
    profile = data['driver_profile']
    name_parts = (profile['firstName'], profile['lastName'])
    return ' '.join(name_parts)

In [5]:
def extract_driver_metrics(data):
    """
    Pick the metric fields out of a record's 'driver_metrics' entry and
    rename them to the shorter output column names.

    Parameters:
    - data (dict): A dictionary containing driver metrics information.

    Returns:
    - dict: A dictionary containing various driver metrics, keyed by output
      column name, in the order listed below.

    """
    # (output column, key in the source record) pairs, in output order.
    # Source keys are copied exactly as the input spells them
    # (including 'driver_profitabilty_score').
    field_map = (
        ('vehicle_brand', 'vehicle_brand'),
        ('experience_group', 'driver_experience_group'),
        ('awards', 'special_achievements_awarded'),
        ('endurance_score', 'driver_endurance_score'),
        ('profitability_score', 'driver_profitabilty_score'),
        ('safety_adherence_score', 'driver_safety_adherence_score'),
        ('efficiency_score', 'driving_efficiency_score'),
        ('no_of_1_star_rating', 'Number_of_1_star_ratings'),
        ('no_of_2_star_rating', 'Number_of_2_star_ratings'),
        ('no_of_3_star_rating', 'Number_of_3_star_ratings'),
        ('no_of_4_star_rating', 'Number_of_4_star_ratings'),
        ('no_of_5_star_rating', 'Number_of_5_star_ratings'),
    )
    source = data['driver_metrics']
    return {column: source[source_key] for column, source_key in field_map}

In [6]:
def process_data(json_data):
    """
    Flatten the parsed taxi-group documents into one flat record per driver.

    For each document, the depot hierarchy under
    ``['taxi_org_data']['depot_data']['root']`` is walked with
    ``extract_employees``; every driver found becomes one output dictionary
    carrying the group name, the driver's full name, the 'disabled' and
    'deleted' flags, and the metric fields from ``extract_driver_metrics``.

    Parameters:
    - json_data (list): A list of dictionaries representing JSON data with a specific structure.

    Returns:
    - list: A list of dictionaries containing processed information about taxi groups and drivers.

    """
    rows = []
    for record in json_data:
        hierarchy_root = record['taxi_org_data']['depot_data']['root']
        for employee in extract_employees(hierarchy_root):
            row = {
                'taxi_group_name': record['taxi_group_name'],
                'driver_full_name': extract_driver_profile(employee),
            }
            metrics = extract_driver_metrics(employee)
            row['disabled'] = employee["disabled"]
            row['deleted'] = employee["deleted"]
            # Metric columns go last, after the identifying fields and flags.
            row.update(metrics)
            rows.append(row)

    return rows

In [7]:
# Load the raw export (a list of parsed JSON documents) and flatten it
# into one dictionary per driver.
json_file_path = "data.json"
json_data = read_json(json_file_path)
final_data = process_data(json_data)

In [8]:
# Build the analysis table: one row per flattened driver record.
df = pd.DataFrame(final_data)

In [12]:
# Store the flag-like columns as integers (0/1) instead of their raw type.
columns_to_convert = ['disabled','deleted','awards']
df[columns_to_convert] = df[columns_to_convert].astype(int)

In [13]:
# Preview the first rows to sanity-check the flattened columns.
df.head()

Unnamed: 0,taxi_group_name,driver_full_name,disabled,deleted,vehicle_brand,experience_group,awards,endurance_score,profitability_score,safety_adherence_score,efficiency_score,no_of_1_star_rating,no_of_2_star_rating,no_of_3_star_rating,no_of_4_star_rating,no_of_5_star_rating
0,"Walsh, Hammond and Craig",Bruce Marshall,0,1,7,6,0,0.666398,-0.252875,0.279009,1.89221,9,38,40,20,22
1,"Walsh, Hammond and Craig",Carol Holmes,0,0,9,4,0,-1.17785,-1.56046,-1.21152,-0.510155,8,16,28,43,39
2,"Walsh, Hammond and Craig",Jay Bailey,0,0,16,6,0,-0.557925,0.0833,0.596134,0.257793,5,12,25,39,26
3,"Walsh, Hammond and Craig",Aimee Smith,0,1,3,4,0,-0.787186,-0.484124,0.966867,0.54889,8,8,16,52,34
4,"Walsh, Hammond and Craig",Hayley Hall,0,0,14,1,0,0.518161,-0.108167,-0.025077,-0.133834,22,32,53,16,13


In [14]:
# Persist the flattened table as CSV with a header row and without the index column.
df.to_csv("extracted_data.csv", header=True,index=False)

In [15]:
# Inspect column dtypes and non-null counts.
# NOTE(review): vehicle_brand and experience_group are reported as object dtype in
# the saved output although head() shows numeric-looking values — presumably
# strings in the source data; confirm before using them numerically.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44061 entries, 0 to 44060
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   taxi_group_name         44061 non-null  object 
 1   driver_full_name        44061 non-null  object 
 2   disabled                44061 non-null  int32  
 3   deleted                 44061 non-null  int32  
 4   vehicle_brand           44061 non-null  object 
 5   experience_group        44061 non-null  object 
 6   awards                  44061 non-null  int32  
 7   endurance_score         44061 non-null  float64
 8   profitability_score     44061 non-null  float64
 9   safety_adherence_score  44061 non-null  float64
 10  efficiency_score        44061 non-null  float64
 11  no_of_1_star_rating     44061 non-null  int64  
 12  no_of_2_star_rating     44061 non-null  int64  
 13  no_of_3_star_rating     44061 non-null  int64  
 14  no_of_4_star_rating     44061 non-null

In [16]:
# Summary statistics for the numeric columns.
# NOTE(review): in the saved output the star-rating count columns have a minimum
# of -4, which is not a valid count — worth checking the source data.
df.describe()

Unnamed: 0,disabled,deleted,awards,endurance_score,profitability_score,safety_adherence_score,efficiency_score,no_of_1_star_rating,no_of_2_star_rating,no_of_3_star_rating,no_of_4_star_rating,no_of_5_star_rating
count,44061.0,44061.0,44061.0,44061.0,44061.0,44061.0,44061.0,44061.0,44061.0,44061.0,44061.0,44061.0
mean,0.024943,0.048456,0.083271,0.004851,0.003015,0.000761,0.003057,10.46506,20.905177,31.664442,31.701754,31.617417
std,0.155952,0.214729,0.276294,1.002364,0.99918,1.001139,0.998935,6.691607,11.898157,17.204885,17.34335,16.913903
min,0.0,0.0,0.0,-2.75623,-2.38045,-2.30958,-2.27803,-4.0,-4.0,-4.0,-4.0,-4.0
25%,0.0,0.0,0.0,-0.689989,-0.731689,-0.719151,-0.762752,5.0,11.0,18.0,17.0,18.0
50%,0.0,0.0,0.0,-0.109438,-0.091019,-0.15455,-0.07587,10.0,20.0,32.0,30.0,30.0
75%,0.0,0.0,0.0,0.487389,0.603105,0.536859,0.750876,15.0,30.0,45.0,46.0,45.0
max,1.0,1.0,1.0,7.38001,3.52134,5.10937,3.94814,34.0,56.0,78.0,77.0,81.0
