In [None]:
import sys; sys.path.insert(0,'..')

In [None]:
import datetime
import glob
import json
import os

import numpy as np
import pandas as pd

In [None]:
from flight_tables.heathrow_parsing import extract_batch_heathrow
from flight_tables.flight_parsing import ParsedFlights

# Setup

In [None]:
# Glob pattern selecting which JSON files to load; the commented alternative picks a single file.
file_pattern = '2020*Z.json' #'2020-01-29Z.json'
root_dir = os.path.dirname(os.getcwd()) # parent of cwd
# Join each path component separately so the result is valid on every OS
# (a literal '\\' is only a directory separator on Windows).
# The final '' keeps the trailing separator that data_dir has always had.
data_dir = os.path.join(root_dir, 'data', 'heathrow_data', '')
path_pattern = os.path.join(data_dir, file_pattern)

In [None]:
# Files to Load
# glob returns matches in arbitrary order; sort them so the load order
# (and therefore the row order of the combined DataFrame) is reproducible.
files = sorted(glob.glob(path_pattern)) # A List of file paths
print(f"{len(files)} files found")

# Load Batch

In [None]:
def file_to_df(path):
    """Parse one Heathrow flights JSON file into a DataFrame.

    Parameters:
        path (str): Location of the JSON file to read.
    Returns:
        pd.DataFrame: Result of ``ParsedFlights(...).to_dataframe()`` for the
        batch extracted from the file's contents.
    """
    with open(path, 'r') as json_file:
        raw_payload = json.load(json_file)

    # raw dict -> batch info -> ParsedFlights -> DataFrame
    return ParsedFlights(extract_batch_heathrow(raw_payload)).to_dataframe()

In [None]:
def batch_load_heathrow_json(file_paths):
    """Load a list of Heathrow JSON files into a single DataFrame.

    Parameters:
        file_paths (list): File path strings of the JSON files you want to load.
    Returns:
        df (pd.DataFrame): The per-file DataFrames concatenated in the order given.
    Raises:
        ValueError: If file_paths is empty (raised by pd.concat).
    """
    # Iterate over the argument, not the notebook-level `files` variable, so the
    # function loads exactly what the caller passed in.
    flight_dataframes = [file_to_df(file_path) for file_path in file_paths]

    df = pd.concat(flight_dataframes)

    return df

In [None]:
# Parsing every file is slow when there are many of them (a progress bar would help here).
df = batch_load_heathrow_json(file_paths=files)

# Pickling
After the first run, you can save the combined DataFrame as a pickle and load it from there instead of re-parsing the JSON files.

In [None]:
#pd.to_pickle(df, './all_flights.pkl') # Saves to cwd (Notebooks directory)

In [None]:
# Use the cached pickle when it exists; otherwise keep the DataFrame that was
# built from the JSON files, so a fresh "Run All" does not stop here.
# NOTE: only unpickle files you created yourself -- pickle can execute arbitrary code.
pickle_path = 'all_flights.pkl'
if os.path.exists(pickle_path):
    df = pd.read_pickle(pickle_path)
else:
    print(f"{pickle_path} not found; using the DataFrame loaded from the JSON files.")

# Dataframe Preparation
* Drop Duplicates
* Drop rows listed under an alternative (code-share) flight ID, keeping only the primary flight ID

In [None]:
# Delete Duplicates
# Reassign instead of using inplace=True: the cell then yields a new frame
# rather than mutating the loaded one, and the call can be chained.
df = df.drop_duplicates()

In [None]:
#Duplicate Analysis:
#df.loc[df.duplicated()] # Show duplicates
#df.loc[(df.delay_mins==16) & (df.flight_id=='BR068')] # Find specific Duplicates
#assert df.duplicated().any()==False, "Duplicated Entries found in Table."

In [None]:
# Keep only the rows whose code_share value is not 'alt_code' (alternative flight IDs).
is_not_alt_code = df['code_share'] != 'alt_code'
df = df.loc[is_not_alt_code]

### Validation

In [None]:
# Sanity check: the number of distinct scheduled dates must equal the number of files found.
scheduled_dates = df['scheduled_datetime'].dt.date
dates_count = len(scheduled_dates.unique())
assert dates_count == len(files), (
    f"Number of files doesn't match number of dates. \n\t You have {len(files)} files but data for {dates_count} dates."
)

# Analysis