In [77]:
import requests
import json
import re
# import pandas as pd
from collections import defaultdict, deque
from datetime import datetime

In [104]:
from datetime import datetime, timedelta, date
from time import sleep
from random import randrange
from pathlib import Path
from fast_flights import FlightData, Passengers, create_filter, get_flights, FlightsAPIResult

MAX_WEEKS = 20

search_pairs = [
    ['LAS', 'LAX'],
    ['SEA', 'YYZ'],
    ['NYC', 'TYO'],
    ['SAN', 'EZE'],
    ['SNA', 'BOS'],
]

def random_datestr() -> str:
    rand = randrange(1, MAX_WEEKS)
    return (date.today() + timedelta(days=round(rand * 7))).strftime('%Y-%m-%d')

def random_daterange(days_between: int = 0) -> tuple[date, date]:
    rand = randrange(1, MAX_WEEKS)
    start = date.today() + timedelta(days=round(rand * 7))
    return start, start + timedelta(days=days_between)

common_filter_kwargs = {
    'trip': 'one-way',
    'seat': 'economy',
    'passengers': Passengers(
        adults=1,
        children=0,
        infants_in_seat=0,
        infants_on_lap=0
    ),
}

# Create a new filter
filters = [create_filter(flight_data=[FlightData(date=random_datestr(), from_airport=from_, to_airport=to)], **common_filter_kwargs)
           for from_, to in search_pairs]


In [2]:
# Get flights with a filter
results = []
for filter_ in filters:
    print('requesting flight')
    results.append(get_flights(filter_))
    sleep(randrange(8, 20))

# write to json file if success
Path('results.json').write_text(json.dumps([result.raw_data.raw_data for result in results]))

requesting flight
requesting flight
requesting flight
requesting flight
requesting flight


We want to find real list types in this data schema (as opposed to a packed struct encoded as a list). 

In order to do so, we can scan multiple instances of the data, and track the size of each list we encounter (using the path/indices). When the size of a list is not constant, we know that the data type of that index is a list type rather than a struct packed as a list.

The length of a struct packed as a list should never change because the struct is well defined.

In [115]:
values = defaultdict(lambda: set())
num_elements = defaultdict(lambda: set())

# determine which keys refer to list types
def find_lists(data: list, prefix: str = '') -> bool:
    for i, value in enumerate(data):
        key_str = f'{prefix}{i}'
        if not isinstance(value, list):
            values[key_str].add(value)
            continue
        num_elements[key_str].add(len(value))
        find_lists(value, prefix=f'{key_str},')

# scan all results to populate values and num_elements
for result in results:
    find_lists(result.raw_data.raw_data)

# sort so we go in breadth first order
all_list_keys = sorted(num_elements.keys(), key=lambda index: len(index))

# num_elements with keys fixed to denote real arrays, an index of i is used where the previous index is referring to a real array type
fixed_num_elements = defaultdict(lambda: set())

# set of keys that denote real arrays
real_list_keys = []

for list_key in all_list_keys:
    # fix the list key by substituting array indices with i, IE: (0,2,0), (0,2,1), (0,2,2) -> (0,2,i)
    fixed_list_key = list_key
    for real_list_key in real_list_keys:
        # modify all keys starting with list_key to use i as the next key rather than whatever index it was
        if fixed_list_key.startswith(real_list_key):
            fixed_list_key = re.sub(rf'({real_list_key}),\d+(,?.*)', r'\1,i\2', fixed_list_key)

    # if the size of the list at the same index isn't constant, this is a real list type
    if (fixed_list_key in fixed_num_elements and len(fixed_num_elements[fixed_list_key]) > 1) or len(num_elements[fixed_list_key]) > 1:
        if fixed_list_key not in real_list_keys:
            print(f'found real list {fixed_list_key}')
            real_list_keys.append(fixed_list_key)
    else:
        # merge num_elements
        if fixed_list_key in fixed_num_elements:
            fixed_num_elements[fixed_list_key] |= num_elements[list_key]
        else:
            fixed_num_elements[fixed_list_key] = num_elements[list_key]

# do the same thing with values now
all_value_keys = sorted(values.keys(), key=lambda index: len(index))
fixed_values = defaultdict(lambda: set())
for value_key in all_value_keys:
    fixed_value_key = value_key
    for real_list_key in real_list_keys:
        if fixed_value_key.startswith(real_list_key):
            fixed_value_key = re.sub(rf'({real_list_key}),\d+(,?.*)', r'\1,i\2', fixed_value_key)
    if fixed_value_key in fixed_values:
        fixed_values[fixed_value_key] |= values[value_key]
    else:
        fixed_values[fixed_value_key] = values[value_key]

for value_key in sorted(fixed_values.keys(), key=lambda index: len(index)):
    values = fixed_values[value_key]
    if all([len(value) > 100 if isinstance(value, str) else True for value in values]):
        # print(f'{value_key} looks like jumbled data')
        continue
    
    print(f'{value_key}: {", ".join(map(str, fixed_values[value_key]))}')

found real list 11
found real list 17
found real list 26
found real list 2,0
found real list 3,0
found real list 7,1,1
found real list 7,2,0
found real list 7,4,0
found real list 17,i,2
found real list 1,0,0,0,2
found real list 1,0,1,0,2
found real list 2,0,i,0,2
found real list 2,0,i,0,5
found real list 3,0,i,0,1
found real list 3,0,i,0,2
found real list 3,0,i,0,5
found real list 3,0,i,0,8
found real list 3,0,i,0,24
found real list 3,0,i,0,13
found real list 2,0,i,0,13
found real list 2,0,i,0,22,9
found real list 3,0,i,0,22,9
found real list 2,0,i,0,2,i,8
found real list 3,0,i,0,2,i,8
found real list 2,0,i,0,2,i,12
found real list 2,0,i,0,2,i,10
found real list 3,0,i,0,2,i,12
found real list 3,0,i,0,2,i,10
found real list 2,0,i,0,2,i,18
found real list 3,0,i,0,2,i,18
0,3: kH_zZrWTE97P2O8P6ZKq8A4, sX_zZtX5J97P2O8P6ZKq8A4, vH_zZpzmA97P2O8P6ZKq8A4, mn_zZsa8CN3X2O8Pkom7uA0, pH_zZtigLaK02O8P4qmdoAk
0,4: HDdGhv_Vij6AAB7r2ABG---------oycfy7AAAAAGbzf6QLLN3WA, HIPLNX5javjwABkDNQBG---------oybg

Now, reverse engineering the indices should be much easier using `fixed_values`. Define more search pairs to get a wider set of data

In [109]:
keys = {
    0: {
        'name': 'flight_data',
        0: 'AIRLINE_IATA',
        1: {
            0: 'AIRLINE_NAME',
        },
        2: {
            'name': 'FLIGHT_LEG',
            'i': {
                2: 'FLIGHT_OPERATOR',
                3: 'FROM_ICAO',
                4: 'FROM_FULL_NAME',
                5: 'TO_FULL_NAME',
                6: 'TO_ICAO',
                8: {
                    0: 'DEPART_HOUR',
                    1: 'DEPART_MIN',
                },
                10: {
                    0: 'ARRIVE_HOUR',
                    1: 'ARRIVE_MIN',
                },
                14: 'SEAT_PITCH',
                17: 'AIRCRAFT',
                20: {
                    0: 'DEPART_YEAR',
                    1: 'DEPART_MONTH',
                    2: 'DEPART_DAY',
                },
                21: {
                    0: 'ARRIVE_YEAR',
                    1: 'ARRIVE_MONTH',
                    2: 'ARRIVE_DAY',
                },
                22: {
                    0: 'AIRLINE_IATA',
                    1: 'FLIGHT_NUMBER',
                    3: 'AIRLINE_NAME',
                },
            },
        },
        3: 'FROM_ICAO',
        4: {
            0: 'DEPART_YEAR',
            1: 'DEPART_MONTH',
            2: 'DEPART_DAY',
        },
        5: {
            0: 'DEPART_HOUR',
            1: 'DEPART_MIN',
        },
        6: 'TO_ICAO',
        7: {
            0: 'ARRIVE_YEAR',
            1: 'ARRIVE_MONTH',
            2: 'ARRIVE_DAY',
        },
        8: {
            0: 'ARRIVE_HOUR',
            1: 'ARRIVE_MIN',
        },
        9: 'PRICE',
        13: {
            'name': 'LAYOVER',
            'i': {
                0: 'MINUTES',
                1: 'DEPART_AIRPORT_ICAO',
                2: 'ARRIVE_AIRPORT_ICAO',
                4: 'DEPART_AIRPORT_NAME',
                5: 'DEPART_AIRPORT_CITY',
                6: 'ARRIVE_AIRPORT_NAME',
                7: 'ARRIVE_AIRPORT_CITY',
            },
        },
        22: ['EMISSIONS_DATA'],
    },
}

<fast_flights.schema.Trip at 0x7f67abc3b400>