### Merge daily dataframes to one single dataframe

In [1]:
# Imports
import pandas as pd
import pickle

In [2]:
# Define the date range
start_date = '2022-10-31'
end_date = '2023-03-12'

# Create a list of dates in YYYY-MM-DD format
dates = pd.date_range(start_date, end_date).strftime("%Y-%m-%d").tolist()
print(f"{len(dates)} dates included ({start_date} to {end_date}).")

133 dates included (2022-10-31 to 2023-03-12).


In [3]:
# Loop through the dates and read the pickle files. Each pickle file contains a list of dictionaries, each dictionary contains the details of one route.
processed_list = []
missing = 0
for date in dates:
    # Read the pickle for each date. It contains a list of dictionaries, each dictionary contains the details of one harbor combination.
    try:
        with open(f'../pickles/msc_daily_v2/connections_{date}.pickle', 'rb') as f:
            results_list = pickle.load(f)
    except:
        missing += 1
        print(f"Data for {date} missing!")

    # Unpack the list of dictionaries. Each dictionary contains the details of one route. The "Key" key contains another dictionary with the route details. Flatten it into the main dictionary. Put the new items at the beginning of the dictionary.
    for result in results_list:
        # Flatten the information in the "Keys" dictionary.
        result = (result['Key'] | result)
        result.pop('Key')

        # Create a dictionary with the details that's identical for all routes in the list.
        non_route_specific_results = result.copy()
        non_route_specific_results.pop("Routes")

        # Loop through the list of routes, create a new dictionary for each route and append it to the list.
        routes_details_list = result['Routes']
        for route in routes_details_list:
            # Merge the non-route-specific details with the route-specific details.
            new_dict = non_route_specific_results.copy()
            new_dict["NumberOfLegs"] = len(route["RouteScheduleLegDetails"])
            new_dict = (new_dict | route)
            new_dict["ScrapingDate"] = date
            processed_list.append(new_dict)

Data for 2022-11-10 missing!
Data for 2022-11-23 missing!
Data for 2022-11-24 missing!
Data for 2022-11-25 missing!
Data for 2022-11-26 missing!
Data for 2022-11-27 missing!
Data for 2022-11-28 missing!
Data for 2022-12-14 missing!
Data for 2023-02-07 missing!
Data for 2023-03-09 missing!


In [4]:
# Create a dataframe from the list of dictionaries
df = pd.DataFrame(processed_list)

# Move the second last and third last columns to the beginning of the dataframe
cols = df.columns.tolist()
cols = [cols[-1]] + cols[-3:-1] + cols[:-3]
df = df[cols]

print(f"Total number of routes: {len(df)}, spread over {len(dates)-missing} days.")

# Assign the optimal data types to the columns.
df = df.convert_dtypes()

# Drop rows which are identical, except for the ScrapingDate column.
rows_before = len(df)
columns_to_check = [col for col in df.columns.to_list() if col not in ["ScrapingDate", "CutOffs", "RouteScheduleLegDetails"]]

df = df.drop_duplicates(subset=columns_to_check, keep='first')

# Print the number of identical rows dropped.
print(f"{rows_before - len(df)} identical rows dropped, {1-(len(df)/rows_before):.2%} of the total number of rows.")

Total number of routes: 126918, spread over 123 days.
54296 identical rows dropped, 42.78% of the total number of rows.


In [5]:
# Save the dataframe to a pickle file.
filename = "msc_v2_connections_combined"
df.to_pickle(f"../pickles/{filename}.pickle")
df.to_csv(f"../data/{filename}.csv")