### Merge the daily dataframes into a single dataframe

In [1]:
# Imports
import pandas as pd
import pickle

In [2]:
# Scraping period covered by this merge; both bounds are included in the range.
start_date = '2022-10-31'
end_date = '2022-11-22'

# Build one "YYYY-MM-DD" string per calendar day of the period.
dates = [day.strftime("%Y-%m-%d") for day in pd.date_range(start=start_date, end=end_date)]
print(f"{len(dates)} dates included ({start_date} to {end_date}).")

23 dates included (2022-10-31 to 2022-11-22).


In [3]:
# Loop through the dates and read the daily pickle files. Each pickle file contains a list of
# dictionaries; each dictionary describes one harbor combination (origin/destination) and holds
# its routes under the "voyages" key. Every route becomes one flat dictionary in processed_list.
# NOTE(security): pickle.load can execute arbitrary code while loading. Only read files that were
# produced by our own scraper.
processed_list = []
missing = 0
for date in dates:
    # Read the pickle for this date.
    try:
        with open(f'../pickles/routescanner_daily_v2/connections_{date}.pickle', 'rb') as f:
            results_list = pickle.load(f)
    except (OSError, pickle.UnpicklingError, EOFError):
        # The file is absent or unreadable: count it and skip the date. Without the `continue`
        # the code below would run on the previous date's list (or on an undefined name for the
        # very first date) instead of skipping the missing day.
        missing += 1
        print(f"Data for {date} missing!")
        continue

    for result in results_list:
        # Drop the "hash" and "requestId" keys, they are not needed. A new dictionary is built so
        # the loaded data is not mutated and a record lacking one of the keys does not fail.
        result = {key: value for key, value in result.items() if key not in ("hash", "requestId")}

        # Flatten the nested "origin" and "destination" dictionaries into prefixed top-level keys
        # (e.g. "origin_name"), skipping their "type" entry. The flattened keys are placed at the
        # beginning of the dictionary; on a name clash the existing top-level value wins.
        for dict_name in ["origin", "destination"]:
            nested = result.pop(dict_name)
            flattened = {f"{dict_name}_{key}": value for key, value in nested.items() if key != "type"}
            result = flattened | result

        # What remains after removing "voyages" is identical for all routes of this combination.
        routes_details_list = result.pop("voyages")

        # Create one dictionary per route: shared details, then the number of legs, then the
        # route-specific details (which win on a name clash), and finally the scraping date.
        for route in routes_details_list:
            new_dict = result | {"NumberOfLegs": len(route["legs"])} | route
            new_dict["ScrapingDate"] = date
            processed_list.append(new_dict)

In [4]:
# Turn the flattened route dictionaries into a dataframe, one row per route.
df = pd.DataFrame(processed_list)

# Put the second-to-last column first, followed by all columns before it.
# NOTE: the selection is positional and does not include the final column, so that column is
# dropped from the dataframe here.
leading_column = df.columns[-2]
preceding_columns = df.columns[:-2].tolist()
df = df[[leading_column] + preceding_columns]

print(f"Total number of routes: {len(df)}, spread over {len(dates)-missing} days.")

# Let pandas pick the best-fitting dtype for every column.
df = df.convert_dtypes()

# Rows that only differ in the columns listed below are treated as duplicates.
rows_before = len(df)
ignored_columns = ("ScrapingDate", "legs", "transferCo2EmissionsInKg", "hash", "requestId")
columns_to_check = [col for col in df.columns if col not in ignored_columns]

# Dictionaries are replaced by their string representation in every compared column that
# contains at least one dictionary.
for col in columns_to_check:
    contains_dict = any(isinstance(value, dict) for value in df[col])
    if contains_dict:
        df[col] = df[col].apply(lambda value: str(value) if isinstance(value, dict) else value)

# Keep the first occurrence of each group of duplicate rows.
df = df.drop_duplicates(subset=columns_to_check, keep='first')

# Report how many rows were removed.
print(f"{rows_before - len(df)} identical rows dropped, {1-(len(df)/rows_before):.2%} of the total number of rows, with {len(df)} remaining.")

Total number of routes: 22349, spread over 23 days.
7038 identical rows dropped, 31.49% of the total number of rows, with 15311 remaining.


In [5]:
# Preview the first five rows of the deduplicated dataframe.
df.head(n=5)

Unnamed: 0,ScrapingDate,destination_name,destination_location,destination_locode,origin_name,origin_location,origin_locode,totalResults,NumberOfLegs,totalTravelTimeInMinutes,totalDistanceInMeters,totalCo2EmissionsInKg,voyageStart,voyageEnd,transferCo2EmissionsInKg,legs
0,2022-10-31,Amsterdam,"{'lat': 52.37403, 'lng': 4.88969}",NLAMS,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,3,3,44912,12894138,1150,2022-11-03T22:30:00-03:00,2022-12-05T07:02:14+01:00,"[15, 15]",[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...
1,2022-10-31,Amsterdam,"{'lat': 52.37403, 'lng': 4.88969}",NLAMS,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,3,3,30676,13132075,1860,2022-11-06T13:32:30-05:00,2022-11-28T02:49:28+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o..."
2,2022-10-31,Amsterdam,"{'lat': 52.37403, 'lng': 4.88969}",NLAMS,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,3,3,28258,12962474,1885,2022-11-06T13:32:30-05:00,2022-11-26T10:30:48+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o..."
3,2022-10-31,Port of Antwerp,"{'lat': 51.249596, 'lng': 4.407942}",BEANR,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,2,2,39330,12629105,1045,2022-11-03T22:30:00-03:00,2022-12-01T10:00:00+01:00,[15],[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...
4,2022-10-31,Port of Antwerp,"{'lat': 51.249596, 'lng': 4.407942}",BEANR,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,2,2,27987,12804509,1750,2022-11-06T13:32:30-05:00,2022-11-26T06:00:00+01:00,[15],"[{'origin': {'type': 'locode', 'name': 'Port o..."


In [6]:
# Write the combined dataframe to disk twice: once as a pickle and once as a CSV file.
filename = "routescanner_v2_connections_combined"
pickle_path = f"../pickles/{filename}.pickle"
csv_path = f"../data/{filename}.csv"
df.to_pickle(pickle_path)
df.to_csv(csv_path)

In [7]:
# Read the saved pickle back from disk and preview its first five rows.
pd.read_pickle("../pickles/routescanner_v2_connections_combined.pickle").head(n=5)

Unnamed: 0,ScrapingDate,destination_name,destination_location,destination_locode,origin_name,origin_location,origin_locode,totalResults,NumberOfLegs,totalTravelTimeInMinutes,totalDistanceInMeters,totalCo2EmissionsInKg,voyageStart,voyageEnd,transferCo2EmissionsInKg,legs
0,2022-10-31,Amsterdam,"{'lat': 52.37403, 'lng': 4.88969}",NLAMS,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,3,3,44912,12894138,1150,2022-11-03T22:30:00-03:00,2022-12-05T07:02:14+01:00,"[15, 15]",[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...
1,2022-10-31,Amsterdam,"{'lat': 52.37403, 'lng': 4.88969}",NLAMS,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,3,3,30676,13132075,1860,2022-11-06T13:32:30-05:00,2022-11-28T02:49:28+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o..."
2,2022-10-31,Amsterdam,"{'lat': 52.37403, 'lng': 4.88969}",NLAMS,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,3,3,28258,12962474,1885,2022-11-06T13:32:30-05:00,2022-11-26T10:30:48+01:00,"[15, 15]","[{'origin': {'type': 'locode', 'name': 'Port o..."
3,2022-10-31,Port of Antwerp,"{'lat': 51.249596, 'lng': 4.407942}",BEANR,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,2,2,39330,12629105,1045,2022-11-03T22:30:00-03:00,2022-12-01T10:00:00+01:00,[15],[{'origin': {'uuid': 'a9d25a4a-a784-417b-8196-...
4,2022-10-31,Port of Antwerp,"{'lat': 51.249596, 'lng': 4.407942}",BEANR,Port of Arica,"{'lat': -18.474563, 'lng': -70.32761}",CLARI,2,2,27987,12804509,1750,2022-11-06T13:32:30-05:00,2022-11-26T06:00:00+01:00,[15],"[{'origin': {'type': 'locode', 'name': 'Port o..."
