In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors


Note: make sure to rename the columns by replacing the accented `é` with a plain `e`.


# Exploratory Data Analysis (EDA) for Packages and Receptacle Datasets

In [None]:
# Load datasets
# Both exports are read as semicolon-delimited text decoded as latin-1.
# NOTE(review): the receptacle header is renamed later from 'ï»¿RECPTCL_FID';
# that prefix looks like a UTF-8 byte-order mark decoded as latin-1, so the
# receptacle file may really be UTF-8 with BOM — confirm before changing it.
packages_df = pd.read_csv('../data/raw/packages_data_2023_2025.csv',delimiter=';', encoding='latin-1')
receptacles_df = pd.read_csv('../data/raw/receptacle_data_2023_2025.csv',delimiter=';', encoding='latin-1')

* Columns' names adjustment

In [None]:
# Harmonise column names across both frames: drop the accents in the
# packages headers, remove the stray 'ï»¿' prefix on the first receptacle
# header and fix the two receptacle names that lack an underscore.
package_column_fixes = {
    'établissement_postal': 'etablissement_postal',
    'next_établissement_postal': 'next_etablissement_postal',
}
receptacle_column_fixes = {
    'ï»¿RECPTCL_FID': 'RECPTCL_FID',
    'EVENT_TYPECD': 'EVENT_TYPE_CD',
    'nextetablissement_postal': 'next_etablissement_postal',
}
packages_df = packages_df.rename(columns=package_column_fixes)
receptacles_df = receptacles_df.rename(columns=receptacle_column_fixes)

In [None]:
# (rows, columns) of each frame after the column rename
packages_df.shape, receptacles_df.shape

* Columns' types adjustment

In [None]:
# Parse the timestamp column of both frames.
# NOTE(review): no explicit format is given, so pandas infers it — confirm
# the day/month order of the raw files.
for frame in (packages_df, receptacles_df):
    frame['date'] = pd.to_datetime(frame['date'])

# Trim surrounding whitespace from the identifier and location columns.
for text_col in ('RECPTCL_FID', 'MAILITM_FID', 'etablissement_postal', 'next_etablissement_postal'):
    packages_df[text_col] = packages_df[text_col].str.strip()

for text_col in ('etablissement_postal', 'next_etablissement_postal', 'RECPTCL_FID'):
    receptacles_df[text_col] = receptacles_df[text_col].str.strip()

In [None]:
# Column dtypes and non-null counts of the packages frame
packages_df.info()


In [None]:
# Column dtypes and non-null counts of the receptacle frame
receptacles_df.info()


In [None]:
# First five package rows
packages_df.head(n=5)

In [None]:
# First five receptacle rows
receptacles_df.head(n=5)

 Initial Observations
- both datasets cover the period from 2023 to 2025
- we have no target variable in either dataset
- for packages dataset:
    - 6 features in total with 5 categorical and 1 numerical
    - MAILITM_FID is unique identifier for each package
    - RECPTCL_FID is foreign key linking to receptacle dataset
    - etablissement_postal and next_etablissement_postal have some null values
- for receptacle dataset:
    - 5 features in total with 4 categorical and 1 numerical
    - RECPTCL_FID is unique identifier for each receptacle
    - EVENT_TYPE_CD has some null values

In [None]:
# Per-column cardinality and missing-value report for the packages frame
for column in packages_df.columns:
    column_values = packages_df[column]
    distinct_count = column_values.nunique()
    missing_count = column_values.isnull().sum()
    print(f'{column} has {distinct_count} unique values.')
    print(f'{column} has {missing_count} null values.')
    print("\n")


In [None]:
# Per-column cardinality and missing-value report for the receptacle frame
for column in receptacles_df.columns:
    column_values = receptacles_df[column]
    distinct_count = column_values.nunique()
    missing_count = column_values.isnull().sum()
    print(f'{column} has {distinct_count} unique values.')
    print(f'{column} has {missing_count} null values.')
    print("\n")

we notice the following:<br>
- receptacle dataset has more unique values for RECPTCL_FID than packages dataset, indicating one-to-many relationship<br>
- MAILITM_FID is unique in packages dataset.<br>
- packages dataset has more unique date values than receptacle dataset.<br>
- both datasets have null values in etablissement_postal and next_etablissement_postal columns. This requires processing later on<br>
- packages dataset has more unique values in the next_etablissement_postal column compared to receptacle dataset but also more null values. **further investigation is needed to understand why**<br>


In [None]:
# Summary statistics of the packages frame
packages_df.describe()

In [None]:
# Summary statistics of the receptacle frame
receptacles_df.describe()

for EVENT_TYPE_CD we notice different range of values for packages and receptacle datasets indicating different types of events.<br>

#### Visualizations
for now we will visualize the distribution of EVENT_TYPE_CD in both datasets.<br>



In [None]:
# Row count per EVENT_TYPE_CD in the packages dataset
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=packages_df, x='EVENT_TYPE_CD', ax=ax)
ax.set_title('distribution of EVENT_TYPE_CD in packages dataset')
ax.set_xlabel('EVENT_TYPE_CD')
ax.set_ylabel('count')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Row count per EVENT_TYPE_CD in the receptacle dataset
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=receptacles_df, x='EVENT_TYPE_CD', ax=ax)
ax.set_title('distribution of EVENT_TYPE_CD in Receptacle dataset')
ax.set_xlabel('EVENT_TYPE_CD')
ax.set_ylabel('count')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

for etablissement_postal and next_etablissement_postal I will start with visualizing the receptacle dataset since the packages dataset has a lot of unique values<br>

In [None]:
# Row count per current establishment in the receptacle dataset
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=receptacles_df, x='etablissement_postal', ax=ax)
ax.set_title('distribution of etablissement_postal in Receptacle dataset')
ax.set_xlabel('etablissement_postal')
ax.set_ylabel('count')
ax.tick_params(axis='x', labelrotation=90)
plt.show()


In [None]:
# Row count per next establishment in the receptacle dataset
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=receptacles_df, x='next_etablissement_postal', ax=ax)
ax.set_title('distribution of next_etablissement_postal in Receptacle dataset')
ax.set_xlabel('next_etablissement_postal')
ax.set_ylabel('count')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# 1. The 10 most frequent current and next locations (keeps the matrix small)
top_locs = receptacles_df['etablissement_postal'].value_counts().index[:10]
top_next = receptacles_df['next_etablissement_postal'].value_counts().index[:10]

# 2. Rows whose current AND next location are both in those top lists
is_top_route = (
    receptacles_df['etablissement_postal'].isin(top_locs)
    & receptacles_df['next_etablissement_postal'].isin(top_next)
)
filtered_df = receptacles_df[is_top_route]

# 3. Current-location x next-location count matrix
matrix = pd.crosstab(
    index=filtered_df['etablissement_postal'],
    columns=filtered_df['next_etablissement_postal'],
)

# 4. Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(matrix, cmap='Reds', annot=True, fmt='d', linewidths=.5, ax=ax)
ax.set_title("Most Frequent Routes (Origin vs Destination) - Receptacle Dataset")
ax.set_xlabel("Next Destination")
ax.set_ylabel("Current Location")
plt.show()

we notice that some etablissements have significantly higher traffic compared to others, indicating  major distribution centers.<br>

for etablissement_postal and next_etablissement_postal we will create a heatmap to visualize the flow between current location and next destination.<br>
Count of parcels moving from A to B to see the density of connections between them

In [None]:
# 1. The 10 most frequent current and next locations (keeps the matrix small)
top_locs = packages_df['etablissement_postal'].value_counts().index[:10]
top_next = packages_df['next_etablissement_postal'].value_counts().index[:10]

# 2. Rows whose current AND next location are both in those top lists
is_top_route = (
    packages_df['etablissement_postal'].isin(top_locs)
    & packages_df['next_etablissement_postal'].isin(top_next)
)
filtered_df = packages_df[is_top_route]

# 3. Current-location x next-location count matrix
matrix = pd.crosstab(
    index=filtered_df['etablissement_postal'],
    columns=filtered_df['next_etablissement_postal'],
)

# 4. Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(matrix, cmap='Reds', annot=True, fmt='d', linewidths=.5, ax=ax)
ax.set_title("Most Frequent Routes (Origin vs Destination) - Packages Dataset")
ax.set_xlabel("Next Destination")
ax.set_ylabel("Current Location")
plt.show()

In [None]:
# Number of package rows per current establishment, as a two-column frame
location_counts = (
    packages_df['etablissement_postal']
    .value_counts()
    .rename_axis('Location')
    .reset_index(name='Volume')
)

# Horizontal bars for the 20 busiest centers only
busiest_centers = location_counts.head(20)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Volume', y='Location', data=busiest_centers, palette='viridis', ax=ax)
ax.set_title("Top 20 Busiest Postal Centers")
ax.set_xlabel("Number of Packages")
ax.set_ylabel("Center ID")
plt.show()

we notice the same pattern as before, with some etablissements having significantly higher traffic compared to others.<br>

In [None]:

# 1. Derive the scan hour and weekday name from the timestamp
#    (both are stored as new columns on packages_df)
scan_time = packages_df['date'].dt
packages_df['hour'] = scan_time.hour
packages_df['day_of_week'] = scan_time.day_name()

# 2. Count scans per (weekday, hour); rows are ordered Sunday -> Saturday
days_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
heatmap_data = pd.crosstab(
    index=packages_df['day_of_week'],
    columns=packages_df['hour'],
).reindex(days_order)

# 3. Plot
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap='YlOrRd', linewidths=.5, annot=False, ax=ax)
ax.set_title("Package Scan Activity by Day and Hour")
ax.set_xlabel("Hour of Day (0-23)")
ax.set_ylabel("Day of Week")
plt.show()

we notice that the busiest times for package scans are during weekdays, particularly from mid-morning to late afternoon.<br>

In [None]:
# Number of distinct packages (MAILITM_FID) recorded for each receptacle
packages_per_receptacle = packages_df.groupby('RECPTCL_FID')['MAILITM_FID'].nunique()
packages_per_receptacle.head()


# Preprocessing

In [None]:
# Re-check dtypes and non-null counts before preprocessing
# (the EDA above added 'hour' and 'day_of_week' to packages_df)
packages_df.info()

In [None]:
# Re-check dtypes and non-null counts of the receptacle frame before preprocessing
receptacles_df.info()

- Drop the package records dated before 2023 (some go back to 2020) and keep only those from 2023 onwards

In [None]:
# Keep only package rows dated 2023 or later (older rows are discarded)
is_2023_or_later = packages_df['date'].dt.year.ge(2023)
packages_df = packages_df[is_2023_or_later]

In [None]:
# Number of package rows with no current establishment
packages_df['etablissement_postal'].isna().sum()

* `etablissement_postal` has 26772 null values (2.7% of the whole dataset)
* As its null values are less than 5% of the dataset (2.7%), we drop these null values

In [None]:
# Discard package rows without a current establishment, then confirm none remain
packages_df = packages_df.dropna(subset=['etablissement_postal'])
packages_df['etablissement_postal'].isna().sum()

* We propose to consider the packages having null `next_etablissement_postal`
as having issue during transfer, we'll try to validate that using
`EVENT_TYPE_CD` also
* Let's check if `EVENT_TYPE_CD` can indicate whether the `next_etablissement_postal` is null or not

In [None]:
# EVENT_TYPE_CD frequencies among rows whose next establishment is missing
missing_next = packages_df['next_etablissement_postal'].isna()
packages_unknown_next_etablissement = packages_df.loc[missing_next, 'EVENT_TYPE_CD'].value_counts()

# Bar chart of the 10 most frequent event codes among those rows
fig, ax = plt.subplots(figsize=(12, 6))
packages_unknown_next_etablissement.head(10).plot(kind='bar', ax=ax)
ax.set_xlabel('EVENT TYPE CD')
ax.set_ylabel('Null Next Etablissement Postal')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

* `EVENT_TYPE_CD` doesn't actually indicate null values of `next_etablissement_postal`

* the function `fill_NaN_next_etab` cell fills the `next_etablissement_postal` using the next `etablissement_postal` for the same package.
* if the last route for a specific package is null, then it keeps it null because there's no next `etablissement_postal` for that package.

In [None]:
def fill_NaN_next_etab(df, id_col):
    """Fill missing ``next_etablissement_postal`` from the item's own later scans.

    Rows are ordered by ``id_col`` then ``date``. Inside each item, a missing
    next establishment is replaced by the first *different*
    ``etablissement_postal`` that appears later for that same item. Values
    that are already present are kept. Rows with no later change of
    establishment for their item stay NaN.

    Parameters
    ----------
    df : DataFrame with columns ``id_col``, ``date``,
        ``etablissement_postal`` and ``next_etablissement_postal``.
    id_col : name of the column identifying the tracked item.

    Returns
    -------
    A new DataFrame sorted by ``[id_col, 'date']``; the input is not modified.
    """
    # sort_values returns a new frame, so the caller's frame is left untouched
    ordered = df.sort_values([id_col, 'date'])

    here = ordered['etablissement_postal']
    item = ordered[id_col]
    following_place = here.shift(-1)
    following_item = item.shift(-1)

    # A row closes a block when the very next row belongs to the same item
    # but was recorded at a different establishment.
    closes_block = here.ne(following_place) & item.eq(following_item)

    # Keep the upcoming establishment on block-closing rows only, then
    # back-fill within each item so earlier rows of the block inherit it.
    candidates = following_place.where(closes_block).groupby(item).bfill()

    # Only NaNs are replaced; existing values win.
    ordered['next_etablissement_postal'] = ordered['next_etablissement_postal'].fillna(candidates)

    return ordered

In [None]:
# Fill missing next_etablissement_postal per package (grouped by MAILITM_FID);
# the returned frame is sorted by MAILITM_FID then date.
packages_df = fill_NaN_next_etab(packages_df, 'MAILITM_FID')
# Check remaining NaNs
packages_df.info()

* Like this, we've handled a good part of the null values and inconsistencies for the `packages` dataset.

* **We'll be doing the same steps for `receptacle` dataset**

In [None]:
# Number of receptacle rows with no current establishment
receptacles_df['etablissement_postal'].isna().sum()

* Dropping rows having null `etablissement_postal`, as they're just 0.1% of the data

In [None]:
# Discard receptacle rows without a current establishment, then confirm none remain
receptacles_df = receptacles_df.dropna(subset=['etablissement_postal'])
receptacles_df['etablissement_postal'].isna().sum()

* apply the function that fills null values of `next_etablissement_postal` using `etablissement_postal` to `receptacles_df`

In [None]:
# Fill missing next_etablissement_postal per receptacle (grouped by RECPTCL_FID);
# the returned frame is sorted by RECPTCL_FID then date.
receptacles_df = fill_NaN_next_etab(receptacles_df, 'RECPTCL_FID')
# Check remaining NaNs
receptacles_df.info()

* Null values are mostly gone, but there are still some illogical packages' and receptacles' routes between `etablissements`
* We'll treat these illogical routes now

In [None]:
def isPackageIllogical(group):
    """Tell whether one package's rows contain a hop that does not chain.

    ``group`` holds the rows of a single package, in the order they should be
    read. For every row except the last, the announced
    ``next_etablissement_postal`` is compared with the
    ``etablissement_postal`` of the following row. The result is True as soon
    as one pair differs. A missing announced value compares as different, so
    it also makes the package illogical. A single-row group yields False.
    """
    announced_next = group['next_etablissement_postal'].iloc[:-1]
    observed_next = group['etablissement_postal'].shift(-1).iloc[:-1]
    hop_mismatch = announced_next.ne(observed_next)
    return hop_mismatch.any()

In [None]:
# Flag every package whose recorded hops do not chain together.
# Only the two columns the checker reads are handed to apply(): this avoids
# applying over the grouping column (deprecated in recent pandas) and passes
# less data per group. The resulting boolean Series is indexed by MAILITM_FID.
illogical_packages = (
    packages_df
    .groupby('MAILITM_FID')[['etablissement_postal', 'next_etablissement_postal']]
    .apply(isPackageIllogical)
)
illogical_packages.value_counts()

In [None]:
# Share of packages with at least one illogical hop. The numerator is taken
# from the flags computed above rather than a count copied from a past run,
# so it stays correct when the data or the preprocessing changes.
illogical_packages.sum() / packages_df['MAILITM_FID'].nunique()

* 103376 packages (98% of all packages) have illogical routes, so it's impossible to drop them; instead, we plan to ignore `MAILITM_FID` and `RECPTCL_FID` in the training and testing sets that will come next

In [None]:
def isReceptacleIllogical(group):
    """Tell whether one receptacle's rows contain a hop that does not chain.

    ``group`` holds the rows of a single receptacle, in the order they should
    be read. For every row except the last, the announced
    ``next_etablissement_postal`` is compared with the
    ``etablissement_postal`` of the following row. The result is True as soon
    as one pair differs. A missing announced value compares as different, so
    it also makes the receptacle illogical. A single-row group yields False.
    """
    announced_next = group['next_etablissement_postal'].iloc[:-1]
    observed_next = group['etablissement_postal'].shift(-1).iloc[:-1]
    hop_mismatch = announced_next.ne(observed_next)
    return hop_mismatch.any()

In [None]:
# Flag every receptacle whose recorded hops do not chain together.
# Only the two columns the checker reads are handed to apply(): this avoids
# applying over the grouping column (deprecated in recent pandas) and passes
# less data per group. The resulting boolean Series is indexed by RECPTCL_FID.
illogical_receptacles = (
    receptacles_df
    .groupby('RECPTCL_FID')[['etablissement_postal', 'next_etablissement_postal']]
    .apply(isReceptacleIllogical)
)
illogical_receptacles.value_counts()

In [None]:
# Share of receptacles with at least one illogical hop. The numerator is
# taken from the flags computed above rather than a count copied from a past
# run, so it stays correct when the data or the preprocessing changes.
illogical_receptacles.sum() / receptacles_df['RECPTCL_FID'].nunique()

205519 receptacles (95% of all receptacles) have illogical routes, so it's also impossible to drop them

In [None]:
# Keep independent copies of the preprocessed frames as a backup (just in case)
packages_df_copy = packages_df.copy()
receptacles_df_copy = receptacles_df.copy()

# Feature Extraction 

### Check RECPTCL_FID and MAILITM_FID having same length formats
if yes, then we can split them into meaningful parts

In [None]:
def report_fixed_length(values, expected_len, label):
    """Print whether every entry of ``values`` has exactly ``expected_len`` characters.

    Entries are measured on their string form (a missing value counts as the
    text 'nan'). If at least one entry has another length, the first such
    entry is printed with its length; otherwise 'all same length' is printed.

    Parameters
    ----------
    values : pandas Series of identifiers to check.
    expected_len : required number of characters.
    label : column name shown in the printed header.
    """
    print(f"\n=== {label}  ===")
    print(f"testing if the lengths of {label} values are all the same:")
    # Vectorised length check instead of a Python loop over every row
    lengths = values.astype(str).str.len()
    offenders = values[lengths != expected_len]
    if offenders.empty:
        print('all same length')
    else:
        first_offender = offenders.iloc[0]
        print(f"  {first_offender} (length: {len(str(first_offender))})")


# Receptacle IDs are expected to be 29 characters, mail item IDs 13 characters
report_fixed_length(packages_df['RECPTCL_FID'], 29, 'RECPTCL_FID')
report_fixed_length(packages_df['MAILITM_FID'], 13, 'MAILITM_FID')



### RECPTCL_FID Analysis
- **Format:** 29-character string (e.g., `USORDADZALGDAUN30050001900005`)
- **Data Quality:** No null values (1,000,000) | 215,867 unique values in receptacle dataset and 45306 unique values in packages dataset
- **Extractable Features:**
  - Origin Country (2 chars): US, FR, AE, etc.
  - Destination Country (2 chars): DZ, AI, AA, etc.

### MAILITM_FID Analysis
- **Format:** 13-character string according to the S10-12 pattern (e.g., `CA000132868US`, `CA000340856PK`)
- **Data Quality:** No null values (1,000,000 packages)
- **Extractable Features:**
  - Service Indicator (2 chars): CA, etc.
  - Serial Number (8 chars): 00013286, 00034085, etc.
  - Check Digit (1 char): 8, 6, etc.
  - Country Code (last 2 chars of the 13-character ID; the parser slices up to 3 chars and strips whitespace): US, PK.


## Definition of the parser functions
These functions are responsible for splitting the IDs into parts

In [None]:
def parse_recptcl_fid(id_str):
    """Split a receptacle ID into ``(origin_country, destination_country)``.

    The origin country is read from characters 0-1 and the destination
    country from characters 6-7 of ``id_str``.
    """
    return id_str[:2], id_str[6:8]

def parse_mailitm_fid(id_str):
    """Split a mail item ID into ``(service_indicator, serial_number, country_code)``.

    * service_indicator: characters 0-1
    * serial_number: characters 2-10 (nine characters, so for a 13-character
      ID it includes the digit that precedes the country code)
    * country_code: characters 11-13 with surrounding whitespace removed
    """
    head, middle, tail = id_str[:2], id_str[2:11], id_str[11:14]
    return head, middle, tail.strip()


### Apply parsing functions

In [None]:
# Parse every RECPTCL_FID once into (origin, destination) tuples; building a
# single DataFrame from that list is much faster than returning a pd.Series
# per row.
parsed_data = receptacles_df['RECPTCL_FID'].apply(parse_recptcl_fid).tolist()
parsed_columns = pd.DataFrame(parsed_data, index=receptacles_df.index)

# Attach the two parsed columns to a copy, leaving receptacles_df unchanged
parsed_receptacles_df = receptacles_df.copy()
parsed_receptacles_df[['origin_country', 'destination_country']] = parsed_columns

In [None]:
# Work on a copy so packages_df itself keeps its original columns.
# Each ID column is parsed once into a list of tuples and turned into a single
# DataFrame, which is much faster than returning a pd.Series per row.
parsed_packages_df = packages_df.copy()

# 1. MAILITM_FID -> service indicator, serial number, country code
mailitm_data = parsed_packages_df['MAILITM_FID'].apply(parse_mailitm_fid).tolist()
mailitm_columns = pd.DataFrame(mailitm_data, index=parsed_packages_df.index)
parsed_packages_df[['service_indicator', 'serial_number', 'country_code']] = mailitm_columns

# 2. RECPTCL_FID -> origin country, destination country
recptcl_data = parsed_packages_df['RECPTCL_FID'].apply(parse_recptcl_fid).tolist()
recptcl_columns = pd.DataFrame(recptcl_data, index=parsed_packages_df.index)
parsed_packages_df[['origin_country', 'destination_country']] = recptcl_columns

### show samples of new parsed data

In [None]:

# Header line, then the first rows of the parsed packages frame
print("=== packages_df sample with new parsed columns ===")
parsed_packages_df.head()



In [None]:
# Header line, then the first rows of the parsed receptacle frame
print("\n=== receptacles_df sample with new parsed columns ===")
parsed_receptacles_df.head()

# Analysis of the extracted features


## 1. parsed_packages_df

In [None]:

# Cardinality of each parsed column in parsed_packages_df
print("\n--- Unique Value Counts for parsed_packages_df ---")
print("\nFor receptacle FID parsing:")
for parsed_col in ('origin_country', 'destination_country'):
    print(f"Unique {parsed_col} values: {parsed_packages_df[parsed_col].nunique()}")

print("="*50)
print("\nFor mail item FID parsing:")
for parsed_col in ('service_indicator', 'country_code'):
    print(f"Unique {parsed_col} values: {parsed_packages_df[parsed_col].nunique()}")



## 2. parsed_receptacles_df 

In [None]:
# Cardinality of each parsed column in parsed_receptacles_df
print("\n--- Unique Value Counts for parsed_receptacles_df ---")
for parsed_col in ('origin_country', 'destination_country'):
    print(f"Unique {parsed_col} values: {parsed_receptacles_df[parsed_col].nunique()}")


## List values of the new columns obtained from receptacle FID parsing for both parsed dataframes

### 1. for parsed_packages_df

In [None]:
# Distinct values of the columns parsed from RECPTCL_FID (packages frame)
for parsed_col in ('origin_country', 'destination_country'):
    print(f"\n--- Values of {parsed_col} ---")
    print(parsed_packages_df[parsed_col].unique())


### 2. for parsed_receptacles_df

In [None]:
# Distinct values of the columns parsed from RECPTCL_FID (receptacle frame)
for parsed_col in ('origin_country', 'destination_country'):
    print(f"\n--- Values of {parsed_col} ---")
    print(parsed_receptacles_df[parsed_col].unique())


### Do the intersection of origin_country of both dataframes

In [None]:
# Origin countries seen in each parsed frame, as sets
packages_origin_countries = set(parsed_packages_df['origin_country'].unique())
receptacle_origin_countries = set(parsed_receptacles_df['origin_country'].unique())

# Countries present in both frames
common_origin_countries = packages_origin_countries & receptacle_origin_countries
print("number of common origin_country values in both parsed datasets:", len(common_origin_countries))
print("\nCommon origin_country values in both paesed datasets: ")
print(common_origin_countries)

# Countries present in only one of the two frames
remaining_in_packages = packages_origin_countries - common_origin_countries
remaining_in_receptacle = receptacle_origin_countries - common_origin_countries
print("Remaining origin_country values only in parsed_packages_df:")
print(remaining_in_packages)
print("Remaining origin_country values only in parsed_receptacles_df:")
print(remaining_in_receptacle)

### list the values of both service indicators and country code

### 1. service indicators

In [None]:
# Distinct raw service indicators (before any case normalisation)
print("\n--- Values of service_indicator ---")
print(parsed_packages_df['service_indicator'].unique())



 We can see that some values don't follow the S10-12 standard format, so we need to handle them correctly.

In [None]:
# Transform service_indicator to uppercase for consistency
# (values differing only by letter case are merged)
parsed_packages_df['service_indicator'] = parsed_packages_df['service_indicator'].str.upper()
print("values of service_indicator after transformation to uppercase:")
print(parsed_packages_df['service_indicator'].unique())
print("number of unique service indicators after transformation:", parsed_packages_df['service_indicator'].nunique())


### 2. country code

In [None]:
# Distinct country codes parsed from the end of MAILITM_FID
print("\n--- Values of country codes ---")

print(parsed_packages_df['country_code'].unique())

We can see that many country code values are numbers instead of ISO 3166-1 codes. These values should be replaced by the origin country obtained from the receptacle ID during preprocessing.

### replace them with the correct origin country code

In [None]:
# 1. Normalise the parsed country code to upper case
parsed_packages_df['country_code'] = parsed_packages_df['country_code'].str.upper()

# 2. Rows where the receptacle origin and the item country code disagree
mismatch_mask = parsed_packages_df['origin_country'].ne(parsed_packages_df['country_code'])

# 3. Number of disagreeing rows (True counts as 1)
count = mismatch_mask.sum()

print(f"Number of rows where origin_country does not match country_code: {count}")

### replace them

In [None]:
# Use .loc to find rows where they don't match, and update only the 'country_code' column
# with the row's origin_country.
# NOTE(review): every mismatching row is overwritten, not only the numeric
# codes, so afterwards country_code matches origin_country on every row —
# confirm this is intended, since the column then carries no extra information.
parsed_packages_df.loc[parsed_packages_df['origin_country'] != parsed_packages_df['country_code'], 'country_code'] = parsed_packages_df['origin_country']

In [None]:
# Print the distinct country codes again, plus their count, after the correction
print("\n--- Values of country codes after correction ---")
print(parsed_packages_df['country_code'].unique())
print("number of unique country codes after correction:", parsed_packages_df['country_code'].nunique())

## visualization of Origin Country distribution according to number of packages

### 1. for the parsed_packages_df

In [None]:
# 20 most frequent origin countries in the packages frame, largest bar on top
origin_country_counts = parsed_packages_df['origin_country'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(origin_country_counts.index, origin_country_counts.values, color='steelblue')
ax.set_xlabel('Count', fontsize=11)
ax.set_ylabel('Origin Country', fontsize=11)
ax.set_title('Top 20 Origin Countries by packages count', fontsize=12, fontweight='bold')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

### 2. for the parsed_receptacles_df

In [None]:
# 20 most frequent origin countries in the receptacle frame, largest bar on top
origin_country_counts = parsed_receptacles_df['origin_country'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(origin_country_counts.index, origin_country_counts.values, color='steelblue')
ax.set_xlabel('Count', fontsize=11)
ax.set_ylabel('Origin Country', fontsize=11)
ax.set_title('Top 20 Origin Countries by receptacle count', fontsize=12, fontweight='bold')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## Visualization of the service indicator

In [None]:
# 20 most frequent service indicators in the packages frame, largest bar on top
service_indicator_count = parsed_packages_df['service_indicator'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(service_indicator_count.index, service_indicator_count.values, color='mediumseagreen')
ax.set_xlabel('Count', fontsize=11)
ax.set_ylabel('Service Indicator', fontsize=11)
ax.set_title('Top 20 Service Indicators by packages count', fontsize=12, fontweight='bold')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

# Origin–Destination Flow Analysis

This section investigates the flow of receptacles and packages from origin countries to destination. We examine:
- packages count by origin country
- Top origin countries delivering to each destination
- Visual representation 

### 1. parsed_packages_df

In [None]:
# Package rows per origin country (most frequent first)
origin_country_volume = parsed_packages_df['origin_country'].value_counts()
print("\n--- Packages count by Origin Country ---")
print(origin_country_volume.head(15))

# Full origin x destination contingency table
flow_matrix = pd.crosstab(
    index=parsed_packages_df['origin_country'],
    columns=parsed_packages_df['destination_country'],
)

# Restrict to the 10 busiest origins and the 10 busiest destinations for readability
top_origins = origin_country_volume.head(10).index
top_arrivals = parsed_packages_df['destination_country'].value_counts().head(10).index
flow_matrix_top = flow_matrix.loc[top_origins, top_arrivals]

print("\n--- Origin Country × destination Country (Top 10 × Top 10) ---")
print(flow_matrix_top)


In [None]:
# Visualize origin × destination country flow.
# The matrix columns are destination countries, so the x axis is labelled
# accordingly (it previously read 'Arrival Hub').
plt.figure(figsize=(14, 8))
sns.heatmap(flow_matrix_top, cmap='YlOrRd', annot=True, fmt='d', 
            cbar_kws={'label': 'packages Count'}, linewidths=0.5)
plt.title('packages Flow: Origin Country × destination country (Top 10 × Top 10)', 
          fontsize=14, fontweight='bold')
plt.xlabel('Destination Country', fontsize=12)
plt.ylabel('Origin Country', fontsize=12)
plt.tight_layout()
plt.show()


### 2. parsed_receptacles_df

In [None]:
# Receptacle rows per origin country (most frequent first)
origin_country_receptacle_volume = parsed_receptacles_df['origin_country'].value_counts()
print("\n--- Receptacle count by Origin Country ---")
print(origin_country_receptacle_volume.head(15))

# Receptacle rows per destination country (most frequent first)
destination__receptacle_volume = parsed_receptacles_df['destination_country'].value_counts()
print("\n--- Receptacle count by destination ---")
print(destination__receptacle_volume.head(15))

# Full origin x destination contingency table
receptacle_flow_matrix = pd.crosstab(
    index=parsed_receptacles_df['origin_country'],
    columns=parsed_receptacles_df['destination_country'],
)

# Restrict to the 10 busiest origins and the 10 busiest destinations for readability
receptacle_top_origins = origin_country_receptacle_volume.head(10).index
receptacle_top_arrivals = destination__receptacle_volume.head(10).index
receptacle_flow_matrix_top = receptacle_flow_matrix.loc[receptacle_top_origins, receptacle_top_arrivals]

print("\n--- Origin Country × destination Country (Top 10 × Top 10) ---")
print(receptacle_flow_matrix_top)


In [None]:
# Visualize origin × destination country flow.
# The matrix columns are destination countries, so the x axis is labelled
# accordingly (it previously read 'Arrival Hub').
plt.figure(figsize=(14, 8))
sns.heatmap(receptacle_flow_matrix_top, cmap='YlOrRd', annot=True, fmt='d', 
            cbar_kws={'label': 'receptacles Count'}, linewidths=0.5)
plt.title('receptacles Flow: Origin Country × destination country (Top 10 × Top 10)', 
          fontsize=14, fontweight='bold')
plt.xlabel('Destination Country', fontsize=12)
plt.ylabel('Origin Country', fontsize=12)
plt.tight_layout()
plt.show()


### create pairs (origin, destination) for more detailed analysis

In [None]:
# Add an "<origin>_<destination>" pair column to both parsed frames
for parsed_frame in (parsed_packages_df, parsed_receptacles_df):
    parsed_frame['origin_destination'] = (
        parsed_frame['origin_country'] + '_' + parsed_frame['destination_country']
    )


listing the obtained values 

In [None]:
# Count and list the distinct origin_destination pairs of each parsed frame
print('(origin_destination) pairs obtained for ')
for frame_label, parsed_frame in (
    ('parsed_packages_df', parsed_packages_df),
    ('parsed_receptacles_df', parsed_receptacles_df),
):
    pair_values = parsed_frame['origin_destination']
    print(f"\nfor {frame_label} :")
    print("\nNumber of unique values", pair_values.nunique())
    print("\nValues : ")
    print(pair_values.unique())


### visualization of obtained results 

### 1. parsed_packages_df

In [None]:
# Package rows per origin_destination pair; plot the 15 most frequent
origin_dest_counts = parsed_packages_df['origin_destination'].value_counts()
top_origin_dest_counts = origin_dest_counts.head(15)

fig, ax = plt.subplots(figsize=(14, 7))
top_origin_dest_counts.plot(kind='bar', ax=ax)
ax.set_title('Histogram of (origin, destination) Pairs Based on Package Counts')
ax.set_xlabel('Origin_Destination')
ax.set_ylabel('Number of Packages')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()



### 2. parsed_receptacles_df

In [None]:
# Receptacle rows per origin_destination pair; plot the 15 most frequent
origin_dest_counts = parsed_receptacles_df['origin_destination'].value_counts()
top_origin_dest_counts = origin_dest_counts.head(15)

fig, ax = plt.subplots(figsize=(14, 7))
top_origin_dest_counts.plot(kind='bar', ax=ax)
ax.set_title('Histogram of (origin, destination) Pairs Based on receptacle Counts')
ax.set_xlabel('Origin_Destination')
ax.set_ylabel('Number of receptacles')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()


## origin_destination X etablissements analysis 

## 1. current etablissement 

### a. parsed_packages_df

In [None]:
# parsed_packages_df: bar chart of the most common
# (origin_destination, etablissement_postal) pairs

# Row count per pair
pair_counts_current = (
    parsed_packages_df
    .groupby(['origin_destination', 'etablissement_postal'])
    .size()
    .reset_index(name='count')
)

# Keep the top_n most frequent pairs
top_n = 20
top_pairs_current = pair_counts_current.sort_values('count', ascending=False).head(top_n)

# One "<origin_destination> | <etablissement>" label per bar
pair_labels = top_pairs_current.apply(
    lambda row: f"{row['origin_destination']} | {row['etablissement_postal']}", axis=1
)

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=top_pairs_current, x='count', y=pair_labels, palette='viridis', ax=ax)
ax.set_title('Top (Origin-Destination, Current Etablissement) Pairs by Package Counts')
ax.set_xlabel('Number of Packages')
ax.set_ylabel('(Origin_Destination | Current Etablissement)')
fig.tight_layout()
plt.show()


### b. parsed_receptacles_df

In [None]:
# parsed_receptacles_df: bar chart of the most common
# (origin_destination, etablissement_postal) pairs

# Row count per pair
pair_counts_current_receptacle = (
    parsed_receptacles_df
    .groupby(['origin_destination', 'etablissement_postal'])
    .size()
    .reset_index(name='count')
)

# Keep the top_n most frequent pairs
top_n = 20
top_pairs_current_receptacle = (
    pair_counts_current_receptacle.sort_values('count', ascending=False).head(top_n)
)

# One "<origin_destination> | <etablissement>" label per bar
pair_labels = top_pairs_current_receptacle.apply(
    lambda row: f"{row['origin_destination']} | {row['etablissement_postal']}", axis=1
)

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=top_pairs_current_receptacle, x='count', y=pair_labels, palette='viridis', ax=ax)
ax.set_title('Top (Origin-Destination, Current Etablissement) Pairs by Receptacle Counts')
ax.set_xlabel('Number of Receptacles')
ax.set_ylabel('(Origin_Destination | Current Etablissement)')
fig.tight_layout()
plt.show()


We can see that ETAB0002 dominates, and we observe that this happens when the destination is DZ.
We'll try to confirm that by taking only the destination into consideration.


In [None]:
# parsed_receptacles_df: bar chart of the most common
# (destination_country, etablissement_postal) pairs

# Row count per pair
pair_counts_dest_receptacle = (
    parsed_receptacles_df
    .groupby(['destination_country', 'etablissement_postal'])
    .size()
    .reset_index(name='count')
)

# Keep the top_n most frequent pairs
top_n = 20
top_pairs_dest_receptacle = (
    pair_counts_dest_receptacle.sort_values('count', ascending=False).head(top_n)
)

# One "<destination_country> | <etablissement>" label per bar
pair_labels = top_pairs_dest_receptacle.apply(
    lambda row: f"{row['destination_country']} | {row['etablissement_postal']}", axis=1
)

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=top_pairs_dest_receptacle, x='count', y=pair_labels, palette='viridis', ax=ax)
ax.set_title('Top (Destination Country, Current Etablissement) Pairs by Receptacle Counts')
ax.set_xlabel('Number of Receptacles')
ax.set_ylabel('(Destination Country | Current Etablissement)')
fig.tight_layout()
plt.show()


### This is to test the origin 

In [None]:
# For parsed_receptacles_df: visualize the histogram of counts by (origin_country, etablissement_postal) pairs

# First, select top N most common (origin_country, etablissement_postal) pairs for clarity
pair_counts_origin_receptacle = parsed_receptacles_df.groupby(['origin_country', 'etablissement_postal']).size().reset_index(name='count')

# Sort and keep only top N pairs (by count)
top_n = 20
top_pairs_origin_receptacle = pair_counts_origin_receptacle.sort_values('count', ascending=False).head(top_n)

plt.figure(figsize=(16, 8))
sns.barplot(
    data=top_pairs_origin_receptacle,
    x='count',
    y=top_pairs_origin_receptacle.apply(lambda x: f"{x['origin_country']} | {x['etablissement_postal']}", axis=1),
    palette='viridis'
)
plt.title('Top (Origin Country, Current Etablissement) Pairs by Receptacle Counts')
plt.xlabel('Number of Receptacles')
plt.ylabel('(Origin Country | Current Etablissement)')
plt.tight_layout()
plt.show()


Most of the origin countries paired with ETAB0002 are European countries, in addition to AE and China (CN).

## 2. Next etablissement

### a. parsed_packages_df

In [None]:
# For parsed_packages_df: visualize the histogram of counts by (origin_destination, next_etablissement_postal)

# First, select top N most common (origin_destination, next_etablissement_postal) pairs for clarity
pair_counts = parsed_packages_df.groupby(['origin_destination', 'next_etablissement_postal']).size().reset_index(name='count')

# Sort and keep only top N pairs (by count)
top_n = 20
top_pairs = pair_counts.sort_values('count', ascending=False).head(top_n)

plt.figure(figsize=(16, 8))
sns.barplot(
    data=top_pairs,
    x='count',
    y=top_pairs.apply(lambda x: f"{x['origin_destination']} | {x['next_etablissement_postal']}", axis=1),
    palette='viridis'
)
plt.title('Top (Origin-Destination, Next Etablissement) Pairs by Package Counts')
plt.xlabel('Number of Packages')
plt.ylabel('(Origin_Destination | Next Etablissement)')
plt.tight_layout()
plt.show()


### b. parsed_receptacles_df

In [None]:
# For parsed_receptacles_df: visualize the histogram of counts by (origin_destination, next_etablissement_postal)

# First, select top N most common (origin_destination, next_etablissement_postal) pairs for clarity
pair_counts_recept = parsed_receptacles_df.groupby(['origin_destination', 'next_etablissement_postal']).size().reset_index(name='count')

# Sort and keep only top N pairs (by count)
top_n = 20
top_pairs_recept = pair_counts_recept.sort_values('count', ascending=False).head(top_n)

plt.figure(figsize=(16, 8))
sns.barplot(
    data=top_pairs_recept,
    x='count',
    y=top_pairs_recept.apply(lambda x: f"{x['origin_destination']} | {x['next_etablissement_postal']}", axis=1),
    palette='viridis'
)
plt.title('Top (Origin-Destination, Next Etablissement) Pairs by Receptacle Counts')
plt.xlabel('Number of Receptacles')
plt.ylabel('(Origin_Destination | Next Etablissement)')
plt.tight_layout()
plt.show()



Repeat the same analysis for origin and for destination separately.

In [None]:

# For parsed_receptacles_df: visualize the histogram of counts by destination_country and next_etablissement_postal

# First, select top N most common (destination_country, next_etablissement_postal) pairs for clarity
pair_counts_dest = parsed_receptacles_df.groupby(['destination_country', 'next_etablissement_postal']).size().reset_index(name='count')

# Sort and keep only top N pairs (by count)
top_n = 20
top_pairs_dest = pair_counts_dest.sort_values('count', ascending=False).head(top_n)

plt.figure(figsize=(16, 8))
sns.barplot(
    data=top_pairs_dest,
    x='count',
    y=top_pairs_dest.apply(lambda x: f"{x['destination_country']} | {x['next_etablissement_postal']}", axis=1),
    palette='viridis'
)
plt.title('Top (Destination Country, Next Etablissement) Pairs by Receptacle Counts')
plt.xlabel('Number of Receptacles')
plt.ylabel('(Destination_Country | Next Etablissement)')
plt.tight_layout()
plt.show()


In [None]:
# For parsed_receptacles_df: visualize the histogram of counts by origin_country and next_etablissement_postal

# First, select top N most common (origin_country, next_etablissement_postal) pairs for clarity
pair_counts_origin = parsed_receptacles_df.groupby(['origin_country', 'next_etablissement_postal']).size().reset_index(name='count')

# Sort and keep only top N pairs (by count)
top_n = 20
top_pairs_origin = pair_counts_origin.sort_values('count', ascending=False).head(top_n)

plt.figure(figsize=(16, 8))
sns.barplot(
    data=top_pairs_origin,
    x='count',
    y=top_pairs_origin.apply(lambda x: f"{x['origin_country']} | {x['next_etablissement_postal']}", axis=1),
    palette='viridis'
)
plt.title('Top (Origin Country, Next Etablissement) Pairs by Receptacle Counts')
plt.xlabel('Number of Receptacles')
plt.ylabel('(Origin_Country | Next Etablissement)')
plt.tight_layout()
plt.show()


From the analysis we can see that some etablissements get congested, forming a sort of loop (ETAB0030, ETAB0002, ETAB0006).

### Time analysis regarding origin

### 1. parsed_packages_df

In [None]:
# Group the parsed_packages_df by date and origin_country to count packages per month per origin country
parsed_packages_df['date'] = pd.to_datetime(parsed_packages_df['date'])

# Group by origin_country and resample by month
ts_by_origin = parsed_packages_df.groupby(['origin_country', pd.Grouper(key='date', freq='ME')])['MAILITM_FID'].count().reset_index()

# Choose top 5 origin countries by total count for plotting
top_origin = parsed_packages_df['origin_country'].value_counts().head(5).index
plt.figure(figsize=(14, 7))

for origin in top_origin:
    ts = ts_by_origin[ts_by_origin['origin_country'] == origin]
    plt.plot(ts['date'], ts['MAILITM_FID'], marker='o', label=origin)

plt.title('Monthly Package count by Origin Country')
plt.xlabel('Date')
plt.ylabel('Number of Packages')
plt.legend(title='Origin Country')
plt.tight_layout()
plt.show()


### 2. parsed_receptacles_df

In [None]:
# Group the parsed_receptacles_df by date and origin_country to count receptacles per month per origin country
parsed_receptacles_df['date'] = pd.to_datetime(parsed_receptacles_df['date'])

# Group by origin_country and resample by month
ts_by_origin_recept = parsed_receptacles_df.groupby(['origin_country', pd.Grouper(key='date', freq='ME')])['RECPTCL_FID'].count().reset_index()

# Choose top 5 origin countries by total count for plotting
top_origin_recept = parsed_receptacles_df['origin_country'].value_counts().head(5).index
plt.figure(figsize=(14, 7))

for origin in top_origin_recept:
    ts = ts_by_origin_recept[ts_by_origin_recept['origin_country'] == origin]
    plt.plot(ts['date'], ts['RECPTCL_FID'], marker='o', label=origin)

plt.title('Monthly Receptacle count by Origin Country')
plt.xlabel('Date')
plt.ylabel('Number of Receptacles')
plt.legend(title='Origin Country')
plt.tight_layout()
plt.show()


### Time analysis by destination

In [None]:
# Group the parsed_packages_df by date and destination_country to count packages per month per destination country
parsed_packages_df['date'] = pd.to_datetime(parsed_packages_df['date'])

# Group by destination_country and resample by month
ts_by_dest = parsed_packages_df.groupby(['destination_country', pd.Grouper(key='date', freq='ME')])['MAILITM_FID'].count().reset_index()

# Choose top 5 destination countries by total count for plotting
top_dest = parsed_packages_df['destination_country'].value_counts().head(5).index
plt.figure(figsize=(14, 7))

for dest in top_dest:
    ts = ts_by_dest[ts_by_dest['destination_country'] == dest]
    plt.plot(ts['date'], ts['MAILITM_FID'], marker='o', label=dest)

plt.title('Monthly Package count by Destination Country')
plt.xlabel('Date')
plt.ylabel('Number of Packages')
plt.legend(title='Destination Country')
plt.tight_layout()
plt.show()


### 2. parsed_receptacles_df

In [None]:
# Group the parsed_receptacles_df by date and destination_country to count receptacles per month per destination country
parsed_receptacles_df['date'] = pd.to_datetime(parsed_receptacles_df['date'])

# Group by destination_country and resample by month 
ts_by_dest_recept = parsed_receptacles_df.groupby(['destination_country', pd.Grouper(key='date', freq='ME')])['RECPTCL_FID'].count().reset_index()

# Choose top 5 destination countries by total count for plotting
top_dest_recept = parsed_receptacles_df['destination_country'].value_counts().head(5).index
plt.figure(figsize=(14, 7))

for dest in top_dest_recept:
    ts = ts_by_dest_recept[ts_by_dest_recept['destination_country'] == dest]
    plt.plot(ts['date'], ts['RECPTCL_FID'], marker='o', label=dest)

plt.title('Monthly Receptacle count by Destination Country')
plt.xlabel('Date')
plt.ylabel('Number of Receptacles')
plt.legend(title='Destination Country')
plt.tight_layout()
plt.show()


## Creating additional features 
 1. 'flow_type' column with values: 'inbound' (to DZ), 'outbound' (from DZ), 'local' (DZ to DZ), otherwise 'other'

In [None]:
import numpy as np

def get_flow_type(df):
    """Label every row of ``df`` with its flow direction relative to DZ.

    Reads the ``origin_country`` and ``destination_country`` columns and
    returns a NumPy array (one entry per row) holding:

    * ``'local'``    - origin and destination are both ``'DZ'``
    * ``'inbound'``  - only the destination is ``'DZ'``
    * ``'outbound'`` - only the origin is ``'DZ'``
    * ``'other'``    - neither side is ``'DZ'``
    """
    to_dz = df['destination_country'] == 'DZ'
    from_dz = df['origin_country'] == 'DZ'

    # np.select keeps the FIRST matching rule, so 'local' has to be listed
    # before the two one-sided rules that it also satisfies.
    rules = [
        (to_dz & from_dz, 'local'),
        (to_dz, 'inbound'),
        (from_dz, 'outbound'),
    ]
    masks = [mask for mask, _ in rules]
    labels = [label for _, label in rules]

    return np.select(masks, labels, default='other')

# Apply to both DataFrames instantly
parsed_packages_df['flow_type'] = get_flow_type(parsed_packages_df)
parsed_receptacles_df['flow_type'] = get_flow_type(parsed_receptacles_df)

# Print counts
print("Flow type counts in parsed_packages_df:\n", parsed_packages_df['flow_type'].value_counts())
print("\nFlow type counts in parsed_receptacles_df:\n", parsed_receptacles_df['flow_type'].value_counts())

We can see that some rows end up with the flow type "other".

In [None]:
# Display the first 10 rows of origin_country and destination_country for flow_type 'other' in parsed_receptacles_df
print(parsed_receptacles_df.loc[parsed_receptacles_df['flow_type'] == 'other', ['origin_country', 'destination_country']].head(10))



## Analysis of the relation between the flow_type and the event_type

## 1. parsed_packages_df

In [None]:

# Analyse relation between flow_type and EVENT_TYPE_CD in parsed_packages_df

plt.figure(figsize=(14,7))
ax = sns.countplot(data=parsed_packages_df, x="EVENT_TYPE_CD", hue="flow_type", palette='Set2')
plt.title("Distribution of EVENT_TYPE_CD by flow_type in parsed_packages_df")
plt.xlabel("EVENT_TYPE_CD")
plt.ylabel("Count")
plt.legend(title="Flow Type")
plt.tight_layout()
for c in ax.containers:
    ax.bar_label(c, label_type='edge', fontsize=8, padding=2)
plt.show()



## 2. parsed_receptacles_df

In [None]:
# For each flow_type, list the unique EVENT_TYPE_CD values and the most frequent EVENT_TYPE_CD value
for flow in parsed_receptacles_df['flow_type'].unique():
    event_types = parsed_receptacles_df.loc[parsed_receptacles_df['flow_type'] == flow, 'EVENT_TYPE_CD'].unique()
    most_common_event_type = parsed_receptacles_df.loc[parsed_receptacles_df['flow_type'] == flow, 'EVENT_TYPE_CD'].mode()
    print(f"Flow type: {flow}")
    print(f"EVENT_TYPE_CD values: {sorted(event_types)}")
    if not most_common_event_type.empty:
        print(f"Most frequent EVENT_TYPE_CD: {most_common_event_type.iloc[0]}")
    else:
        print("No EVENT_TYPE_CD available")
    print()   


In [None]:
plt.figure(figsize=(14,7))
ax = sns.countplot(data=parsed_receptacles_df, x="EVENT_TYPE_CD", hue="flow_type", palette='Set1')
plt.title("Distribution of EVENT_TYPE_CD by flow_type in parsed_receptacles_df")
plt.xlabel("EVENT_TYPE_CD")
plt.ylabel("Count")
plt.legend(title="Flow Type")
plt.tight_layout()

for c in ax.containers:
    ax.bar_label(c, label_type='edge', fontsize=8, padding=2)

plt.show()


We can see that the major event types are related to the inbound flow type (coming to DZ).

# Track multiple receptacles just to see the flow of events

In [None]:
# Track multiple receptacles: visualize all events for 5 different RECPTCL_FID in parsed_receptacles_df
# Pick 5 unique RECPTCL_FID values to demonstrate
num_examples = 5
example_receptacle_ids = parsed_receptacles_df['RECPTCL_FID'].drop_duplicates().iloc[:num_examples]
for rid in example_receptacle_ids:
    print("\n--- Events for RECPTCL_FID:", rid, "---")
    display(parsed_receptacles_df[parsed_receptacles_df['RECPTCL_FID'] == rid][['RECPTCL_FID', 'date', 'EVENT_TYPE_CD', 'etablissement_postal', 'next_etablissement_postal']].sort_values('date').reset_index(drop=True))


plt.figure(figsize=(14, 5))

for i, receptacle_id in enumerate(example_receptacle_ids):
    ex_df = parsed_receptacles_df[parsed_receptacles_df['RECPTCL_FID'] == receptacle_id].sort_values('date')
    plt.plot(
        ex_df['date'],
        ex_df['EVENT_TYPE_CD'],
        marker='o',
        label=f"RECPTCL_FID: {receptacle_id}"
    )

plt.xticks(rotation=45)
plt.title(f"Event Timeline (EVENT_TYPE_CD) for {num_examples} Receptacles")
plt.xlabel("Date")
plt.ylabel("EVENT_TYPE_CD")
plt.tight_layout()
plt.legend()
plt.show()


In [None]:
packages_df=parsed_packages_df.copy()
receptacles_df=parsed_receptacles_df.copy()

- We need to delete the first record of each package so that the time it spent inside its receptacle is not taken into consideration, since this time would be calculated separately.

In [None]:
# Delete the first (earliest) row of each package.
# A boolean mask is used instead of groupby(...).apply(lambda x: x.iloc[1:]):
# it is vectorised, keeps the MAILITM_FID column regardless of how
# groupby.apply treats grouping columns, and preserves the sorted row order.
packages_df = packages_df.sort_values(['MAILITM_FID', 'date'])
# After sorting, the first occurrence of an ID is that package's earliest event.
is_later_event = packages_df['MAILITM_FID'].duplicated(keep='first')
# Rows without an ID are dropped as well, as groupby (dropna=True) did before.
packages_df = packages_df[packages_df['MAILITM_FID'].notna() & is_later_event].reset_index(drop=True)
packages_df.info()

In [None]:
# find the last row for each receptacle based on date
last_rows = receptacles_df.loc[receptacles_df.groupby('RECPTCL_FID')['date'].idxmax()]
# print unique last rows based on origin_destination
last_rows = last_rows[last_rows['origin_destination']=='FR_DZ']
last_rows.head(20)
last_rows.value_counts('etablissement_postal')

# **ATTENTION**

* ... **Add feature engineering starting from here, as the next part is the splitting part, and it needs to be done to the whole datasets** ...

In [None]:
packages_df.head(30)

### Adding the target `delay`
* In the following cell, we're adding the target `delay` for each row.
* for rows (of the same package/receptacle) having `next_etablissement_postal` different than `etablissement_postal` of their next row, we keep the value of `delay` NaN.
* otherwise, we get the difference of `date` of the row and its next (of the same package/receptacle) in `hours` and store it in `delay`.

In [None]:
def add_target(df, id_col='MAILITM_FID'):
    """Return a copy of ``df`` sorted by item/date with a ``delay`` target column.

    For every row, ``delay`` is the number of hours between the row's ``date``
    and the ``date`` of the next row of the same item. It is left as NaN when

    * the row is the last event of its item (there is no next row), or
    * the row's ``next_etablissement_postal`` differs from the
      ``etablissement_postal`` of the item's next row (inconsistent route).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_col``, ``date`` (datetime), ``etablissement_postal``
        and ``next_etablissement_postal``. It is not modified.
    id_col : str, default 'MAILITM_FID'
        Column identifying the tracked item. The default targets packages;
        pass e.g. ``'RECPTCL_FID'`` to compute the target for receptacles.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` sorted by ``[id_col, 'date']`` (original index
        labels kept) with the ``delay`` column added.
    """
    # 1. Sort chronologically within each item so shift(-1) means "next event".
    ordered = df.sort_values(by=[id_col, 'date'], ascending=True)
    per_item = ordered.groupby(id_col)

    # 2. Look one event ahead inside each item. These stay local Series, so no
    #    temporary helper columns are ever written to the frame.
    next_event_date = per_item['date'].shift(-1)
    next_row_etab = per_item['etablissement_postal'].shift(-1)

    # 3. Elapsed time to the next event, in hours.
    elapsed_hours = (next_event_date - ordered['date']).dt.total_seconds() / 3600

    # 4. Keep the delay only where the announced next etablissement matches the
    #    etablissement of the next row; comparisons against missing values are
    #    False, so those rows get NaN too.
    is_consistent = ordered['next_etablissement_postal'] == next_row_etab
    ordered['delay'] = elapsed_hours.where(is_consistent)

    return ordered



In [None]:
packages_df = add_target(packages_df)
#receptacles_df = add_target(receptacles_df)

In [None]:
large_delay_pkg = packages_df[packages_df['delay'] > 1000]
large_delay_pkg.sort_values('delay', ascending=False).head(20)

# Splitting

* Splitting the `packages` dataset into training and testing sets using `train_test_split`

In [None]:
from sklearn.model_selection import train_test_split
packages_df = packages_df.sort_values(by=['date'])
pkg_X_train, pkg_X_test, = train_test_split(
    packages_df,
    test_size=0.2, 
    shuffle=False
)

print (f"Training set size: {pkg_X_train.shape[0]} rows")
print (f"Testing set size: {pkg_X_test.shape[0]} rows")

### Handling NaN values of `delay`

In [None]:
pkg_X_train['delay'].isna().sum()


In [None]:
pkg_X_test['delay'].isna().sum()

* Splitting the `receptacles` dataset into training and testing sets using `train_test_split`

In [None]:
# from sklearn.model_selection import train_test_split

# # Step 1: Train/Test split (80/20)
# rcp_X_train, rcp_X_test, = train_test_split(
#     receptacles_df,
#     test_size=0.2, 
#     random_state=42,
#     shuffle=True
# )

# print (f"Training set size: {rcp_X_train.shape[0]} rows")
# print (f"Testing set size: {rcp_X_test.shape[0]} rows")

In [None]:
def get_etab_mapping(df):
    """Map each ``etablissement_postal`` to its most frequent next etablissement.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``etablissement_postal`` and ``next_etablissement_postal``.

    Returns
    -------
    dict
        ``{etablissement_postal: most frequent next_etablissement_postal}``.
        An etablissement whose next values are all missing falls back to the
        most frequent value of the whole column. If the whole column is
        missing, the fallback (and therefore every value) is NaN instead of
        raising ``IndexError``.
    """
    # 1. Global mode, used as the fallback. mode() ignores missing values and
    #    returns an empty Series when nothing is left.
    overall_modes = df['next_etablissement_postal'].mode()
    global_mode = overall_modes.iat[0] if not overall_modes.empty else float('nan')

    def _most_frequent(values):
        # Compute the group's mode once; ties resolve to the first value
        # returned by mode().
        modes = values.mode()
        return modes.iat[0] if not modes.empty else global_mode

    # 2. One mode per etablissement (index = etablissement, value = next).
    modes_per_group = df.groupby('etablissement_postal')['next_etablissement_postal'].agg(_most_frequent)

    # 3. Convert to a plain dictionary usable with Series.map.
    return modes_per_group.to_dict()

* After splitting, for each pair `(X_train, X_test)` of `packages_splits`, we'll fill the null values of `next_etablissement_postal` based on the most frequent value of `next_etablissement_postal` appearing with the value of `etablissement_postal` of each specific row having a null value in `next_etablissement_postal`
* Example:
say that a row has a null `next_etablissement_postal`: we look at its `etablissement_postal` value (say `v`), we iterate through the training set, we count how many times each `ETAB_XXXX` in `next_etablissement_postal` appears with `v` being in `etablissement_postal`, we take the mode, and we use it to fill all rows having a null value at `next_etablissement_postal` where their `etablissement_postal` is `v`

In [None]:
pkg_etablissement_dict = get_etab_mapping(pkg_X_train)
# rcp_etablissement_dict = get_etab_mapping(rcp_X_train)

* fill remaining null values of `next_etablissement_postal` with most frequent values of the training set only, avoiding *data leakage*

In [None]:
null_mask = pkg_X_train['next_etablissement_postal'].isna()

# Apply to original dataframe directly
pkg_X_train.loc[null_mask, 'next_etablissement_postal'] = (
    pkg_X_train.loc[null_mask, 'etablissement_postal'].map(pkg_etablissement_dict)
)

pkg_X_train['next_etablissement_postal'].isna().sum()

In [None]:
null_mask = pkg_X_test['next_etablissement_postal'].isna()

# Apply to original dataframe directly
pkg_X_test.loc[null_mask, 'next_etablissement_postal'] = (
    pkg_X_test.loc[null_mask, 'etablissement_postal'].map(pkg_etablissement_dict)
)

pkg_X_test['next_etablissement_postal'].isna().sum()


* 11 rows still have NaN `next_etablissement_postal` because some values in the test set didn't exist in the training set, therefore they didn't find the right etablissement to map to
* we'll fill them with `Unknown`

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form only modifies a temporary object
# when pandas Copy-on-Write is active, leaving the NaNs in pkg_X_test.
pkg_X_test['next_etablissement_postal'] = pkg_X_test['next_etablissement_postal'].fillna('Unknown')

- delete the rows with a null delay (this will also delete the rows with unknown next etablissement values)

In [None]:
#drop rows with NaN delay
pkg_X_train = pkg_X_train[~pkg_X_train['delay'].isna()]
pkg_X_train['delay'].isna().sum()

In [None]:
#drop rows with NaN delay
pkg_X_test = pkg_X_test[~pkg_X_test['delay'].isna()]
pkg_X_test['delay'].isna().sum()

* apply to `receptacles` dataset

In [None]:
# null_mask = rcp_X_train['next_etablissement_postal'].isna()

# # Apply to original dataframe directly
# rcp_X_train.loc[null_mask, 'next_etablissement_postal'] = (
#     rcp_X_train.loc[null_mask, 'etablissement_postal'].map(rcp_etablissement_dict)
# )

# rcp_X_train['next_etablissement_postal'].isna().sum()

In [None]:
# null_mask = rcp_X_test['next_etablissement_postal'].isna()

# # Apply to original dataframe directly
# rcp_X_test.loc[null_mask, 'next_etablissement_postal'] = (
#     rcp_X_test.loc[null_mask, 'etablissement_postal'].map(rcp_etablissement_dict)
# )

# rcp_X_test['next_etablissement_postal'].isna().sum()

# New features 
  - Added here to avoid data leakage

- This is the etablissement congestion during the preceding time window (1h in the code below; the window size is subject to modification to see how performance changes)

In [None]:
def calculate_etab_load_1h(df, window='1h'):
    """Add ``etab_load_1h``: events seen at the same etablissement just before each row.

    For every row, counts the rows with the same ``etablissement_postal`` and a
    non-null ``MAILITM_FID`` whose ``date`` falls in the trailing ``window``
    ending at the row's own ``date``. ``closed='left'`` excludes the row's own
    timestamp from the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``etablissement_postal``, ``date`` (datetime) and
        ``MAILITM_FID``. It is not modified.
    window : str, default '1h'
        Offset passed to ``rolling``. The output column keeps the name
        ``etab_load_1h`` whatever the window, so existing callers and feature
        lists keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` sorted by ``['etablissement_postal', 'date']`` with a
        fresh 0..n-1 index (a consequence of the merge) and the
        ``etab_load_1h`` column added; rows with an empty window get 0.
    """
    load_col = 'etab_load_1h'
    keys = ['etablissement_postal', 'date']

    # Dropping a previous result first makes re-running the notebook cell safe:
    # otherwise the merge would create suffixed duplicates and the lookup of
    # `load_col` below would raise KeyError.
    df_sorted = df.drop(columns=[load_col], errors='ignore').sort_values(keys)

    # Time-based rolling count per etablissement; the result is indexed by
    # (etablissement_postal, date), which reset_index turns back into columns.
    rolling_df = (
        df_sorted.set_index('date')
        .groupby('etablissement_postal')['MAILITM_FID']
        .rolling(window, closed='left')
        .count()
        .rename(load_col)
        .reset_index()
    )

    # Several rows may share (etablissement, date). A running sequence number
    # inside each such pair makes the merge key unique, so every row gets
    # exactly one count back.
    df_sorted['temp_seq'] = df_sorted.groupby(keys).cumcount()
    rolling_df['temp_seq'] = rolling_df.groupby(keys).cumcount()

    df_final = pd.merge(df_sorted, rolling_df, on=keys + ['temp_seq'], how='left')

    # A missing count means no earlier event fell inside the window.
    df_final[load_col] = df_final[load_col].fillna(0)

    return df_final.drop(columns=['temp_seq'])

In [None]:
pkg_X_train = calculate_etab_load_1h(pkg_X_train)
pkg_X_test = calculate_etab_load_1h(pkg_X_test)

- This is the route load over the preceding time window (1h in the code below; we can modify the time window to see if there are improvements)

In [None]:
def calculate_route_load_1h(df, window='1h'):
    """Add ``route_load_1h``: events seen on the same lane just before each row.

    A lane is the pair ``(etablissement_postal, next_etablissement_postal)``.
    For every row, counts the rows on the same lane with a non-null
    ``MAILITM_FID`` whose ``date`` falls in the trailing ``window`` ending at
    the row's own ``date``. ``closed='left'`` excludes the row's own timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``etablissement_postal``, ``next_etablissement_postal``,
        ``date`` (datetime) and ``MAILITM_FID``. It is not modified.
    window : str, default '1h'
        Offset passed to ``rolling``. The output column keeps the name
        ``route_load_1h`` whatever the window, so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` sorted by lane then ``date`` with a fresh 0..n-1
        index (a consequence of the merge) and the ``route_load_1h`` column
        added; rows with an empty window get 0.
    """
    load_col = 'route_load_1h'
    lane = ['etablissement_postal', 'next_etablissement_postal']
    keys = lane + ['date']

    # Dropping a previous result first makes re-running the notebook cell safe:
    # otherwise the merge would create suffixed duplicates and the lookup of
    # `load_col` below would raise KeyError.
    df_sorted = df.drop(columns=[load_col], errors='ignore').sort_values(keys)

    # Time-based rolling count per lane; the result is indexed by
    # (etablissement_postal, next_etablissement_postal, date).
    rolling_df = (
        df_sorted.set_index('date')
        .groupby(lane)['MAILITM_FID']
        .rolling(window, closed='left')
        .count()
        .rename(load_col)
        .reset_index()
    )

    # Rows sharing (lane, date) get a running sequence number so the merge key
    # is unique and every row receives exactly one count.
    df_sorted['temp_seq'] = df_sorted.groupby(keys).cumcount()
    rolling_df['temp_seq'] = rolling_df.groupby(keys).cumcount()

    df_final = pd.merge(df_sorted, rolling_df, on=keys + ['temp_seq'], how='left')

    # A missing count means no other event on this lane fell inside the window.
    df_final[load_col] = df_final[load_col].fillna(0)

    return df_final.drop(columns=['temp_seq'])

In [None]:
pkg_X_train= calculate_route_load_1h(pkg_X_train)
pkg_X_test= calculate_route_load_1h(pkg_X_test)

- Adding the time since last scan feature

In [None]:
def add_time_since_last_scan(df):
    """Return ``df`` sorted by package/date with a ``time_since_last_scan`` column.

    The new column holds the hours elapsed since the previous row of the same
    ``MAILITM_FID``. It is a per-step gap, not a running total. The first row
    of every package has no previous scan and gets 0.
    """
    ordered = df.sort_values(['MAILITM_FID', 'date'])

    # diff() inside each package: the first event of a package yields NaT.
    gap = ordered.groupby('MAILITM_FID')['date'].diff()
    gap_hours = gap.dt.total_seconds() / 3600

    ordered['time_since_last_scan'] = gap_hours.fillna(0)
    return ordered

# Apply
pkg_X_train = add_time_since_last_scan(pkg_X_train)
pkg_X_test = add_time_since_last_scan(pkg_X_test)

In [None]:
pkg_X_train['month'] = pkg_X_train['date'].dt.month
pkg_X_test['month'] = pkg_X_test['date'].dt.month

In [None]:
# for df in [pkg_X_train, pkg_X_test]:
#     # 4 = Friday, 5 = Saturday
#     df['is_weekend'] = df['date'].dt.dayofweek.isin([4, 5]).astype(int)
#     df['first_last_week_day']= df['date'].dt.dayofweek.isin([3,6]).astype(int)

In [None]:
# pkg_X_train['country_service'] = pkg_X_train['origin_country'].astype(str) + "_" + pkg_X_train['service_indicator'].astype(str)
# pkg_X_test['country_service'] = pkg_X_test['origin_country'].astype(str) + "_" + pkg_X_test['service_indicator'].astype(str)

- Adding features that represents events that could affect the flow of packages in algeria

In [None]:
import holidays

# 1. Setup Algeria Holidays (Keep this outside the function for speed)
al_holidays = holidays.Algeria(years=[2023, 2024, 2025, 2026])
holiday_dates = sorted(al_holidays.keys())
holiday_df = pd.DataFrame({'holiday_date': pd.to_datetime(holiday_dates)})

def add_holidays_features(df, cap_days=30, holiday_calendar=None):
    """Add capped "days since last / until next holiday" features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column convertible by ``pd.to_datetime``.
        It is not modified (the conversion happens on a copy).
    cap_days : int, default 30
        Upper bound applied to both features. It is also the value used when
        there is no holiday before (or after) a row's date.
    holiday_calendar : pandas.DataFrame, optional
        Frame with a datetime ``holiday_date`` column. Defaults to the
        notebook-level ``holiday_df``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` sorted by ``date`` with a fresh 0..n-1 index (a
        consequence of ``merge_asof``) plus ``days_since_last_holiday`` and
        ``days_until_next_holiday``.

    Notes
    -----
    The backward match uses ``holiday_date <= date`` and the forward match
    uses ``holiday_date >= date``. A timestamp later than the ``holiday_date``
    value of its own day therefore matches that holiday backwards only.
    """
    calendar = holiday_df if holiday_calendar is None else holiday_calendar
    # merge_asof needs both keys sorted; keep only the key column so nothing
    # else from the calendar leaks into the result.
    calendar = calendar[['holiday_date']].sort_values('holiday_date')

    # Convert and sort on a copy so the caller's frame is left untouched.
    out = df.copy()
    out['date'] = pd.to_datetime(out['date'])
    out = out.sort_values('date')

    # Days since the last holiday: direction='backward' picks the last
    # holiday <= current date.
    out = pd.merge_asof(out, calendar, left_on='date', right_on='holiday_date', direction='backward')
    out['days_since_last_holiday'] = (out['date'] - out['holiday_date']).dt.days
    out = out.drop(columns=['holiday_date'])  # so the next merge doesn't conflict

    # Days until the next holiday: direction='forward' picks the first
    # holiday >= current date.
    out = pd.merge_asof(out, calendar, left_on='date', right_on='holiday_date', direction='forward')
    out['days_until_next_holiday'] = (out['holiday_date'] - out['date']).dt.days
    out = out.drop(columns=['holiday_date'])

    # Missing neighbour holiday -> cap value; then cap both features.
    gap_cols = ['days_since_last_holiday', 'days_until_next_holiday']
    out[gap_cols] = out[gap_cols].fillna(cap_days).clip(upper=cap_days)

    return out

In [None]:
pkg_X_train = add_holidays_features(pkg_X_train)
pkg_X_test = add_holidays_features(pkg_X_test)

In [None]:
pkg_X_train.head(10)

In [None]:
#large packages delays
large_delay_pkg = pkg_X_train[pkg_X_train['delay'] > 100]
large_delay_pkg.sort_values('delay', ascending=False).head(30)

# Use CatBoost

### Using CatBoost with RMSE as the loss function
- parameters are manually defined

In [None]:
import numpy as np
from catboost import CatBoostRegressor, Pool, cv
pkg_X_train = pkg_X_train.sort_values('date').reset_index(drop=True)
# --- Define Model Parameters and Features ---
TARGET_COL = 'delay'

# List of categorical features
cat_features = [
    'etablissement_postal',
    'next_etablissement_postal',
    'service_indicator',
    'day_of_week',
    'origin_destination',
    'EVENT_TYPE_CD',
    'hour',
    'month',
    'is_weekend',
    'first_last_week_day',
    'country_service',

    # Add any other categorical columns here
]

# Columns to drop from features (IDs, date used for sorting, and the target)
drop_cols = [TARGET_COL, 'MAILITM_FID', 'RECPTCL_FID', 'date', 'serial_number','flow_type','country_code','destination_country','origin_country']

# Prepare the full 80% training set features and target
X_train_full = pkg_X_train.drop(columns=[c for c in drop_cols if c in pkg_X_train.columns])
y_train_full = pkg_X_train[TARGET_COL]

# Identify categorical features by name
categorical_feature_names = [c for c in cat_features if c in X_train_full.columns]

# --- TimeSeries Cross-Validation to Find Optimal Iterations ---

# Create the training Pool
train_pool = Pool(
    data=X_train_full,
    label=y_train_full,
    cat_features=categorical_feature_names,
)

cv_params = {
    'loss_function': 'RMSE',
    'iterations': 3000,         # Increase this significantly
    'learning_rate': 0.015,      # Lower this (from 0.05)
    'depth': 10,                 
    'l2_leaf_reg': 3,           # Standard regularization
    'random_seed': 42,
    'verbose': 0,
    'early_stopping_rounds': 100 # Stop if no improvement for 100 rounds
}

print("Starting TimeSeries Cross-Validation (5 Folds) to find optimal tree count...")

cv_results = cv(
    params=cv_params,
    pool=train_pool,
    fold_count=5,
    shuffle=False,               # CRITICAL: Ensures chronological order
    type='TimeSeries',           # Uses the rolling window strategy
)


# Find the best iteration based on the minimum average RMSE
best_iter = cv_results['test-RMSE-mean'].values.argmin()
best_rmse = cv_results['test-RMSE-mean'].min()
print(f"Optimal number of CatBoost iterations: {best_iter + 1} (Best CV RMSE: {best_rmse:.4f})")

# --- Train Final Model on ENTIRE Training Set ---

final_model = CatBoostRegressor(
    iterations=best_iter + 1,  # Use the optimal number of trees found in CV
    learning_rate=0.015,
    depth=10,
    loss_function='RMSE',
    random_seed=42,
    cat_features=categorical_feature_names,
    verbose=100 # Show training progress for the final model
)

print("\nTraining final model on the entire 80% training set...")
final_model.fit(train_pool)

# --- Predict and Evaluate on the Future Test Set ---

# Prepare the test features
X_test_features = pkg_X_test.drop(columns=[c for c in drop_cols if c in pkg_X_test.columns])
y_test_target = pkg_X_test[TARGET_COL] # The 'delay' column in pkg_X_test is the ground truth target

predictions = final_model.predict(X_test_features)



### Use Catboost with Huber loss function
- Huber with delta=20 hours behaves like squared error (RMSE-style) for residuals smaller than 20 hours and like absolute error (MAE-style) for residuals larger than delta.
- This reduces the obsession of RMSE with predicting the outliers (large errors), since large residuals are only penalised linearly
- parameters are set manually

In [None]:
import numpy as np
from catboost import CatBoostRegressor, Pool, cv

# 1. Ensure chronological order
pkg_X_train = pkg_X_train.sort_values('date').reset_index(drop=True)
pkg_X_test = pkg_X_test.sort_values('date').reset_index(drop=True)

# --- Define Model Parameters and Features ---
TARGET_COL = 'delay'

# List of categorical features (Kept country_service as it was helping!)
cat_features = [
    'etablissement_postal',
    'next_etablissement_postal',
    'day_of_week',
    'service_indicator',
    'origin_destination',
    'EVENT_TYPE_CD',
    'hour',
    'month',
    'is_weekend',
    'first_last_week_day',
    'country_service'
]

# Columns to drop
drop_cols = [TARGET_COL, 'MAILITM_FID', 'RECPTCL_FID', 'date', 'serial_number', 
             'flow_type', 'country_code', 'destination_country', 'origin_country']

# Prepare the full 80% training set
X_train_full = pkg_X_train.drop(columns=[c for c in drop_cols if c in pkg_X_train.columns])
y_train_full = pkg_X_train[TARGET_COL]

# Identify categorical features by name
categorical_feature_names = [c for c in cat_features if c in X_train_full.columns]

# --- TimeSeries Cross-Validation to Find Optimal Iterations ---

# Create the training Pool
train_pool = Pool(
    data=X_train_full,
    label=y_train_full,
    cat_features=categorical_feature_names,
)

# Parameters passed to catboost.cv below. The model is trained with Huber loss
# while MAE is the reported metric (read back as 'test-MAE-mean').
cv_params = {
    'loss_function': 'Huber:delta=20.0',          # Huber: quadratic (MSE-like) for |error| <= delta, linear (MAE-like) beyond it, so outliers weigh less than under RMSE
    'eval_metric': 'MAE',            
    'iterations': 1500, 
    'learning_rate': 0.03,
    'depth': 8,                 
    'random_seed': 42,
    'verbose': 0,
    # 'early_stopping_rounds': 100,
    # 'l2_leaf_reg': 3,          # Standard regularization
}

print("Starting TimeSeries Cross-Validation (5 Folds) optimizing for MAE...")

cv_results = cv(
    params=cv_params,
    pool=train_pool,
    fold_count=5,
    shuffle=False, 
    type='TimeSeries', 
)

# Find the best iteration based on the minimum average MAE
# Note: cv_results keys change based on the loss_function used
best_iter = cv_results['test-MAE-mean'].values.argmin()
best_mae = cv_results['test-MAE-mean'].min()
print(f"Optimal number of CatBoost iterations: {best_iter + 1} (Best CV MAE: {best_mae:.4f})")

# --- Train Final Model on ENTIRE Training Set ---

final_model = CatBoostRegressor(
    iterations=best_iter + 1,
    learning_rate=0.03,
    depth=8,
    loss_function='Huber:delta=20.0',            
    eval_metric='MAE',              
    random_seed=42,
    cat_features=categorical_feature_names,
    verbose=100 
)

print("\nTraining final model on the entire 80% training set...")
final_model.fit(train_pool)

# --- Predict and Evaluate on the Future Test Set ---

X_test_features = pkg_X_test.drop(columns=[c for c in drop_cols if c in pkg_X_test.columns])
y_test_target = pkg_X_test[TARGET_COL]

predictions = final_model.predict(X_test_features)


# Using Optuna for hyperparameters tuning before passing them to CatBoost

### Preparing the data pool

In [None]:
from catboost import Pool

# --- Data preparation ---
# Column roles used by this cell and by the later training / evaluation cells
TARGET_COL = 'delay'
cat_features = [
    'etablissement_postal', 'next_etablissement_postal', 'day_of_week',
    'service_indicator', 'origin_destination', 'EVENT_TYPE_CD', 'hour', 'month',
    'is_weekend', 'first_last_week_day', 'service_country'
]
drop_cols = [TARGET_COL, 'MAILITM_FID', 'RECPTCL_FID', 'date', 'serial_number',
             'flow_type', 'country_code', 'destination_country', 'origin_country']

# Order both splits by date and renumber the rows from zero
pkg_X_train = pkg_X_train.sort_values(by='date', ignore_index=True)
pkg_X_test = pkg_X_test.sort_values(by='date', ignore_index=True)

# Features = everything except the dropped columns (absent ones are skipped silently)
y_train_full = pkg_X_train[TARGET_COL]
X_train_full = pkg_X_train.drop(columns=drop_cols, errors='ignore')
categorical_feature_names = [name for name in cat_features if name in X_train_full.columns]

train_pool = Pool(
    data=X_train_full,
    label=y_train_full,
    cat_features=categorical_feature_names,
)


### This finds the best parameters and saves them to a JSON file so they can be reused later without re-running the tuning, unless the data or the features change

In [None]:
import optuna
import json
from catboost import cv
def objective(trial):
    """Score one Optuna trial with 3-fold time-series CatBoost cross-validation.

    Samples ``learning_rate`` (log scale, 0.01-0.1), ``depth`` (4-8) and
    ``l2_leaf_reg`` (1.0-10.0) from ``trial``. The loss, metrics, seed,
    iteration cap and early-stopping patience are fixed. Reads the
    notebook-level ``train_pool``.

    Returns:
        tuple: ``(lowest mean test MAE, lowest mean test RMSE)`` across the CV
        iterations. Each minimum is taken independently, so the two values may
        come from different iteration counts.
    """
    # Search space (sampling order: learning_rate, depth, l2_leaf_reg)
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 8),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
    }
    # Settings shared by every trial
    fixed = {
        'loss_function': 'Huber:delta=20.0',
        'eval_metric': 'MAE',
        'custom_metric': ['RMSE'],  # also compute RMSE so it can be returned
        'random_seed': 42,
        'verbose': 0,
        'iterations': 1000,
        'early_stopping_rounds': 30,
    }

    fold_scores = cv(
        pool=train_pool,
        params={**fixed, **sampled},
        fold_count=3,
        type='TimeSeries',
        shuffle=False,
        plot=False,
    )

    lowest_mae = fold_scores['test-MAE-mean'].min()
    lowest_rmse = fold_scores['test-RMSE-mean'].min()
    return lowest_mae, lowest_rmse

# --- 2. Run Multi-Objective Optimization ---
print("Starting Hyperparameter Tuning (Tracking MAE & RMSE)...")
# Seed the sampler so a re-run proposes the same trials. NSGA-II is the sampler
# Optuna uses by default for multi-objective studies, so the search algorithm
# is unchanged; only its randomness is pinned.
study = optuna.create_study(
    directions=['minimize', 'minimize'],
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study.optimize(objective, n_trials=20)

# --- 3. Pick the best trial and find optimal iterations ---
# Among the Pareto-optimal trials, take the one with the lowest MAE (first objective)
best_trial = min(study.best_trials, key=lambda t: t.values[0])

print(f"\nBest Trial selected (MAE: {best_trial.values[0]:.4f}, RMSE: {best_trial.values[1]:.4f})")

# Re-run CV on 5 folds with the winning parameters to pick the iteration count.
# random_seed is set to the same value used by the tuning objective and by the
# final training cell, so the iteration count is selected under the same
# randomisation as the model it will be applied to.
# 'iterations' is not set here, so CatBoost's default upper bound applies.
final_params_temp = {
    'loss_function': 'Huber:delta=20.0',
    'eval_metric': 'MAE',
    'random_seed': 42,
    **best_trial.params
}

print("Calculating final optimal iterations...")
final_cv = cv(params=final_params_temp, pool=train_pool, fold_count=5, shuffle=False, type='TimeSeries', verbose=0)
# argmin is a zero-based position; + 1 turns it into a number of iterations.
# int() converts the NumPy integer into a plain int so json.dump can serialise it.
best_iteration = int(final_cv['test-MAE-mean'].values.argmin() + 1)

# --- 4. Save to JSON ---
# final_mae / final_rmse are the objective values of the selected trial
# (from the 3-fold tuning CV), not test-set scores.
best_config = {
    **best_trial.params,
    "iterations": best_iteration,
    "loss_function": 'Huber:delta=20.0',
    "eval_metric": 'MAE',
    "final_mae": best_trial.values[0],
    "final_rmse": best_trial.values[1]
}

with open("catboost_best_params.json", "w") as f:
    json.dump(best_config, f, indent=4)

print("Tuning complete. Best parameters saved to 'catboost_best_params.json'")

### Load the parameters from the JSON file, use them for the final training, and then predict on the test set

In [None]:
import json
from catboost import CatBoostRegressor

# --- 1. Load parameters ---
with open("catboost_best_params.json", "r") as params_file:
    best_config = json.load(params_file)

# Pull out the iteration count and discard the stored scores: the remaining
# entries are passed straight to the CatBoost constructor below
iters = best_config.pop("iterations")
for score_key in ("final_mae", "final_rmse"):
    best_config.pop(score_key, None)  # tolerate files that lack these entries

# --- 2. Build and train the model ---
# Requires categorical_feature_names and train_pool to exist in the notebook session
final_model = CatBoostRegressor(
    cat_features=categorical_feature_names,
    random_seed=42,
    verbose=100,
    iterations=iters,
    **best_config,
)

print("Training final model with saved parameters...")
final_model.fit(train_pool)

# --- 3. Persist the trained model to disk ---
final_model.save_model("final_delay_model.cbm")
print("Model trained and saved as 'final_delay_model.cbm'")

# --- 4. Predict on the test set ---
# Requires pkg_X_test and drop_cols; columns in drop_cols that are absent are skipped
X_test_features = pkg_X_test.drop(columns=drop_cols, errors='ignore')
predictions = final_model.predict(X_test_features)


# Evaluation metrics

In [None]:
from sklearn.metrics import mean_absolute_error,root_mean_squared_error
# Observed target and model output for the test rows
y_test_pred_subset = predictions
y_test_true = pkg_X_test[TARGET_COL]

# Two error summaries of the same predictions
final_mae = mean_absolute_error(y_test_true, y_test_pred_subset)
final_rmse = root_mean_squared_error(y_test_true, y_test_pred_subset)

report_lines = [
    "\nFinal Model Evaluation on Test Set:",
    f"RMSE: {final_rmse:.4f} hours",
    f"MAE: {final_mae:.4f} hours",
]
print("\n".join(report_lines))

In [None]:
# --- Feature Importance Analysis ---
import pandas as pd
import matplotlib.pyplot as plt

# Pair each importance score reported by the model with its training column name
feature_names = X_train_full.columns
feature_importances = final_model.get_feature_importance()
importance_series = pd.Series(feature_importances, index=feature_names)

# Highest-scoring features first
sorted_importance = importance_series.sort_values(ascending=False)
top_features = sorted_importance.head(10)

print("\n--- Feature Importances (Top 10) ---")
print(top_features)

# Horizontal bar chart of the ten highest-scoring features
fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot.barh(ax=ax, color='skyblue')
ax.set_title('Top 10 CatBoost Feature Importances')
ax.set_xlabel('Importance Score')
ax.invert_yaxis()  # barh draws the first row at the bottom; flip so the top feature is on top
plt.show()

In [None]:
# Feature-interaction scores from the fitted model; show the first ten rows
interactions = final_model.get_feature_importance(prettified=True, type='Interaction')
print("\nTop Feature Interactions:", interactions.head(10), sep="\n")

In [None]:
# Summary statistics of the 'delay' column for each split, printed one after the other
print("\n--- Target Variable Comparison ---")
for heading, split_df in (
    ("Training Delay Statistics:", pkg_X_train),
    ("\nTesting Delay Statistics:", pkg_X_test),
):
    print(heading)
    print(split_df['delay'].describe())

The maximum delay is still extremely high in both splits: 7800 hours in the training set and 6024 hours in the test set. Together with the high mean delay (~50 hours), these remaining outliers show that the target variable (`delay`) is still heavily right-skewed.
The CatBoost model predicts values close to the average of the distribution, so the tail (the very long delays) pulls the final RMSE score up dramatically.

### Analysis of the errors

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Residuals = actual - predicted for the test rows.
# The actual values are read from the current pkg_X_test rather than from
# y_test_target: that variable was captured in an earlier cell, before
# pkg_X_test was re-sorted and re-indexed, so its row order is not guaranteed
# to match `predictions`, which were produced from the re-sorted frame.
actual_delay = pkg_X_test[TARGET_COL]
assert len(actual_delay) == len(predictions), "predictions do not cover the current test set"
residuals = actual_delay - predictions

# Predicted value on x, signed error on y; the dashed line marks zero error
plt.figure(figsize=(10, 6))
sns.scatterplot(x=predictions, y=residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residual Plot: Prediction vs. Error')
plt.xlabel('Predicted Delay (Hours)')
plt.ylabel('Residuals (Actual - Predicted)')
plt.show()

In [None]:
# Absolute error (in hours) above which a prediction counts as a large error
LARGE_ERROR_HOURS = 50

# 1. Calculate the raw error (not absolute).
#    Actual values come from the current pkg_X_test so they are row-aligned with
#    `predictions`; y_test_target was captured before the test frame was re-sorted.
#    Sign convention here is Predicted - Actual (the opposite of the residual
#    plot's Actual - Predicted): negative means the model under-predicted.
results = pd.DataFrame({
    'Actual': pkg_X_test[TARGET_COL],
    'Predicted': predictions
})
results['Residual'] = results['Predicted'] - results['Actual']

# 2. Filter for large errors in either direction (pandas .abs() needs no NumPy import)
big_errors = results[results['Residual'].abs() > LARGE_ERROR_HOURS]

# 3. Count which direction is more common
missed_delays = big_errors[big_errors['Residual'] < 0].shape[0]
model_hallucinations = big_errors[big_errors['Residual'] > 0].shape[0]

print(f"Total Large Errors (>{LARGE_ERROR_HOURS}h): {len(big_errors)}")
print("---")
print(f"Missed Delays (Under-predicted): {missed_delays}")
print(f"Model Hallucinations (Over-predicted): {model_hallucinations}")