In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

### Load Merged Data

In [None]:
# Load the interim merged packages/receptacles table (the cleaned version).
merged_df = pd.read_csv(
    '../data/interim/interim_merged_packages_receptacle_df.csv',
    sep=',',
)

### Sort Data Chronologically



In [None]:

# Sort chronologically.
# `date_package` comes straight from the CSV here and is only converted to
# datetime in a later cell, so a plain sort would compare the raw text
# lexicographically. Sorting on the parsed timestamps gives a true
# chronological order regardless of the date format; the column itself is
# left unchanged by `key`.
merged_df = (
    merged_df
    .sort_values('date_package', key=pd.to_datetime)
    .reset_index(drop=True)
)


### Remove unnecessary index columns that were created during the merge operation.

In [None]:
# Drop the leftover index columns from the merge.
# `errors='ignore'` keeps this cell idempotent: re-running it once the columns
# are already gone no longer raises KeyError.
# The identifier columns RECPTCL_FID and MAILITM_FID are deliberately kept;
# the receptacle-level aggregation further down groups and counts on them.
merged_df = merged_df.drop(
    columns=['Unnamed: 0_package', 'Unnamed: 0_receptacle'],
    errors='ignore',
)



### Convert the date column to datetime format and extract temporal features:
- **hour**: Hour of day when package was processed
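- **day_of_week**: Day of the week as an integer (pandas convention: Monday = 0 … Sunday = 6); the working week runs Sunday → Thursday
- **is_weekend**: Binary flag indicating the weekend (Friday/Saturday, i.e. `day_of_week` 4 or 5)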

In [None]:
# Parse the timestamp column, then derive the temporal features from it.
# `assign` evaluates the keyword arguments in order, so each lambda sees the
# columns created by the ones before it.
merged_df = merged_df.assign(
    date_package=lambda df: pd.to_datetime(df['date_package']),
    hour=lambda df: df['date_package'].dt.hour,
    # pandas numbers weekdays Monday=0 ... Sunday=6
    day_of_week=lambda df: df['date_package'].dt.dayofweek,
    # 4 = Friday, 5 = Saturday
    is_weekend=lambda df: df['day_of_week'].isin([4, 5]).astype(int),
)



### Create a binary feature indicating whether a package exceeded the 15-day processing threshold.

In [None]:
# 1 when processing took strictly more than 15 days, otherwise 0
# (a missing duration compares False and therefore becomes 0).
merged_df['delay_flag'] = merged_df['processing_duration_days'].gt(15).astype(int)



### Calculate the standardized delay score (z-score) for processing duration within each origin-destination pair. This normalizes delays relative to typical processing times for each route.

In [None]:
# Z-score of the processing duration within each origin-destination route.
route_durations = merged_df.groupby('origin_destination_package')['processing_duration_days']
route_mean = route_durations.transform('mean')

# The sample standard deviation (ddof=1) is NaN for a route with a single
# observation. Treat that as zero spread so the score is 0 rather than NaN;
# otherwise the NaN would leak into the feature matrix, since `delay_zscore`
# is one of the package-level features.
route_std = route_durations.transform('std').fillna(0)

# The small epsilon keeps the division finite when every duration on a route
# is identical (std == 0).
merged_df['delay_zscore'] = (
    (merged_df['processing_duration_days'] - route_mean) / (route_std + 1e-6)
)



### Normalize the processing delay by the number of establishments the package passed through. This accounts for packages that take longer routes.

In [None]:
# Processing time per establishment visited; the +1 keeps the denominator
# non-zero when the establishment count is 0.
merged_df['delay_per_etab'] = merged_df['processing_duration_days'].div(
    merged_df['num_etablissements_package'].add(1)
)



### Create a route-step identifier by combining the current and next etablissement (postal establishment)

In [None]:
# Route-step key of the form "<current>→<next>"; the result is missing when
# either side is missing.
merged_df = merged_df.assign(
    pkg_route_step=lambda df: (
        df['etablissement_postal_package']
        + '→'
        + df['next_etablissement_postal_package']
    )
)



### Calculate how common each package route is in the dataset

In [None]:
# Share of rows carried by each route step (proportions over the non-missing
# route steps), then looked up per row.
pkg_route_freq = merged_df['pkg_route_step'].value_counts(normalize=True)
merged_df = merged_df.assign(
    pkg_route_freq=merged_df['pkg_route_step'].map(pkg_route_freq)
)



### Count how often each postal establishment appears as the current or next location in the routing (raw counts, not proportions)

In [None]:
# Raw occurrence count of every establishment, pooling its appearances as
# either the current or the next stop.
etab_freq = pd.concat(
    [
        merged_df[col]
        for col in (
            'etablissement_postal_package',
            'next_etablissement_postal_package',
        )
    ]
).value_counts()

# Look the pooled count up for both ends of each route step.
merged_df = merged_df.assign(
    current_etab_freq=merged_df['etablissement_postal_package'].map(etab_freq),
    next_etab_freq=merged_df['next_etablissement_postal_package'].map(etab_freq),
)


### Package-Level Features


In [None]:
# Package-level feature columns, grouped by theme. The concatenation order is
# the column order of the resulting feature list.
pkg_features = (
    # delay metrics
    ['processing_duration_days', 'delay_flag', 'delay_per_etab', 'delay_zscore']
    # routing metrics
    + ['num_etablissements_package', 'pkg_route_freq', 'current_etab_freq', 'next_etab_freq']
    # temporal metrics
    + ['hour', 'is_weekend']
)


## Receptacle Level

### receptacle route identifiers

In [None]:
# Receptacle route-step key of the form "<current>→<next>"; the result is
# missing when either side is missing.
merged_df = merged_df.assign(
    rec_route_step=lambda df: (
        df['etablissement_postal_receptacle']
        + '→'
        + df['next_etablissement_postal_receptacle']
    )
)




### Aggregate package-level statistics at the receptacle level to create features such as:
- Number of packages in receptacle
- Average and standard deviation of processing duration
- Average delay metrics
- Average package route frequency (stored as `avg_pkg_route_rarity`; lower values indicate rarer routes)

In [None]:
# One row per receptacle, summarising the package rows attached to it.
# NOTE(review): `count` tallies non-null MAILITM_FID rows, not distinct
# packages. If a package can appear on several rows of `merged_df`, `nunique`
# may be what is intended for `num_packages` - confirm.
# `avg_pkg_route_rarity` is the mean of `pkg_route_freq`, so a LOWER value
# means rarer routes.
receptacle_route_stats = (
    merged_df
    .groupby('RECPTCL_FID')
    .agg(
        rec_route=pd.NamedAgg(column='rec_route_step', aggfunc='first'),
        num_packages=pd.NamedAgg(column='MAILITM_FID', aggfunc='count'),
        avg_processing_days=pd.NamedAgg(column='processing_duration_days', aggfunc='mean'),
        std_processing_days=pd.NamedAgg(column='processing_duration_days', aggfunc='std'),
        avg_delay_per_etab=pd.NamedAgg(column='delay_per_etab', aggfunc='mean'),
        avg_pkg_route_rarity=pd.NamedAgg(column='pkg_route_freq', aggfunc='mean'),
    )
    .reset_index()
    # The std is undefined (NaN) for a receptacle with fewer than two
    # durations; record it as zero spread.
    .fillna({'std_processing_days': 0})
)


### Receptacle Route Frequency

Calculate how common each receptacle route is

In [None]:
# Share of receptacles travelling on each receptacle route, looked up per
# receptacle.
rec_route_freq = receptacle_route_stats['rec_route'].value_counts(normalize=True)
receptacle_route_stats = receptacle_route_stats.assign(
    rec_route_freq=receptacle_route_stats['rec_route'].map(rec_route_freq)
)


### Receptacle Flow Type Frequency

Map flow type frequencies to each receptacle

In [None]:
# One flow type per receptacle: the first non-null value seen in its rows.
rec_flow_type = (
    merged_df
    .groupby('RECPTCL_FID')['flow_type_receptacle']
    .first()
    .reset_index()
)

# Share of receptacles having each flow type.
flow_type_freq = rec_flow_type['flow_type_receptacle'].value_counts(normalize=True)
rec_flow_type['flow_type_freq'] = rec_flow_type['flow_type_receptacle'].map(flow_type_freq)

# Attach the frequency by key lookup rather than by merge. `rec_flow_type` has
# exactly one row per RECPTCL_FID, so the result matches the left merge, but
# re-running the cell simply overwrites the column. A repeated merge would
# produce `flow_type_freq_x` / `flow_type_freq_y` and break the feature list.
receptacle_route_stats['flow_type_freq'] = receptacle_route_stats['RECPTCL_FID'].map(
    rec_flow_type.set_index('RECPTCL_FID')['flow_type_freq']
)


### Receptacle Level Features

In [None]:
# Receptacle-level feature columns, grouped by theme. The concatenation order
# is the column order of the resulting feature list.
rec_features = (
    # volume and processing-time statistics
    ['num_packages', 'avg_processing_days', 'std_processing_days', 'avg_delay_per_etab']
    # route and flow-type frequencies
    + ['avg_pkg_route_rarity', 'rec_route_freq', 'flow_type_freq']
)
