In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

### Load the clean merged data

In [None]:
# Read the interim merged packages/receptacles table (comma-separated CSV).
interim_csv_path = '../data/interim/interim_merged_packages_receptacle_df.csv'
merged_df = pd.read_csv(interim_csv_path, sep=',')

- Print the first 10 rows

In [None]:
# Quick look at the first 10 rows of merged_df to sanity-check the load.
preview_rows = merged_df.head(10)
print(preview_rows)

In [None]:
# Remove the 'Unnamed: 0_*' columns (presumably saved-index leftovers -- TODO
# confirm) and the per-item identifier columns; none of them is part of the
# feature list built further down.
# errors='ignore' keeps this cell re-runnable: merged_df is rebound here, so a
# second execution would otherwise raise KeyError on already-dropped columns.
merged_df = merged_df.drop(
    columns=[
        'Unnamed: 0_package',
        'Unnamed: 0_receptacle',
        'RECPTCL_FID', 'MAILITM_FID', 'serial_number',
    ],
    errors='ignore',
)

In [None]:

# Sort chronologically.
# 'date_package' has not been converted to datetime yet at this point (the CSV
# is read without date parsing), so sort on the parsed timestamps instead of on
# the raw values: string order only matches time order when every value uses
# the same ISO-like format.
merged_df = (
    merged_df
    .sort_values('date_package', key=pd.to_datetime)
    .reset_index(drop=True)
)

In [None]:

df_clean = merged_df.copy()

# Parse whichever of the timestamp columns are present into datetime values.
date_cols = ['date_package', 'date_receptacle']
present_date_cols = [name for name in date_cols if name in df_clean.columns]
for name in present_date_cols:
    df_clean[name] = pd.to_datetime(df_clean[name])
        
# Extract temporal features
def extract_datetime_features(df, date_col):
    """Add calendar-part columns derived from a datetime column.

    Writes ``<date_col>_year``, ``<date_col>_month``, ``<date_col>_day``,
    ``<date_col>_hour`` and ``<date_col>_dayofweek`` onto ``df`` itself
    (the frame is modified in place), using the ``.dt`` accessor.

    Args:
        df: DataFrame containing ``date_col``; the column must support ``.dt``.
        date_col: Name of the datetime column to decompose.

    Returns:
        The same DataFrame object that was passed in, with the new columns.
    """
    timestamps = df[date_col].dt
    for part in ('year', 'month', 'day', 'hour', 'dayofweek'):
        df[f'{date_col}_{part}'] = getattr(timestamps, part)
    return df

# Add the calendar-part columns for every date column that is present.
for date_col in date_cols:
    if date_col in df_clean.columns:
        df_clean = extract_datetime_features(df_clean, date_col)

# Convert durations to numeric seconds.
# The guard is "not already numeric" rather than "dtype == 'object'": text read
# from a CSV is not guaranteed to be reported as object dtype (pandas also has
# dedicated string dtypes), and a column that is already timedelta64 needs the
# conversion too. Numeric columns are left untouched, as before.
duration_cols = ['processing_duration_package', 'processing_duration_receptacle']
for col in duration_cols:
    if col in df_clean.columns and not pd.api.types.is_numeric_dtype(df_clean[col]):
        df_clean[col] = pd.to_timedelta(df_clean[col]).dt.total_seconds()

- For the service indicator, remember to map every code that exists in the data

In [None]:


# Work on a copy so df_clean is not modified by the feature engineering.
df_features = df_clean.copy()

# 1. Route consistency: 1 when the package and receptacle carry the same
#    origin/destination value, 0 otherwise.
routes_match = df_features['origin_destination_package'].eq(
    df_features['origin_destination_receptacle']
)
df_features['same_origin_destination'] = routes_match.astype(int)

# 2. Service type mapping (S10-12)
# Maps each two-letter service indicator to a numeric service code.
service_mapping = {
    # Domestic / private use
    'AP': 8, 'AR': 8, 'AS': 8,
    # Registered Letter Post (Priority 2)
    'BC': 2,
    # Parcel Post (Standard)
    'CA': 5,
    # Lower-priority Parcel / Letter Goods
    'CB': 4, 'CC': 4, 'CD': 4, 'CE': 4, 'CF': 4, 'CG': 4, 'CH': 4,
    # Parcel Post insured or special
    'CI': 6, 'CJ': 6, 'CK': 6, 'CL': 6, 'CM': 6, 'CN': 6, 'CO': 6, 'CP': 6,
    'CQ': 6, 'CR': 6, 'CS': 6, 'CU': 6, 'CV': 6, 'CX': 6, 'CY': 6,
    # Tracked Letter Post (Priority 3)
    'LA': 3, 'LB': 3, 'LD': 3, 'LE': 3, 'LF': 3, 'LG': 3, 'LH': 3, 'LI': 3,
    'LJ': 3, 'LK': 3, 'LL': 3, 'LM': 3, 'LN': 3, 'LP': 3, 'LR': 3, 'LS': 3,
    'LV': 3, 'LW': 3, 'LX': 3, 'LY': 3, 'LZ': 3
}
df_features['service_code_package'] = df_features['service_indicator'].map(service_mapping)

# Indicators that are absent from service_mapping map to NaN, and NaN is later
# replaced by 0 when the feature matrix is built. Report them here so a missing
# mapping entry is visible instead of silently turning into code 0.
unmapped_mask = (
    df_features['service_code_package'].isna()
    & df_features['service_indicator'].notna()
)
if unmapped_mask.any():
    unmapped_codes = sorted(
        df_features.loc[unmapped_mask, 'service_indicator'].astype(str).unique()
    )
    print(
        f"Warning: {int(unmapped_mask.sum())} rows have a service indicator "
        f"missing from service_mapping: {unmapped_codes}"
    )

# 4. Geographic features: True when origin and destination countries differ.
df_features['is_international'] = df_features['origin_country_package'].ne(
    df_features['destination_country_package']
)

# 5. Frequency encoding: each category is replaced by its share of the rows
#    (value_counts(normalize=True)) and stored in a '<column>_freq' column.
freq_encode_cols = [
    'flow_type_package',
    'flow_type_receptacle',
    'etablissement_postal_package',
    'next_etablissement_postal_package',
    'etablissement_postal_receptacle',
    'next_etablissement_postal_receptacle',
    'EVENT_TYPE_CD_package',
    'EVENT_TYPE_CD_receptacle',
]

for col in freq_encode_cols:
    category_share = df_features[col].value_counts(normalize=True)
    df_features[f'{col}_freq'] = df_features[col].map(category_share)

# 6. Columns used for clustering: the engineered base features followed by
#    one '<column>_freq' entry per frequency-encoded column.
base_cluster_features = [
    'service_code_package',
    'same_origin_destination',
    'num_etablissements_package',
    'num_etablissements_receptacle',
    'processing_duration_package',
    'processing_duration_receptacle',
    'is_international',
]
cluster_features = base_cluster_features + [f'{name}_freq' for name in freq_encode_cols]

# Final feature matrix; missing values are replaced with 0.
X = df_features[cluster_features].fillna(0)

# Standardised copy of the feature matrix for the distance-based models.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Clustering algorithms

## 1. K-Means

In [None]:
# Choose the number of clusters by maximising the silhouette score
from sklearn.metrics import silhouette_score

# K-Means and the silhouette score are both distance based, so they are run on
# the standardised matrix (X_scaled). On the raw X the columns with the largest
# numeric range would dominate every distance.
# NOTE: silhouette_score computes pairwise distances; on a very large frame
# consider its sample_size argument.
silhouette_scores = []
k_range = range(2, 10)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, labels))

# Plot the silhouette score for each candidate k
plt.figure(figsize=(10, 6))
plt.plot(k_range, silhouette_scores, 'bo-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Optimal Number of Clusters')
plt.show()

# Apply K-Means with the k that achieved the highest silhouette score
optimal_k = k_range[int(np.argmax(silhouette_scores))]
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df_features['cluster_kmeans'] = kmeans.fit_predict(X_scaled)

# Analyze clusters: per-cluster means, reported in the original feature units
cluster_summary = df_features.groupby('cluster_kmeans').agg({
    'same_origin_destination': 'mean',
    'processing_duration_package': 'mean',
    'num_etablissements_package': 'mean'
})
print("Cluster Summary:")
print(cluster_summary)

## 2. DBSCAN

In [None]:
from sklearn.neighbors import NearestNeighbors

# DBSCAN parameters. eps is a fixed starting value -- TODO: tune it from the
# knee of the k-distance graph plotted below.
dbscan_min_samples = 5
dbscan_eps = 0.5

# K-distance graph to help choose eps.
# Both the neighbour search and DBSCAN run on the standardised matrix
# (X_scaled): eps is a single distance threshold applied across all features,
# so the features need a comparable scale.
neighbors = NearestNeighbors(n_neighbors=dbscan_min_samples)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)
# Keep the distance to the farthest of the returned neighbours, sorted ascending
distances = np.sort(distances[:, -1], axis=0)

plt.figure(figsize=(10, 6))
plt.plot(distances)
plt.xlabel('Points sorted by distance')
plt.ylabel('5th Nearest Neighbor Distance')
plt.title('K-Distance Graph for DBSCAN')
plt.show()

# Apply DBSCAN
dbscan = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
df_features['cluster_dbscan'] = dbscan.fit_predict(X_scaled)

# Analyze DBSCAN results (label -1 marks noise points, so it is not a cluster)
n_dbscan_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
n_dbscan_outliers = int((dbscan.labels_ == -1).sum())
print(f"Number of clusters found: {n_dbscan_clusters}")
print(f"Number of outliers: {n_dbscan_outliers}")

# Anomaly detection

In [None]:
# Standardise the feature matrix for the anomaly detectors
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Isolation Forest; contamination is the expected proportion of outliers
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42,
)

# NOTE: despite its name, 'anomaly_score' holds the fit_predict labels
# (-1 for outliers, 1 for inliers), not a continuous score.
iso_labels = iso_forest.fit_predict(X_scaled)
df_features['anomaly_score'] = iso_labels
df_features['is_anomaly'] = df_features['anomaly_score'].eq(-1)

# Summarise the rows flagged as anomalies
anomalies = df_features.loc[df_features['is_anomaly']]
print(f"Detected {len(anomalies)} anomalies")
print("\nAnomaly characteristics:")
print(anomalies[cluster_features].describe())

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor, fitted and applied on the same data (novelty=False)
lof = LocalOutlierFactor(novelty=False, contamination=0.05)

# NOTE: 'lof_score' holds the fit_predict labels (-1 for outliers, 1 for
# inliers), not the continuous outlier factor.
lof_labels = lof.fit_predict(X_scaled)
df_features['lof_score'] = lof_labels
df_features['is_lof_anomaly'] = df_features['lof_score'].eq(-1)

# Cross-tabulate the two detectors: rows = Isolation Forest, columns = LOF
flag_pair_counts = df_features.groupby(['is_anomaly', 'is_lof_anomaly']).size()
anomaly_comparison = flag_pair_counts.unstack()
print("Anomaly Detection Comparison:")
print(anomaly_comparison)

In [None]:
# 2D visualisation using PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Only two panels are drawn, so use a 1x2 grid with explicit axes
fig, (ax_clusters, ax_anomalies) = plt.subplots(1, 2, figsize=(15, 5))

# Panel 1: points coloured by K-Means cluster label
scatter = ax_clusters.scatter(
    X_pca[:, 0], X_pca[:, 1],
    c=df_features['cluster_kmeans'],
    cmap='viridis', alpha=0.6,
)
fig.colorbar(scatter, ax=ax_clusters)
ax_clusters.set_title('K-Means Clusters')
ax_clusters.set_xlabel('PC 1')
ax_clusters.set_ylabel('PC 2')

# Panel 2: Isolation Forest anomalies in red, all other points in blue
colors = np.where(df_features['is_anomaly'], 'red', 'blue').tolist()
ax_anomalies.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, alpha=0.6)
ax_anomalies.set_title('Anomalies (Isolation Forest)')
ax_anomalies.set_xlabel('PC 1')
ax_anomalies.set_ylabel('PC 2')

fig.tight_layout()
plt.show()



In [None]:
def generate_insights(df_features, max_processing_days=15, min_establishment_count=3):
    """Build a list of human-readable findings from the feature table.

    Three checks are run; each one appends a sentence only when it finds at
    least one case:

    1. rows whose ``processing_duration_package`` exceeds
       ``max_processing_days`` (the column is compared against that many days
       expressed in seconds);
    2. rows whose ``same_origin_destination`` equals 0;
    3. values of ``etablissement_postal_package`` that occur in fewer than
       ``min_establishment_count`` rows.

    Args:
        df_features: DataFrame with the three columns named above.
        max_processing_days: Threshold, in days, above which a processing
            duration is reported as unusually long. Defaults to 15.
        min_establishment_count: An establishment appearing in fewer rows than
            this is reported as rarely used. Defaults to 3.

    Returns:
        list[str]: One sentence per check that found something; empty when
        nothing was found.
    """
    insights = []

    # Unusual processing times
    max_processing_seconds = max_processing_days * 24 * 60 * 60
    n_slow = int((df_features['processing_duration_package'] > max_processing_seconds).sum())
    if n_slow > 0:
        insights.append(f"Found {n_slow} items with unusually long processing times")

    # Route inconsistencies
    n_mismatched = int((df_features['same_origin_destination'] == 0).sum())
    if n_mismatched > 0:
        insights.append(f"Found {n_mismatched} cases with mismatched package-receptacle routes")

    # Establishment patterns: number of rows per establishment
    establishment_counts = df_features.groupby('etablissement_postal_package').size()
    n_rare = int((establishment_counts < min_establishment_count).sum())
    if n_rare > 0:
        insights.append(f"Found {n_rare} rarely used postal establishments")

    return insights

# Print the findings as a numbered list, starting at 1.
insights = generate_insights(df_features)
numbered_insights = [f"{rank}. {text}" for rank, text in enumerate(insights, start=1)]
for numbered_line in numbered_insights:
    print(numbered_line)