In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the interim parsed packages and receptacle tables from their CSV files.
packages_df = pd.read_csv('../data/interim/interim_parsed_packages_df.csv', sep=',')
receptacle_df = pd.read_csv('../data/interim/interim_parsed_receptacle_df.csv', sep=',')

In [None]:
# Share of missing values per column of packages_df, expressed as a percentage.
packages_df.isna().mean().mul(100)

In [None]:
# etablissement_postal has very few missing values, so the affected rows are
# simply discarded: keep only the rows where it is present.
packages_df = packages_df.loc[packages_df['etablissement_postal'].notna()]

In [None]:
# Drop the columns that are not needed for the rest of the notebook.
# Each label is listed exactly once, one per line, so additions and removals
# are easy to review.
packages_df = packages_df.drop(
    columns=[
        'country_code',
        'serial_number',
        'RECPTCL_FID',
        'MAILITM_FID',
        'date',
        'processing_duration',
        'origin_country',
        'destination_country',
    ]
)

In [None]:
# Preview the first five rows of packages_df.
packages_df.head(5)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible across runs.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    packages_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Separate the target variable (processing_duration_days) from the features
# for both the training and the test split.
y_train = train_df['processing_duration_days']
X_train = train_df.drop(columns='processing_duration_days')
y_test = test_df['processing_duration_days']
X_test = test_df.drop(columns='processing_duration_days')

In [None]:
# Share of missing values per feature column of X_train, expressed as a percentage.
X_train.isna().mean().mul(100)

# investigate missing values in next_etablissement_postal

In [None]:
# Collect the training rows whose next_etablissement_postal is missing so the
# reason behind the missing values can be investigated; show how many there are.
missing_next_postal = X_train.loc[X_train['next_etablissement_postal'].isna()]
missing_next_postal.shape

In [None]:
# Check for a relation with day_of_week: count the rows with a missing
# next_etablissement_postal for each day_of_week value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=missing_next_postal, x='day_of_week', ax=ax)
ax.set_title('Distribution of day_of_week for missing next_etablissement_postal')
plt.show()

The missing values appear to be spread similarly across all days of the week, so we cannot conclude anything from day_of_week.

In [None]:
# Check for a relation with EVENT_TYPE_CD: count the rows with a missing
# next_etablissement_postal per event type, most frequent first.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=missing_next_postal,
    x='EVENT_TYPE_CD',
    order=missing_next_postal['EVENT_TYPE_CD'].value_counts().index,
    ax=ax,
)
ax.set_title('Distribution of EVENT_TYPE_CD for missing next_etablissement_postal')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
plt.show()

We calculate the percentage of the null values in next_etablissement_postal that is accounted for by
the 5 EVENT_TYPE_CD values contributing the most null values.

In [None]:
# Percentage of the rows with a missing next_etablissement_postal that fall
# under the five most frequent EVENT_TYPE_CD values.
top_5_event_types = missing_next_postal['EVENT_TYPE_CD'].value_counts().iloc[:5]
top_5_percentage = top_5_event_types.sum() / len(missing_next_postal) * 100
top_5_percentage

we find that the top 5 EVENT_TYPE_CD contribute to around 96% of the missing values in next_etablissement_postal

we see if origin_destination has any relation with missing next_etablissement_postal

In [None]:
# Check for a relation with origin_destination: count the rows with a missing
# next_etablissement_postal per origin_destination value, most frequent first.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=missing_next_postal,
    x='origin_destination',
    order=missing_next_postal['origin_destination'].value_counts().index,
    ax=ax,
)
ax.set_title('Distribution of origin_destination for missing next_etablissement_postal')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
plt.show()

In [None]:
# Percentage of the rows with a missing next_etablissement_postal that fall
# under the five most frequent origin_destination values.
top_5_origin_destination = missing_next_postal['origin_destination'].value_counts().iloc[:5]
top_5_percentage_origin_destination = top_5_origin_destination.sum() / len(missing_next_postal) * 100
top_5_percentage_origin_destination

the top 5 origin_destination contribute to around 83% of the missing values in next_etablissement_postal

investigate if service_indicator has any relation with missing next_etablissement_postal

In [None]:
# Check for a relation with service_indicator: count the rows with a missing
# next_etablissement_postal per service_indicator value, most frequent first.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=missing_next_postal,
    x='service_indicator',
    order=missing_next_postal['service_indicator'].value_counts().index,
    ax=ax,
)
ax.set_title('Distribution of service_indicator for missing next_etablissement_postal')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
plt.show()

it seems that service_indicator has a strong effect on missing next_etablissement_postal

We find that EVENT_TYPE_CD, origin_destination and service_indicator have a strong relation with missing next_etablissement_postal: certain values of these columns account for most of the missing values in next_etablissement_postal.

# Further domain-specific investigation is needed to find the reason behind the missing next_etablissement_postal values; for now we will keep the missing values as is

# handle categorical features

In [None]:
# Inspect the dtype of each feature column in the training set.
X_train.dtypes


In [None]:
# Columns treated as categorical features; show the number of distinct values
# each one takes in the training set.
categorical_features = [
    'etablissement_postal',
    'next_etablissement_postal',
    'EVENT_TYPE_CD',
    'origin_destination',
    'service_indicator',
    'day_of_week',
    'flow_type',
]
X_train.loc[:, categorical_features].nunique()

We notice that some categorical features have high cardinality, so using categorical encoding techniques like one-hot encoding would lead to very high-dimensional data.