# modules

In [1]:
import math
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from lazypredict.Supervised import LazyClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV



# utils

In [2]:
def haversine_formula(row):
    """Return the great-circle distance in kilometres between two points.

    Args:
        row: Mapping-like object (a DataFrame row, a dict, ...) exposing the
            keys 'lat1', 'lon1', 'lat2' and 'lon2', all in degrees.

    Returns:
        The haversine distance in kilometres, computed with an Earth radius
        of 6371 km.
    """
    earth_radius_km = 6371.0
    lat1_rad = math.radians(row['lat1'])
    lat2_rad = math.radians(row['lat2'])
    dlat = lat2_rad - lat1_rad
    dlon = math.radians(row['lon2'] - row['lon1'])

    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise
    # "math domain error". Clamp before taking the square roots.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return earth_radius_km * c

In [3]:
# Load the wilaya lookup table used by the later `location[...]` lookups.
# JSON text is UTF-8; reading it with the platform default encoding (for
# example cp1252 on Windows) would corrupt accented wilaya names such as
# 'Ghardaïa' and make those lookups fail with KeyError.
with open('./location.json', 'r', encoding='utf-8') as f:
    location = json.load(f)

# data

## load

In [4]:
# Read the training and test tables from the local data directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_data.csv', './data/test.csv')
)

## explore

In [6]:
# Show the first training row to check the column layout.
df_train.head(1)

Unnamed: 0,Tracking,expedition,tentative,livraison/échec,statut,ID expediteur,depart commune,depart wilaya,stop desk?,destination commune,destination wilaya,label
0,yal-ABC240400,2024-01-10 18:18:53,2024-01-12 16:46:52,2024-01-12 16:46:52,2024-01-12 16:46:52,expyal-11571,Oued Smar,Alger,oui,El Menia,Ghardaïa,Livré


In [7]:
# Column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271481 entries, 0 to 271480
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Tracking             271481 non-null  object
 1   expedition           271481 non-null  object
 2   tentative            270360 non-null  object
 3   livraison/échec      270332 non-null  object
 4   statut               271481 non-null  object
 5   ID expediteur        271481 non-null  object
 6   depart commune       271481 non-null  object
 7   depart wilaya        271481 non-null  object
 8   stop desk?           271481 non-null  object
 9   destination commune  271481 non-null  object
 10  destination wilaya   271481 non-null  object
 11  label                271481 non-null  object
dtypes: object(12)
memory usage: 24.9+ MB


In [8]:
# Summary statistics of the training columns.
df_train.describe()

Unnamed: 0,Tracking,expedition,tentative,livraison/échec,statut,ID expediteur,depart commune,depart wilaya,stop desk?,destination commune,destination wilaya,label
count,271481,271481,270360,270332,271481,271481,271481,271481,271481,271481,271481,271481
unique,271481,42188,153353,194983,184807,15512,116,48,2,1365,48,12
top,yal-ABC240400,2024-01-08 15:10:17,2024-01-12 00:03:28,2024-01-15 11:54:57,2024-01-20 17:34:41,expyal-17226,Oued Smar,Alger,oui,Constantine,Alger,Livré
freq,1,354,30,18,703,17384,58347,115959,144903,9204,31955,236293


## preprocessing

### time

In [5]:
# Timestamp columns: parse them, split the shipping time into calendar parts,
# and express every later event as the number of days elapsed since shipping.
times = ['expedition', 'tentative', 'livraison/échec', 'statut']
seconds_per_day = 24 * 60 * 60

for frame in (df_train, df_test):
    for t in times:
        frame[t] = pd.to_datetime(frame[t])

        if t != 'expedition':
            # 'expedition' comes first in `times`, so it is already parsed here.
            elapsed = frame[t] - frame['expedition']
            frame[t + '_duration'] = elapsed.dt.total_seconds() / seconds_per_day
            continue

        for part in ('year', 'month', 'day', 'hour'):
            frame[t + '_' + part] = getattr(frame[t].dt, part)

### distance

In [6]:
# Attach departure (suffix 1) and destination (suffix 2) coordinates looked up
# by wilaya name, then apply haversine_formula to every row.
coordinate_sources = [
    ('lat1', 'depart wilaya', 'lat'),
    ('lat2', 'destination wilaya', 'lat'),
    ('lon1', 'depart wilaya', 'lon'),
    ('lon2', 'destination wilaya', 'lon'),
]

for frame in (df_train, df_test):
    for target_col, wilaya_col, coord_key in coordinate_sources:
        # Bind coord_key as a default so each lambda keeps its own key.
        frame[target_col] = frame[wilaya_col].map(
            lambda name, coord_key=coord_key: location[name][coord_key]
        )
    frame['distance'] = frame.apply(haversine_formula, axis=1)

### reduce

In [7]:
# List the columns available after the time and distance steps.
df_train.columns

Index(['Tracking', 'expedition', 'tentative', 'livraison/échec', 'statut',
       'ID expediteur', 'depart commune', 'depart wilaya', 'stop desk?',
       'destination commune', 'destination wilaya', 'label', 'expedition_year',
       'expedition_month', 'expedition_day', 'expedition_hour',
       'tentative_duration', 'livraison/échec_duration', 'statut_duration',
       'lat1', 'lat2', 'lon1', 'lon2', 'distance'],
      dtype='object')

In [8]:
# Columns kept as model inputs.
features = [
    # shipment attributes
    'ID expediteur',
    'depart wilaya',
    'destination wilaya',
    'stop desk?',
    # calendar parts of the shipping timestamp
    'expedition_month',
    'expedition_day',
    'expedition_hour',
    # days elapsed since shipping for each later event
    'tentative_duration',
    'livraison/échec_duration',
    'statut_duration',
    # distance between departure and destination wilaya
    'distance',
]

In [9]:
# Keep only the model inputs (plus the target for the training frame).
# `.copy()` makes each result an independent frame, so the column assignments
# in the following cells write to it directly instead of to a slice of the
# original frame (which triggers SettingWithCopyWarning).
df_train = df_train[features + ['label']].copy()
df_test = df_test[features].copy()

### wilaya

In [10]:
# Replace every wilaya name with its 'pos' value from the location lookup.
for wilaya_col in ('depart wilaya', 'destination wilaya'):
    df_train[wilaya_col] = df_train[wilaya_col].map(lambda x: location[x]['pos'])
    df_test[wilaya_col] = df_test[wilaya_col].map(lambda x: location[x]['pos'])

# One-hot encode both wilaya columns: append the indicator columns, then drop
# the encoded source column.
for wilaya_col, prefix in (('depart wilaya', 'depart_wilaya'),
                           ('destination wilaya', 'destination_wilaya')):
    df_train_encoded = pd.get_dummies(df_train[wilaya_col], prefix=prefix)
    df_train = pd.concat([df_train, df_train_encoded], axis=1).drop(wilaya_col, axis=1)

    df_test_encoded = pd.get_dummies(df_test[wilaya_col], prefix=prefix)
    df_test = pd.concat([df_test, df_test_encoded], axis=1).drop(wilaya_col, axis=1)

# get_dummies only creates columns for values present in the frame it is given,
# so the two frames can end up with different indicator columns. Give the test
# frame exactly the training feature columns, in the same order: indicators
# missing from the test data become all-zero columns, and indicators never seen
# in training are dropped.
df_test = df_test.reindex(columns=df_train.columns.drop('label'), fill_value=0)

### expediteur

In [11]:
# Frequency-encode the sender: each sender ID is replaced by the number of
# training rows it appears in.
expediteurs = dict(df_train['ID expediteur'].value_counts())

df_train['ID expediteur'] = df_train['ID expediteur'].map(expediteurs)
# Senders absent from the training data have no count and map to NaN; use 0.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is a chained in-place update that pandas Copy-on-Write
# turns into a no-op on the parent frame.
df_test['ID expediteur'] = df_test['ID expediteur'].map(expediteurs).fillna(0)

df_train = df_train.rename(columns={'ID expediteur': 'expediteur'})
df_test = df_test.rename(columns={'ID expediteur': 'expediteur'})

### stop desk

In [12]:
# Encode the yes/no stop-desk flag as 1/0.
# The mapped column is assigned back to the frame: replace(inplace=True) on a
# selected column is a chained in-place update that pandas Copy-on-Write turns
# into a no-op on the parent frame. `map` is used instead of `replace` so the
# numeric dtype does not depend on replace's deprecated object-dtype
# downcasting.
# NOTE: any value other than 'oui'/'non' becomes NaN here, whereas `replace`
# left such values unchanged.
stop_desk_codes = {
    'oui': 1,
    'non': 0
}

df_train['stop desk?'] = df_train['stop desk?'].map(stop_desk_codes)
df_test['stop desk?'] = df_test['stop desk?'].map(stop_desk_codes)

df_train = df_train.rename(columns={'stop desk?': 'stop_desk'})
df_test = df_test.rename(columns={'stop desk?': 'stop_desk'})

### cleaning

In [13]:
# Fill missing durations with the column mean.
# - The filled column is assigned back to the frame: fillna(inplace=True) on a
#   selected column is a chained in-place update that pandas Copy-on-Write
#   turns into a no-op on the parent frame.
# - The mean is computed on the training data and reused for the test data, so
#   both frames are imputed with the same value (the original used the test
#   frame's own mean for the test rows).
for duration_col in ('tentative_duration', 'livraison/échec_duration'):
    train_mean = df_train[duration_col].mean()
    df_train[duration_col] = df_train[duration_col].fillna(train_mean)
    df_test[duration_col] = df_test[duration_col].fillna(train_mean)

# X and Y

In [14]:
# Separate the target column from the predictors; the test frame is used as-is.
X, y = df_train.drop(columns=['label']), df_train['label']
X_test = df_test

In [15]:
# Turn the string labels into integer class ids. The fitted encoder is kept so
# predictions can be decoded back to label strings later.
encoder = LabelEncoder()
encoder.fit(y)
y = encoder.transform(y)

# Training

In [36]:
# Baseline model: an XGBClassifier created with no arguments, fitted on the
# full training set (no rows are held out).
xgb_classifier = XGBClassifier()

xgb_classifier.fit(X, y)

In [44]:
# Hyper-parameter search for a multi-class XGBoost model.
#
# The search estimator has its own name on purpose. GridSearchCV fits clones of
# the estimator it is given, so that object stays unfitted; binding it to
# `xgb_classifier` would replace the fitted baseline model with an unfitted one
# and break the prediction cells below on a top-to-bottom run.
#
# The class count is not passed: `num_classes` is not an XGBoost parameter (the
# native name is `num_class`), and XGBClassifier derives the class count from
# the labels it is fitted on.
#
# XGBClassifier is already imported in the imports cell at the top.
xgb_search_estimator = XGBClassifier(objective='multi:softmax', eval_metric='mlogloss', random_state=42)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# 4-fold cross-validated grid search scored on accuracy (n_jobs=-1: parallel).
grid_search = GridSearchCV(estimator=xgb_search_estimator, param_grid=param_grid, cv=4, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

# Print the best parameters and the corresponding cross-validated accuracy.
# To use the tuned model for the predictions below, rebind explicitly:
#     xgb_classifier = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

In [37]:
# Accuracy on the same X/y the model was fitted on (training accuracy), so this
# is not an estimate of performance on unseen data.
accuracy_score(y, xgb_classifier.predict(X))

0.9999152795223238

In [38]:
# Predict the (label-encoded) class of every test row.
predictions = xgb_classifier.predict(X_test)

# submission

In [39]:
# Build the submission: tracking ids re-read from the raw test file, paired
# with the predictions decoded back to label strings.
tracking_ids = pd.read_csv('./data/test.csv')['Tracking']
predicted_labels = encoder.inverse_transform(predictions)

sub = pd.DataFrame({'Tracking': tracking_ids, 'dernier statut': predicted_labels})
sub.to_csv('submission.csv', index=False)

In [40]:
# Count how often each label is predicted in the submission.
sub['dernier statut'].value_counts()

Livré                  59211
Retourné au vendeur     8424
Transfert                116
Vers Wilaya               75
Retour à retirer          36
Expédié                    7
Retour groupé              2
Name: dernier statut, dtype: int64