# **Olist Customer Segmentation Modeling**

# Derived features

## Order processing time

Time features derived from the following `order`'s 5 dates:
1. $t_1$ = `purchase_timestamp`
2. $t_2$ = `approved_at`
3. $t_3$ = `delivered_carrier_date`
4. $t_4$ = `delivered_customer_date`
5. $t_5$ = `estimated_delivery_date`

From these 5 dates we derive the 10 pairwise intervals $dt_{ij} = t_j - t_i$ (in days), including the total time between the purchase and the delivery to the customer ($dt_{14}$):
* $dt_{12}$ = `approval_time` = $t_2 - t_1$
* $dt_{13}$ = `carrier_delivering_time`
* $dt_{14}$ = `customer_delivering_time`
* $dt_{15}$ = `processing_estimated_time`
* $dt_{23}$ = `approval_to_carrier_delivery_time`
* $dt_{24}$ = `approval_to_customer_delivery_time`
* $dt_{25}$ = `approval_to_estimated_delivery_time`
* $dt_{34}$ = `transit_time`
* $dt_{35}$ = `estimated_transit_time`
* $dt_{45}$ = `delivery_advance_time`


In [9]:
from pepper_commons import discrete_stats
from olist_commons import get_order_times
# Interval table for the default order selection, followed by its
# per-column summary (counts, unique values, missing values, dtypes).
order_times = get_order_times()
order_times_stats = discrete_stats(order_times)
display(order_times_stats)

Unnamed: 0,n,n_u,n_na,Filling rate,Shannon entropy,dtypes
approval_time,99281,33475,160,0.998391,0.337174,timedelta64[ns]
carrier_delivering_time,97658,88352,1783,0.98207,0.904708,timedelta64[ns]
customer_delivering_time,96476,93809,2965,0.970183,0.972356,timedelta64[ns]
processing_estimated_time,99441,96677,0,1.0,0.972205,timedelta64[ns]
approval_to_carrier_delivery_time,97644,87498,1797,0.981929,0.896092,timedelta64[ns]
approval_to_customer_delivery_time,96462,93853,2979,0.970043,0.972953,timedelta64[ns]
approval_to_estimated_delivery_time,99281,92338,160,0.998391,0.930067,timedelta64[ns]
transit_time,96475,92239,2966,0.970173,0.956092,timedelta64[ns]
estimated_transit_time,97658,81053,1783,0.98207,0.829968,timedelta64[ns]
delivery_advance_time,96476,91915,2965,0.970183,0.952724,timedelta64[ns]


In [8]:
from pepper_commons import discrete_stats
from olist_commons import (
    get_order_times,
    index_of_delivered_orders,
    index_of_undelivered_orders,
)

# Same interval table, restricted to the orders selected by
# index_of_delivered_orders().
# NOTE(review): the second argument of discrete_stats is presumably the
# label of the rendered table; the output recorded for this cell shows
# `order_times` instead, so it may predate this label - re-run to confirm.
delivered_orders_index = index_of_delivered_orders()
order_times = get_order_times(delivered_orders_index)
display(discrete_stats(order_times, 'delivered_order_times'))

Unnamed: 0_level_0,n,n_u,n_na,Filling rate,Shannon entropy,dtypes
order_times,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
approval_time,96464,32631,14,0.999855,0.338271,timedelta64[ns]
carrier_delivering_time,96476,87388,2,0.999979,0.9058,timedelta64[ns]
customer_delivering_time,96470,93803,8,0.999917,0.972354,timedelta64[ns]
processing_estimated_time,96478,93887,0,1.0,0.973144,timedelta64[ns]
approval_to_carrier_delivery_time,96462,86539,16,0.999834,0.89713,timedelta64[ns]
approval_to_customer_delivery_time,96456,93849,22,0.999772,0.972972,timedelta64[ns]
approval_to_estimated_delivery_time,96464,89848,14,0.999855,0.931415,timedelta64[ns]
transit_time,96469,92235,9,0.999907,0.95611,timedelta64[ns]
estimated_transit_time,96476,80125,2,0.999979,0.830517,timedelta64[ns]
delivery_advance_time,96470,91910,8,0.999917,0.952731,timedelta64[ns]


In [10]:
# Interval table restricted to the orders selected by
# index_of_undelivered_orders(), then its per-column summary.
undelivered_orders_index = index_of_undelivered_orders()
undelivered_order_times = get_order_times(undelivered_orders_index)
undelivered_order_times_stats = discrete_stats(
    undelivered_order_times, 'undelivered_order_times'
)
display(undelivered_order_times_stats)

Unnamed: 0_level_0,n,n_u,n_na,Filling rate,Shannon entropy,dtypes
undelivered_order_times,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
approval_time,2817,2008,146,0.950726,0.712815,timedelta64[ns]
carrier_delivering_time,1182,1180,1781,0.39892,0.998308,timedelta64[ns]
customer_delivering_time,6,6,2957,0.002025,1.0,timedelta64[ns]
processing_estimated_time,2963,2958,0,1.0,0.998313,timedelta64[ns]
approval_to_carrier_delivery_time,1182,1175,1781,0.39892,0.994078,timedelta64[ns]
approval_to_customer_delivery_time,6,6,2957,0.002025,1.0,timedelta64[ns]
approval_to_estimated_delivery_time,2817,2813,146,0.950726,0.99858,timedelta64[ns]
transit_time,6,6,2957,0.002025,1.0,timedelta64[ns]
estimated_transit_time,1182,1181,1781,0.39892,0.999154,timedelta64[ns]
delivery_advance_time,6,6,2957,0.002025,1.0,timedelta64[ns]


## Products volume and density

Physical features (volume and density) derived from the base physical features `weight_g`, `length_cm`, `height_cm` and `width_cm`.

Defined only for the **32 949** dimensioned products (of which 32 340 also have category, name, description and photo metadata).

In [12]:
from pepper_commons import discrete_stats
from olist_commons import (
    get_products,
    index_of_dimensioned_products
)
 
# Products table limited to the rows selected by
# index_of_dimensioned_products(), then its per-column summary.
dimensioned_products_index = index_of_dimensioned_products()
dimensioned_products = get_products(
    products_index=dimensioned_products_index
)
dimensioned_products_stats = discrete_stats(
    dimensioned_products, 'dimensioned_products'
)
display(dimensioned_products_stats)

Unnamed: 0_level_0,n,n_u,n_na,Filling rate,Shannon entropy,dtypes
dimensioned_products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
category_name,32340,73,609,0.981517,0.002257,object
name_lenght,32340,66,609,0.981517,0.002041,object
description_lenght,32340,2960,609,0.981517,0.091528,object
photos_qty,32340,19,609,0.981517,0.000588,object
weight_g,32949,2204,0,1.0,0.066891,float64
length_cm,32949,99,0,1.0,0.003005,float64
height_cm,32949,102,0,1.0,0.003096,float64
width_cm,32949,95,0,1.0,0.002883,float64


In [2]:
from pepper_commons import discrete_stats
from olist_commons import (
    index_of_dimensioned_products,
    get_product_physical_features,
)

# Derived physical features for the rows selected by
# index_of_dimensioned_products(), then their per-column summary.
# The variable name and label keep their original spelling on purpose:
# other cells may refer to them.
dimensioned_products_index = index_of_dimensioned_products()
dimensioned_products_physicial_features = get_product_physical_features(
    products_index=dimensioned_products_index
)
physical_features_stats = discrete_stats(
    dimensioned_products_physicial_features,
    'dimensioned_products_physicial_features'
)
display(physical_features_stats)

Unnamed: 0_level_0,n,n_u,n_na,Filling rate,Shannon entropy,dtypes
dimensioned_products_physicial_features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
volume_cm^3,32949,4525,0,1.0,0.137333,float64
density_g_cm^-3,32949,13659,0,1.0,0.41455,float64


## Customer features

### Order count

Reminder :
* 97% of customers only ordered once
* 3% ordered twice
* Less than 0.25% of customers ordered more than twice.
    * Many of these customers have ordered up to 7 times.
    * One customer ordered 9 times, and another 17 times.
* Every customer has ordered at least once (no leads).
* We verify that $\sum{n_c} = 96\,096$ and $\sum{n_c n_o} = 99\,441$

In [1]:
from olist_commons import get_customer_order_counts
# Order counts per customer, shown with the largest counts first.
customer_order_counts = get_customer_order_counts()
order_counts_descending = customer_order_counts.sort_values(ascending=False)
display(order_counts_descending)

customer_id
8d50f5eadf50201ccdcedfb9e2ac8455    17
3e43e6105506432c953e165fb2acf44c     9
6469f99c1f9dfae7733b25662e7f1782     7
ca77025e7201e3b30c44b472ff346268     7
1b6c7548a2a1f9037c1fd3ddfed95f33     7
                                    ..
5657dfebff5868c4dc7e8355fea865c4     1
5657596addb4d7b07b32cd330614bdf8     1
5656eb169546146caeab56c3ffc3d268     1
5656a8fabc8629ff96b2bc14f8c09a27     1
ffffd2657e2aad2907e67c3e9daecbeb     1
Name: order_id, Length: 96096, dtype: int64

# Dataset selections

## Between dates

### First and last dates

In [1]:
from olist_commons import (
    get_first_order_date,
    get_last_order_date,
    index_of_delivered_orders
)

# First and last order dates with the helpers' default selection.
# To trim the window, add / subtract e.g. pd.Timedelta('3 day') here.
from_date = get_first_order_date()
to_date = get_last_order_date()
display(from_date)
display(to_date)

# Same two bounds, computed only over the orders selected by
# index_of_delivered_orders().
delivered_orders_index = index_of_delivered_orders()
first_delivered_order_date = get_first_order_date(delivered_orders_index)
display(first_delivered_order_date)
last_delivered_order_date = get_last_order_date(delivered_orders_index)
display(last_delivered_order_date)

Timestamp('2016-09-04 21:15:19')

Timestamp('2018-10-17 17:30:18')

Timestamp('2016-09-15 12:16:38')

Timestamp('2018-08-29 15:00:37')

### Age in days of a dated order event

In [3]:
from olist_commons import (
    get_first_order_date,
    get_last_order_date,
    index_of_delivered_orders,
    index_of_undelivered_orders,
    get_order_event_ages
)
import datetime

def _print_age_range(oldest_label, newest_label, ages):
    """Print the largest age under `oldest_label`, then the smallest one.

    The largest age belongs to the earliest event and the smallest age to
    the most recent one.
    """
    print(oldest_label, ages.max())
    print(newest_label, ages.min())


# 1) Ages computed with every argument left to its default.
# NOTE(review): the microsecond fraction in the recorded output suggests
# the default reference date is the current time - confirm in
# get_order_event_ages.
default_ages = get_order_event_ages()
_print_age_range('Earliest purchase:', 'Most recent purchase:', default_ages)

# 2) Same ages, limited to the orders selected by
# index_of_delivered_orders().
delivered_orders_index = index_of_delivered_orders()
delivered_orders_purchase_age = get_order_event_ages(
    orders_index=delivered_orders_index
)
_print_age_range(
    'Earliest purchase of a delivered order:',
    'Most recent purchase of a delivered order:',
    delivered_orders_purchase_age
)

# 3) Ages of the 'approval' event between 2016-01-01 and 2016-12-31,
# measured from 2017-12-31.
# NOTE: this selection is restricted to index_of_undelivered_orders(),
# which the printed labels do not mention.
ages_3 = get_order_event_ages(
    present_date=datetime.datetime(2017, 12, 31),
    orders_index=index_of_undelivered_orders(),
    from_date=datetime.datetime(2016, 1, 1),
    to_date=datetime.datetime(2016, 12, 31),
    event='approval'
)
_print_age_range(
    'Earliest order approval of 2016, as of 12/31/2017:',
    'Most recent order approval of 2016, as of 12/31/2017:',
    ages_3
)


Earliest purchase: 2318 days 23:24:30.181677
Most recent purchase: 1546 days 03:09:31.181677
Earliest purchase of a delivered order: 2308 days 08:23:11.181677
Most recent purchase of a delivered order: 1595 days 05:39:12.181677
Earliest order approval of 2016, as of 12/31/2017: 452 days 13:41:03
Most recent order approval of 2016, as of 12/31/2017: 438 days 08:40:49


## Customer table with R, F, M features

### Customer order recency

### Customer RFM

In [3]:
import pandas as pd
from olist_commons import (
    get_first_order_date,
    get_last_order_date,
    get_customer_RFM,
)

# Date window for the RFM table: first and last order dates.
# To trim the window, add / subtract e.g. pd.Timedelta('3 day') here.
from_date = get_first_order_date()
to_date = get_last_order_date()

# NOTE(review): the recorded run of this cell ended with
# "TypeError: '>=' not supported between instances of 'str' and
# 'Timestamp'". Both bounds are displayed as Timestamps elsewhere in this
# notebook, so the str operand presumably comes from a date column that
# get_customer_RFM compares without parsing - confirm in olist_commons.
crfm = get_customer_RFM(from_date, to_date)
display(crfm)

# Drop outliers: keep only customers with F below 8 and M below 10 000.
is_not_outlier = (crfm.F < 8) & (crfm.M < 10_000)
crfm_1 = crfm[is_not_outlier]

TypeError: '>=' not supported between instances of 'str' and 'Timestamp'