In [1]:
import sys
sys.path.append('/Users/efraflores/Desktop/hub/cornershop/venv/lib/python3.9/site-packages')

# Days between orders

In [2]:
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Brands/Requests'
FILE_BASE_NAME = 'dbo_82'

In [3]:
import re
import os

FILE_NAME_LIST = [x for x in os.listdir(BASE_DIR) if re.search(fr'{FILE_BASE_NAME}_\d+\.csv',x)!=None]
print(sorted(FILE_NAME_LIST))

['dbo_82_210101.csv', 'dbo_82_210102.csv', 'dbo_82_210103.csv', 'dbo_82_210104.csv', 'dbo_82_210201.csv', 'dbo_82_210202.csv', 'dbo_82_210203.csv', 'dbo_82_210204.csv', 'dbo_82_210301.csv', 'dbo_82_210302.csv', 'dbo_82_210303.csv', 'dbo_82_210304.csv', 'dbo_82_210401.csv', 'dbo_82_210402.csv', 'dbo_82_210500.csv', 'dbo_82_210501.csv', 'dbo_82_210502.csv', 'dbo_82_210601.csv', 'dbo_82_210602.csv', 'dbo_82_2107.csv']


## Functions

### Timing and tone

In [4]:
import time
import numpy as np
from IPython.lib.display import Audio

start = time.time()
def time_exp(x):
    minutes, seconds = np.floor(x/60), 60*(x/60-np.floor(x/60))
    print(f"{'{:.0f}'.format(minutes)} minutos con {'{:.2f}'.format(seconds)} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate=framerate, autoplay=True)

## Import

In [5]:
import pandas as pd

df = pd.DataFrame()
for file_name in FILE_NAME_LIST:
    df = df.append(pd.read_csv(os.path.join(BASE_DIR,file_name)),ignore_index=True)
print(len(df))
display(df.sample())

25142


Unnamed: 0,order_date,order_id,user_id,requested,found,sales
24938,2021/03/27,30302985,12585573,2.0,2.0,190.0


## Transform

### Date variables

In [6]:
df['order_date'] = pd.to_datetime(df['order_date'])
df[['year','week','weekday']] = df['order_date'].dt.isocalendar()
df['month'] = df['order_date'].dt.month
df['bimonth'] = df['month'].map(dict(zip(range(1,13),[x//2+1 for x in range(12)])))
df['year_month'] = df['year'].astype(str)+'_'+df['month'].apply(lambda x: str(x).zfill(2))
df.sample()

Unnamed: 0,order_date,order_id,user_id,requested,found,sales,year,week,weekday,month,bimonth,year_month
17554,2021-01-05,25221760,9838195,1.0,1.0,73.0,2021,1,2,1,1,2021_01


### Proper order

In [7]:
df.sort_values(['user_id','order_date','order_id'],inplace=True)
df.reset_index(drop=True, inplace=True)

### Order number

In [8]:
df['n_order'] = df.groupby(['user_id'])['order_date'].rank()

### Last order date

In [9]:
df['last_order_date'] = df.groupby(['user_id'])['order_date'].shift()

### Days between orders

In [10]:
df['dbo'] = (df['order_date']-df['last_order_date']).apply(lambda x: x.days)

### Days between orders range

In [11]:
df['dbo_cut'] = pd.cut(df['dbo'],bins=[-1,0,15,30,60,200])
df['dbo_cut'] = df['dbo_cut'].apply(lambda x:str(int(x.left)+1).zfill(2)+' a '+str(int(x.right)).zfill(2))

### n-th order range

In [12]:
df['n_order_cut'] = pd.cut(df['n_order'],bins=[0,1,2,3,5,10,15,30,1e3])
df['n_order_cut'] = df['n_order_cut'].apply(lambda x:str(int(x.left)+1).zfill(2)+' a '+str(int(x.right)).zfill(2))

### Example

In [13]:
most_loyal_user = df['user_id'].value_counts().index.tolist()[0]
df[df['user_id']==most_loyal_user].head(7)

Unnamed: 0,order_date,order_id,user_id,requested,found,sales,year,week,weekday,month,bimonth,year_month,n_order,last_order_date,dbo,dbo_cut,n_order_cut
6602,2021-01-07,25307100,689117,1.0,1.0,81.0,2021,1,4,1,1,2021_01,1.0,NaT,,,01 a 01
6603,2021-01-25,26443530,689117,1.0,1.0,86.15,2021,4,1,1,1,2021_01,2.0,2021-01-07,18.0,16 a 30,02 a 02
6604,2021-01-27,26526467,689117,1.0,1.0,86.15,2021,4,3,1,1,2021_01,3.0,2021-01-25,2.0,01 a 15,03 a 03
6605,2021-01-30,26724358,689117,1.0,1.0,87.0,2021,4,6,1,1,2021_01,4.0,2021-01-27,3.0,01 a 15,04 a 05
6606,2021-02-01,26836416,689117,1.0,1.0,87.0,2021,5,1,2,1,2021_02,5.0,2021-01-30,2.0,01 a 15,04 a 05
6607,2021-02-06,27129502,689117,1.0,1.0,90.0,2021,5,6,2,1,2021_02,6.0,2021-02-01,5.0,01 a 15,06 a 10
6608,2021-02-09,27254851,689117,4.0,2.0,174.0,2021,6,2,2,1,2021_02,7.0,2021-02-06,3.0,01 a 15,06 a 10


### Frequency

In [14]:
by_month = df.fillna({'dbo':0}).pivot_table(index='year_month', columns='dbo_cut', values='user_id', aggfunc='nunique', fill_value=0, margins=True, margins_name='Total')
by_month.columns = pd.MultiIndex.from_tuples([('días después de la compra anterior',x) for x in by_month.columns], names=['Usuarios únicos',''])
by_month

Usuarios únicos,días después de la compra anterior,días después de la compra anterior,días después de la compra anterior,días después de la compra anterior,días después de la compra anterior,días después de la compra anterior
Unnamed: 0_level_1,00 a 00,01 a 15,16 a 30,31 a 60,61 a 200,Total
year_month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2020_01,0,2,0,0,0,2
2021_01,13,474,126,0,0,593
2021_02,12,566,480,217,0,1131
2021_03,14,530,472,522,107,1424
2021_04,8,434,391,450,308,1371
2021_05,7,462,423,455,473,1552
2021_06,16,429,403,429,536,1584
2021_07,6,301,282,260,319,1073
Total,72,2071,1850,1843,1683,4594


### n-th order

Unnamed: 0,user_id,n_order_x,order_date,order_id,requested,found,sales,year,week,weekday,month,bimonth,year_month,n_order_y,last_order_date,dbo,dbo_cut,n_order_cut
0,148,1.0,2021-02-07,27190140,1.0,1.0,86.56,2021,5,7,2,1,2021_02,1.0,NaT,,,01 a 01
1,163,2.0,2021-05-06,33301486,1.0,1.0,90.00,2021,18,4,5,3,2021_05,1.0,NaT,,,01 a 01
2,163,2.0,2021-05-31,34938908,1.0,1.0,90.00,2021,22,1,5,3,2021_05,2.0,2021-05-06,25.0,16 a 30,02 a 02
3,862,1.0,2021-06-30,36954324,1.0,1.0,90.00,2021,26,3,6,3,2021_06,1.0,NaT,,,01 a 01
4,932,2.0,2021-01-08,25415898,2.0,1.0,86.50,2021,1,5,1,1,2021_01,1.0,NaT,,,01 a 01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25137,17971541,1.0,2021-07-17,38008110,4.0,3.0,111.00,2021,28,6,7,4,2021_07,1.0,NaT,,,01 a 01
25138,17972750,1.0,2021-07-17,38002125,2.0,2.0,139.00,2021,28,6,7,4,2021_07,1.0,NaT,,,01 a 01
25139,17978282,1.0,2021-07-17,38025417,2.0,1.0,48.50,2021,28,6,7,4,2021_07,1.0,NaT,,,01 a 01
25140,17995489,1.0,2021-07-18,38061378,2.0,2.0,74.00,2021,28,7,7,4,2021_07,1.0,NaT,,,01 a 01


In [22]:
by_norders = df.groupby('user_id')['n_order'].max().reset_index().merge(df, on='user_id')
by_norders = by_norders.pivot_table(index='year_month', columns='n_order_cut', values='user_id', aggfunc='nunique', fill_value=0, margins=True, margins_name='Total')
by_norders.rename(columns={'01 a 01':'01','02 a 02':'02','03 a 03':'03','31 a 1000':'>30'}, inplace=True)
by_norders.columns = pd.MultiIndex.from_tuples([('orden número X',x) for x in by_norders.columns], names=['Usuarios únicos',''])
by_norders

Usuarios únicos,orden número X,orden número X,orden número X,orden número X,orden número X,orden número X,orden número X,orden número X,orden número X
Unnamed: 0_level_1,01,02,03,04 a 05,06 a 10,11 a 15,16 a 30,>30,Total
year_month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2020_01,247,2,0,0,0,0,0,0,247
2021_01,3448,589,132,38,7,1,0,0,3523
2021_02,2392,830,379,182,47,0,0,0,3324
2021_03,2136,836,425,310,130,13,1,0,3369
2021_04,1693,663,390,323,188,27,7,0,2947
2021_05,1449,657,407,413,269,50,14,0,2890
2021_06,1466,647,394,362,295,60,25,1,2950
2021_07,855,355,194,278,230,63,22,1,1895
Total,13686,4579,2321,1387,601,119,32,1,13717


## Export

In [16]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter(os.path.join(BASE_DIR,FILE_BASE_NAME+'_weekly.xlsx'), engine='xlsxwriter')
# Get the xlsxwriter workbook and worksheet objects.
workbook  = writer.book
# Add a format
#format = workbook.add_format({'bg_color': '#3973bf','font_color': '#fafafa'})

for data,sheet in zip([df,by_month,by_norders],['data','frecuencia','nésima_orden']):
    # Write each dataframe to a different worksheet.
    data.to_excel(writer, sheet_name=sheet)
    # Apply a conditional format to the cell range.
    # worksheet = writer.sheets[sheet]
    # worksheet.conditional_format(start_row, start_col, end_row, end_cold,{'type': 'cell','criteria': '>','value': 20,'format': format1})
    
# Close the Pandas Excel writer and output the Excel file.
writer.save()

## End

In [17]:
time_exp(time.time()-start)
tono()

0 minutos con 9.17 segundos
