In [1]:
import sys
# Make the packages installed in the project's virtualenv importable from this kernel.
# NOTE(review): absolute, machine-specific path (presumably the author's laptop) — confirm before running elsewhere.
_VENV_SITE_PACKAGES = '/Users/efraflores/Desktop/hub/cornershop/venv/lib/python3.9/site-packages'
# Guard the append so re-running this cell does not keep adding the same entry to sys.path
if _VENV_SITE_PACKAGES not in sys.path:
    sys.path.append(_VENV_SITE_PACKAGES)

# Days between orders

In [2]:
# Folder that is scanned for the input CSV extracts and that receives the exported results.
# NOTE(review): absolute path that presumably only exists on the author's machine — confirm before running elsewhere.
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Brands/Requests'
# Common prefix of the input file names ("<prefix>_<digits>.csv"); also used as prefix of the exported files.
FILE_BASE_NAME = 'dbo_82'

In [3]:
import re
import os

# Input extracts are named "<FILE_BASE_NAME>_<digits>.csv". The pattern is escaped and matched
# against the whole file name, so look-alikes such as "old_dbo_82_2101.csv" or
# "dbo_82_2101.csv.bak" are not picked up by accident.
_extract_pattern = re.compile(fr'{re.escape(FILE_BASE_NAME)}_\d+\.csv')
# os.listdir returns names in arbitrary order; sort once here so the files are always loaded in the same order
FILE_NAME_LIST = sorted(x for x in os.listdir(BASE_DIR) if _extract_pattern.fullmatch(x))
print(FILE_NAME_LIST)

['dbo_82_210101.csv', 'dbo_82_210102.csv', 'dbo_82_210103.csv', 'dbo_82_210104.csv', 'dbo_82_210201.csv', 'dbo_82_210202.csv', 'dbo_82_210203.csv', 'dbo_82_210204.csv', 'dbo_82_210301.csv', 'dbo_82_210302.csv', 'dbo_82_210303.csv', 'dbo_82_210304.csv', 'dbo_82_210401.csv', 'dbo_82_210402.csv', 'dbo_82_210500.csv', 'dbo_82_210501.csv', 'dbo_82_210502.csv', 'dbo_82_210601.csv', 'dbo_82_210602.csv', 'dbo_82_2107.csv']


## Functions

### Timing and tone

In [4]:
import time
import numpy as np
from IPython.lib.display import Audio

# Wall-clock reference taken when this cell runs; the last cell reports the time elapsed since this point
start = time.time()
def time_exp(x):
    """Print a duration given in seconds as whole minutes plus seconds (message in Spanish).

    Parameters
    ----------
    x : float
        Elapsed time in seconds.

    Returns
    -------
    None
        The text "<M> minutos con <S.SS> segundos" is printed, nothing is returned.
    """
    # Round to the displayed precision *before* splitting, so a value such as 119.999 s
    # carries into the minutes ("2 minutos con 0.00 segundos") instead of showing "60.00 segundos".
    minutes, seconds = divmod(round(float(x), 2), 60)
    print(f"{minutes:.0f} minutos con {seconds:.2f} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    """Build an autoplaying IPython ``Audio`` object that mixes two sine waves.

    Parameters
    ----------
    a, b : float
        Multipliers applied to ``pi * t`` inside each sine, with ``t`` running from 0 to
        ``play_time_seconds`` (so each tone completes ``a/2`` and ``b/2`` cycles per second).
    play_time_seconds : float
        Length of the generated signal, in seconds. May be fractional (e.g. 0.5).
    framerate : int
        Samples per second; also passed to ``Audio`` as the playback rate.

    Returns
    -------
    IPython.lib.display.Audio
        Displaying the returned object (e.g. as the last expression of a cell) plays the tone.
    """
    # np.linspace requires an integer sample count; the raw product is a float whenever
    # play_time_seconds is fractional, so convert it explicitly.
    n_samples = int(round(framerate * play_time_seconds))
    t = np.linspace(0, play_time_seconds, n_samples) * np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate=framerate, autoplay=True)

## Import

In [5]:
import pandas as pd

# Fail early with a clear message instead of an obscure error further down when nothing matched
if not FILE_NAME_LIST:
    raise FileNotFoundError(f'No "{FILE_BASE_NAME}_<digits>.csv" files found in {BASE_DIR}')

# Read every extract once and stack them in a single call: DataFrame.append was removed in
# pandas 2.0, and calling it inside a loop re-copied the accumulated frame on every iteration.
frames = [pd.read_csv(os.path.join(BASE_DIR, file_name)) for file_name in FILE_NAME_LIST]
df = pd.concat(frames, ignore_index=True)

# Total number of rows loaded, plus one random row as a quick sanity check of the columns
print(len(df))
display(df.sample())

25142


Unnamed: 0,order_date,order_id,user_id,requested,found,sales
3559,2021/02/02,26876422,130745,2.0,1.0,90.0


## Transform

### Date variables

In [6]:
# Parse the raw date strings into real timestamps
df['order_date'] = pd.to_datetime(df['order_date'])

# ISO calendar parts (ISO year, ISO week number, ISO day of the week), assigned one column at a time
iso_parts = df['order_date'].dt.isocalendar()
df['year'] = iso_parts['year']
df['week'] = iso_parts['week']
df['weekday'] = iso_parts['day']

# Calendar month and the two-month period it belongs to (Jan-Feb -> 1, ..., Nov-Dec -> 6)
df['month'] = df['order_date'].dt.month
month_to_bimonth = {month: (month - 1)//2 + 1 for month in range(1, 13)}
df['bimonth'] = df['month'].map(month_to_bimonth)

# Sortable "<ISO year>_<zero-padded ISO week>" key, e.g. 2021_05
week_padded = df['week'].apply(lambda week: str(week).zfill(2))
df['year_week'] = df['year'].astype(str) + '_' + week_padded
df.sample()

Unnamed: 0,order_date,order_id,user_id,requested,found,sales,year,week,weekday,month,bimonth,year_week
4220,2021-02-06,27062147,4106398,1.0,1.0,39.0,2021,5,6,2,1,2021_05


### Sort by user, order date and order id

In [7]:
# Chronological order inside each user; order_id breaks ties between orders placed on the same date
df = df.sort_values(by=['user_id', 'order_date', 'order_id'])

### Order number

In [8]:
# Sequential order number per user (1.0, 2.0, ...). method='first' numbers orders that share a date
# by row position — the frame was sorted by user_id, order_date, order_id just above — whereas the
# default method='average' would give same-day orders a fractional number such as 1.5.
df['n_order'] = df.groupby(['user_id'])['order_date'].rank(method='first')

### Previous order date (`last_order_date`)

In [9]:
# Date of the same user's previous row (missing on each user's first order)
order_dates_by_user = df.groupby('user_id')['order_date']
df['last_order_date'] = order_dates_by_user.shift(periods=1)

### Days between orders

In [10]:
# Whole days elapsed since the user's previous order; stays missing (NaN) when there is no previous order.
# The vectorized .dt.days accessor replaces a per-row Python lambda and yields the same values.
df['dbo'] = (df['order_date'] - df['last_order_date']).dt.days

### Days between orders range

In [11]:
# Fixed, right-closed bin edges in days: (-1, 0], (0, 6], ..., (77, 196].
# Values outside these edges (e.g. gaps above 196 days) and missing gaps get no range (NaN).
# NOTE(review): the edges presumably came from a decile cut like the commented-out line below — confirm.
#df['dbo_cut'],dbo_bins = pd.qcut(df['dbo'], q=10, retbins=True)
dbo_edges = [-1,0,6,8,12,15,21,28,36,50,77,196]
# Zero-padded inclusive range labels ("00 a 00", "01 a 06", ..., "78 a 196") built directly from the
# edges and passed to pd.cut. Labelling in the same call avoids mapping a lambda over the Interval
# categories afterwards, which depended on how apply treats missing categorical values.
dbo_labels = [f'{low+1:02d} a {high:02d}' for low, high in zip(dbo_edges[:-1], dbo_edges[1:])]
df['dbo_cut'] = pd.cut(df['dbo'], bins=dbo_edges, labels=dbo_labels)

### Example: the user with the most orders

In [12]:
# value_counts sorts by frequency (descending), so the first label is the user with the most rows
rows_per_user = df['user_id'].value_counts()
most_loyal_user = rows_per_user.index.tolist()[0]

# First seven orders of that user, to eyeball the derived columns
is_most_loyal = df['user_id'].eq(most_loyal_user)
df.loc[is_most_loyal].head(7)

Unnamed: 0,order_date,order_id,user_id,requested,found,sales,year,week,weekday,month,bimonth,year_week,n_order,last_order_date,dbo,dbo_cut
17715,2021-01-07,25307100,689117,1.0,1.0,81.0,2021,1,4,1,1,2021_01,1.0,NaT,,
19934,2021-01-25,26443530,689117,1.0,1.0,86.15,2021,4,1,1,1,2021_04,2.0,2021-01-07,18.0,16 a 21
20130,2021-01-27,26526467,689117,1.0,1.0,86.15,2021,4,3,1,1,2021_04,3.0,2021-01-25,2.0,01 a 06
20671,2021-01-30,26724358,689117,1.0,1.0,87.0,2021,4,6,1,1,2021_04,4.0,2021-01-27,3.0,01 a 06
3475,2021-02-01,26836416,689117,1.0,1.0,87.0,2021,5,1,2,1,2021_05,5.0,2021-01-30,2.0,01 a 06
4411,2021-02-06,27129502,689117,1.0,1.0,90.0,2021,5,6,2,1,2021_05,6.0,2021-02-01,5.0,01 a 06
1701,2021-02-09,27254851,689117,4.0,2.0,174.0,2021,6,2,2,1,2021_06,7.0,2021-02-06,3.0,01 a 06


### Grouping

In [13]:
# Weekly pivot: one row per year_week key, one column per days-between-orders range;
# each cell is the number of distinct user_id values, and combinations without data show 0.
# NOTE(review): fillna({'dbo':0}) only fills the numeric `dbo` column, which this pivot does not use;
# `dbo_cut` was derived before the fill and stays missing for rows without a previous order, so those
# rows appear to be left out of every column — confirm whether they were meant to count under "00 a 00".
by_week = df.fillna({'dbo':0}).pivot_table(index='year_week',columns='dbo_cut',values='user_id',aggfunc='nunique',fill_value=0)
by_week

dbo_cut,00 a 00,01 a 06,07 a 08,09 a 12,13 a 15,16 a 21,22 a 28,29 a 36,37 a 50,51 a 77,78 a 196
year_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020_53,0,2,0,0,0,0,0,0,0,0,0
2021_01,2,40,7,0,0,0,0,0,0,0,0
2021_02,6,63,39,36,10,3,0,0,0,0,0
2021_03,5,56,53,56,44,26,4,0,0,0,0
2021_04,2,56,59,49,56,65,28,0,0,0,0
2021_05,6,78,62,52,63,78,71,22,0,0,0
2021_06,2,61,47,53,42,48,39,46,16,0,0
2021_07,3,46,45,44,50,59,53,57,36,0,0
2021_08,1,30,39,51,41,51,49,48,46,4,0
2021_09,3,61,39,41,39,57,69,59,57,22,0


## Export

In [14]:
# Order-level table: tab-separated, UTF-16 encoded, written without the index column
total_path = os.path.join(BASE_DIR, FILE_BASE_NAME + '_total.csv')
df.to_csv(total_path, sep='\t', encoding='utf-16', index=False)

# Weekly pivot as an Excel workbook
weekly_path = os.path.join(BASE_DIR, FILE_BASE_NAME + '_weekly.xlsx')
by_week.to_excel(weekly_path)

## End

In [15]:
# Report the wall-clock time elapsed since `start` was taken, then play the completion tone
elapsed_seconds = time.time() - start
time_exp(elapsed_seconds)
tono()

0 minutos con 1.61 segundos
