# Brands similarity

In [1]:
from pathlib import Path

# NOTE(review): absolute, user-specific location; adjust before running elsewhere.
BASE_DIR = Path('/Users/efraflores/Desktop/EF/Corner/Catalog/Canceled_orders')
FILE_BASE_NAME = 'canceled'
# Keep every file (searched recursively) whose name, minus its last 7
# characters (e.g. "_26.csv"), is exactly FILE_BASE_NAME.
# `Path.name` replaces splitting str(path) on '/', so the filter no longer
# depends on the platform's path separator.
FILE_LIST = sorted(
    path for path in BASE_DIR.glob('**/*')
    if path.name[:-7] == FILE_BASE_NAME and path.is_file()
)

print(FILE_LIST)

[PosixPath('/Users/efraflores/Desktop/EF/Corner/Catalog/Canceled_orders/canceled_26.csv'), PosixPath('/Users/efraflores/Desktop/EF/Corner/Catalog/Canceled_orders/canceled_27.csv'), PosixPath('/Users/efraflores/Desktop/EF/Corner/Catalog/Canceled_orders/canceled_28.csv'), PosixPath('/Users/efraflores/Desktop/EF/Corner/Catalog/Canceled_orders/canceled_29.csv')]


## Functions

### Timing and tone

In [2]:
import time
import numpy as np
from IPython.lib.display import Audio

# Wall-clock reference point; the last cell passes time.time()-start to time_exp.
start = time.time()
def time_exp(x):
    """Print a duration given in seconds as minutes and seconds.

    The message is emitted in Spanish: "<M> minutos con <S.SS> segundos".

    Args:
        x: Elapsed time in seconds (any real number convertible to float).

    The duration is rounded to hundredths of a second *before* it is split
    into minutes and seconds, so the seconds part can never be displayed as
    "60.00" (e.g. 59.999 s is reported as 1 minute and 0.00 seconds).
    """
    # Work in integer hundredths of a second so the split is exact.
    centiseconds = int(round(float(x) * 100))
    minutes, remainder = divmod(centiseconds, 60 * 100)
    print(f"{minutes:.0f} minutos con {remainder / 100:.2f} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    """Build an auto-playing IPython ``Audio`` object mixing two sine tones.

    Args:
        a: Multiplier applied to ``pi * t`` inside the first sine.
        b: Multiplier applied to ``pi * t`` inside the second sine.
        play_time_seconds: Length of the sound in seconds; may be fractional.
        framerate: Samples per second; also passed to ``Audio`` as ``rate``.

    Returns:
        The ``Audio`` object, created with ``autoplay=True``.
    """
    # np.linspace needs an integer sample count; the plain product is a float
    # whenever play_time_seconds is fractional.
    n_samples = int(round(framerate * play_time_seconds))
    t = np.linspace(0, play_time_seconds, n_samples) * np.pi
    return Audio(np.sin(a * t) + np.sin(b * t), rate=framerate, autoplay=True)

## Import

In [3]:
import pandas as pd

# Read every file in FILE_LIST (tab-separated, UTF-16, header on the second
# line, first 14 columns) and drop exact duplicate rows within each file.
# The pieces are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
frames = [
    pd.read_csv(csv_file, header=1, usecols=range(14), sep='\t', encoding='utf-16').drop_duplicates()
    for csv_file in FILE_LIST
]
# pd.concat rejects an empty list; fall back to an empty frame as before.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Normalise headers: drop '-' and '.', collapse double spaces, then turn
# spaces into underscores and lower-case the result.
df.columns = [
    str(x).replace('-', '').replace('.', '').replace('  ', ' ').replace(' ', '_').lower()
    for x in df.columns
]

print(len(df))
display(df.sample())

30291


Unnamed: 0,order_external_id,country,city,zone,platform,store_type,store_category,id_store_name,id_branch_name,order_kind,status,cancelled_reason,cancelled_stage,with_subscruption
2760,MX-229645-9775773,MX,Ciudad de México,Bosques,Cornershop,Full Service,Grocery,7 - Costco,40 - Interlomas,NORMAL,CANCELED_BY_USER,UNKNOWN (BY USER),Before being assigned,False


## Transform

### Copy to clipboard

In [4]:
# Copy every order id to the clipboard as one quoted, comma-terminated line
# each ('<id>',) — presumably to paste into the query whose result is merged
# in the next section.
quoted_ids = df['order_external_id'].map(lambda order_id: f"'{order_id!s}',")
quoted_ids.to_clipboard(index=False, header=False)

### Merge with query-result

In [5]:
# Left-join the query result stored in by_brand.csv onto the orders
# (order_external_id <-> external_id), then discard the duplicated key column.
by_brand = pd.read_csv(BASE_DIR / 'by_brand.csv')
df = (
    df.merge(by_brand, left_on='order_external_id', right_on='external_id', how='left')
      .drop(columns=['external_id'])
)
df.sample()

Unnamed: 0,order_external_id,country,city,zone,platform,store_type,store_category,id_store_name,id_branch_name,order_kind,status,cancelled_reason,cancelled_stage,with_subscruption,date,brand,requested,found,total_cart
99596,MX-640849-0021907,MX,Ciudad de México,Condesa,Cornershop,Full Service,Grocery,22 - Chedraui,138 - Universidad,NORMAL,CANCELED_BY_CM,DUPLICATED,Before being assigned,False,2021-07-12,Selecto,5.0,0.0,12.5


### Null values

In [6]:
# Per-column replacement for missing values; columns not listed are left as-is.
null_dict = {
    'zone': 'unknown',
    'date': '2000-01-01',
    'brand': 'SIN MARCA',
    'requested': 0,
    'found': 0,
    'total_cart': 0,
}
df.fillna(value=null_dict, inplace=True)

### Date variables

In [7]:
# Parse the date column, then derive calendar parts from it.
df['date'] = pd.to_datetime(df['date'])
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
# ISO calendar week number.
df['week'] = date_parts.isocalendar()['week']

### Split name and id

In [8]:
for col in ['id_store_name', 'id_branch_name']:
    # 'id_store_name' -> 'id_store' and 'store_name' (likewise for branch).
    col_id, col_name = col.rsplit('_', 1)[0], col.split('_', 1)[1]
    # Values look like "7 - Costco": split on the first hyphen only, so names
    # that themselves contain hyphens stay whole. `n` is passed by keyword
    # because it is keyword-only from pandas 2.0 onwards.
    parts = df[col].str.split('-', n=1, expand=True)
    # Remove the blanks that surrounded the separator ("7 " / " Costco").
    df[[col_id, col_name]] = parts.apply(lambda part: part.str.strip())
    del col_id, col_name, parts

### Top 10 brands per week and store

In [9]:
# Write store_week.xlsx with one sheet per week: order counts per store
# (rows) and brand (columns), limited to the 'Total' margin plus the ten
# largest entries on each axis.
index_cols = ['week', 'country', 'city', 'zone', 'store_type', 'store_category', 'id_store', 'store_name']
# Label of the margins row in the pivot: 'Total' in the first index level,
# empty strings in the remaining ones.
total_row = ('Total',) + ('',) * (len(index_cols) - 1)

# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0.
with pd.ExcelWriter(BASE_DIR.joinpath('store_week.xlsx'), engine='xlsxwriter') as writer:
    # Sorted so the sheets appear in week order.
    for week in sorted(set(df['week'])):
        # A boolean mask always yields a DataFrame, even for a one-row week
        # (set_index('week').loc[week, :] collapses that case to a Series).
        store = df[df['week'] == week]
        store = store.pivot_table(
            index=index_cols,
            columns=['brand'],
            values='order_external_id',
            aggfunc='count',
            fill_value=0,
            margins=True,
            margins_name='Total',
        )
        # axis/ascending are passed by keyword: they are keyword-only in pandas 2.0.
        store = (
            store.sort_values(total_row, axis=1, ascending=False)  # columns by overall count
                 .sort_values('Total', axis=0, ascending=False)    # rows by overall count
                 .iloc[:11, :11]                                   # 'Total' + top 10 per axis
                 .reset_index()
        )
        store.to_excel(writer, sheet_name=str(week).zfill(2), index=False)

### Top 10 brands per week and zone

In [10]:
# Write zone_week.xlsx with one sheet per week: order counts per zone (rows)
# and brand (columns), limited to the 'Total' margin plus the ten largest
# entries on each axis.
zone_index_cols = ['week', 'country', 'city', 'zone']
# Label of the margins row in the pivot: 'Total' in the first index level,
# empty strings in the remaining ones.
zone_total_row = ('Total',) + ('',) * (len(zone_index_cols) - 1)

# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.0.
with pd.ExcelWriter(BASE_DIR.joinpath('zone_week.xlsx'), engine='xlsxwriter') as writer:
    # Sorted so the sheets appear in week order.
    for week in sorted(set(df['week'])):
        # A boolean mask always yields a DataFrame, even for a one-row week
        # (set_index('week').loc[week, :] collapses that case to a Series).
        zone = df[df['week'] == week]
        zone = zone.pivot_table(
            index=zone_index_cols,
            columns=['brand'],
            values='order_external_id',
            aggfunc='count',
            fill_value=0,
            margins=True,
            margins_name='Total',
        )
        # axis/ascending are passed by keyword: they are keyword-only in pandas 2.0.
        zone = (
            zone.sort_values(zone_total_row, axis=1, ascending=False)  # columns by overall count
                .sort_values('Total', axis=0, ascending=False)         # rows by overall count
                .iloc[:11, :11]                                        # 'Total' + top 10 per axis
                .reset_index()
        )
        zone.to_excel(writer, sheet_name=str(week).zfill(2), index=False)

### FR and sales

In [11]:
# Show one random row as a quick check of the columns built so far.
df.sample(n=1)

Unnamed: 0,order_external_id,country,city,zone,platform,store_type,store_category,id_store_name,id_branch_name,order_kind,...,requested,found,total_cart,year,month,week,id_store,store_name,id_branch,branch_name
92978,MX-359806-0798187,MX,Ciudad de México,Lejos Sur,Cornershop,Full Service,Grocery,1423 - La Comer,4452 - La Comer Villa Coapa,NORMAL,...,2.0,2.0,29.95,2021,7,29,1423,La Comer,4452,La Comer Villa Coapa


## Export

In [12]:
# Write `export`, indexed by its 'neighbors' column, to "<FILE_BASE_NAME>_similar.xlsx" in BASE_DIR.
# NOTE(review): `export` is not defined in any visible cell, so this raises
# NameError on a fresh kernel (Restart & Run All) — confirm where it should come from.
export.set_index('neighbors').to_excel(BASE_DIR.joinpath(FILE_BASE_NAME+'_similar.xlsx'))

## End

In [13]:
# Report the time elapsed since `start`, then play the completion tone.
# tono() stays last so the notebook renders the Audio object it returns.
elapsed_seconds = time.time() - start
time_exp(elapsed_seconds)
tono()

0 minutos con 2.05 segundos
