# [Look](https://cornershopapp.cloud.looker.com/looks/5045?toggle=dat,fil,pik)

# Parameters

In [14]:
BASE_DIR = '/Users/efrain.flores/Desktop/EF/Corner/Requests/new_users_at_store_month'
FILE_BASE_NAME = 'new_users_at_store_20'

# Code

In [15]:
# Control de datos
from pathlib import Path

# Ingeniería de variables
from re import search as re_search
from pandas import DataFrame, Series, read_csv, to_datetime

class NewUsers:
    def __init__(self, base_dir: str, file_base_name: str) -> None:
        # Convierte el texto a objeto tipo Path para unir directorios, buscar archivos, etc
        self.base_dir = Path(base_dir)
        # Asigna el nombre base como atributo 
        self.file_name = file_base_name
        # Encuentra todos los archivos que comiencen con el nombre base en el directorio
        self.files_list = [x for x in self.base_dir.glob('*.csv') if re_search(f'{self.file_name}_*(?!result).+\.csv', str(x))]
        self.stores_file = self.base_dir.joinpath('stores.csv')

    
    def read_files(self) -> DataFrame:
        '''
        Une todos los archivos que comienzan con el nombre base
        '''
        # Tabla vacía para ir depositando los csv
        self.df = DataFrame()
        # Obtiene el número de archivos, sólo es informativo
        total_files = len(self.files_list)

        for i,file_chunk in enumerate(self.files_list):
            # Obtener sólo el nombre del archivo, no su ubicación completa
            sub_name = str(file_chunk).split('/')[-1]
            aux = read_csv(file_chunk)
            aux['file'] = sub_name
            # Une la tabla anterior con el nuevo archivo
            self.df = self.df.append(aux, ignore_index=True)
            
            # Informa al usuario del avance
            print(f'Archivo {i+1}/{total_files} con nombre: "{sub_name}" es importado exitosamente')

        self.stores = read_csv(self.stores_file)
        self.df = self.df.merge(self.stores, left_on='Store ID', right_on='store_id', how='left')


    def get_date_partitions(self, date_col: str)-> None:
        self.df[date_col] = to_datetime(self.df[date_col])
        self.df['year'] = self.df[date_col].dt.year
        self.df['month'] = self.df[date_col].dt.month
        self.df['week'] = self.df[date_col].dt.isocalendar().week
        self.df['quarter'] = self.df[date_col].dt.quarter
        self.df['year_month'] = self.df['year'].astype(str) + '_' + self.df['month'].map(lambda x: str(x).zfill(2))
        self.df['year_week'] = self.df['year'].astype(str) + '_' + self.df['week'].map(lambda x: str(x).zfill(2))
        self.df['year_quarter'] = self.df['year'].astype(str) + '_' + self.df['quarter'].map(lambda x: str(x).zfill(2))


    def user_rank(self, date_col: str, cols_to_group: list, cols_to_sum: list) -> DataFrame:
        data = nu.df.groupby(cols_to_group)[cols_to_sum].sum().reset_index()
        
        tmp = data.groupby([x for x in cols_to_group if x!=date_col]).size()
        rank = tmp.map(range)
        rank =[item for sublist in rank for item in sublist]
        data['user_rank'] = rank
        data['user_rank'] = data['user_rank'] + 1

        data['is_new'] = data['user_rank'].map(lambda x: 'New' if x==1 else 'Not-new')
        
        tot_dates = data[[date_col]].drop_duplicates().sort_values(date_col).reset_index(drop=True)
        tot_dates.reset_index(inplace=True)
        data = data.merge(tot_dates)
        data.sort_values(cols_to_group, inplace=True)
        data['prev_date'] = data.groupby([x for x in cols_to_group if x!=date_col])['index'].shift()
        data['date_diff'] = data['index'] - data['prev_date']
        data['date_diff_group'] = data['date_diff'].map(lambda x: 3 if x>=3 else x)
        return data


    def summary(self, data: DataFrame, **kwargs) -> DataFrame:
        data = data.pivot_table(**kwargs)
        data.columns = ['_'.join(x) for x in data.columns]
        return data.reset_index()

# Transform

## Read files

In [16]:
nu = NewUsers(BASE_DIR, FILE_BASE_NAME)
nu.files_list

[PosixPath('/Users/efrain.flores/Desktop/EF/Corner/Requests/new_users_at_store_month/new_users_at_store_2021.csv'),
 PosixPath('/Users/efrain.flores/Desktop/EF/Corner/Requests/new_users_at_store_month/new_users_at_store_2020.csv'),
 PosixPath('/Users/efrain.flores/Desktop/EF/Corner/Requests/new_users_at_store_month/new_users_at_store_2022_H1.csv'),
 PosixPath('/Users/efrain.flores/Desktop/EF/Corner/Requests/new_users_at_store_month/new_users_at_store_2022_07.csv')]

In [17]:
nu.read_files()
nu.df.head()

Archivo 1/4 con nombre: "new_users_at_store_2021.csv" es importado exitosamente
Archivo 2/4 con nombre: "new_users_at_store_2020.csv" es importado exitosamente
Archivo 3/4 con nombre: "new_users_at_store_2022_H1.csv" es importado exitosamente
Archivo 4/4 con nombre: "new_users_at_store_2022_07.csv" es importado exitosamente


Unnamed: 0.1,Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,file,store_id,store,store_is_active,store_category_id,store_category,store_family,store_type,store_group
0,1,58,22,2021-11,79,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,Supermercados,Supermercados,Groceries,Supermarket
1,2,1276465,22,2021-10,79,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,Supermercados,Supermercados,Groceries,Supermarket
2,3,4825433,7,2021-04,73,new_users_at_store_2021.csv,7.0,Costco,True,13.0,Supermercados,Supermercados,Groceries,Supermarket
3,4,58,22,2021-12,70,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,Supermercados,Supermercados,Groceries,Supermarket
4,5,1276465,22,2021-12,68,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,Supermercados,Supermercados,Groceries,Supermarket


## Date partition

In [18]:
nu.get_date_partitions(date_col='Delivered at Local Time Dynamic')
nu.df.head(11)

Unnamed: 0.1,Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,file,store_id,store,store_is_active,store_category_id,...,store_family,store_type,store_group,year,month,week,quarter,year_month,year_week,year_quarter
0,1,58,22,2021-11-01,79,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,...,Supermercados,Groceries,Supermarket,2021,11,44,4,2021_11,2021_44,2021_04
1,2,1276465,22,2021-10-01,79,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,...,Supermercados,Groceries,Supermarket,2021,10,39,4,2021_10,2021_39,2021_04
2,3,4825433,7,2021-04-01,73,new_users_at_store_2021.csv,7.0,Costco,True,13.0,...,Supermercados,Groceries,Supermarket,2021,4,13,2,2021_04,2021_13,2021_02
3,4,58,22,2021-12-01,70,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,...,Supermercados,Groceries,Supermarket,2021,12,48,4,2021_12,2021_48,2021_04
4,5,1276465,22,2021-12-01,68,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,...,Supermercados,Groceries,Supermarket,2021,12,48,4,2021_12,2021_48,2021_04
5,6,1492255,22,2021-02-01,68,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,...,Supermercados,Groceries,Supermarket,2021,2,5,1,2021_02,2021_05,2021_01
6,7,293276,9,2021-09-01,68,new_users_at_store_2021.csv,9.0,City Market,True,13.0,...,Supermercados,Groceries,Supermarket,2021,9,35,3,2021_09,2021_35,2021_03
7,8,493678,22,2021-10-01,68,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,...,Supermercados,Groceries,Supermarket,2021,10,39,4,2021_10,2021_39,2021_04
8,9,58,22,2021-10-01,68,new_users_at_store_2021.csv,22.0,Chedraui,True,13.0,...,Supermercados,Groceries,Supermarket,2021,10,39,4,2021_10,2021_39,2021_04
9,10,293276,9,2021-11-01,67,new_users_at_store_2021.csv,9.0,City Market,True,13.0,...,Supermercados,Groceries,Supermarket,2021,11,44,4,2021_11,2021_44,2021_04


In [19]:
nu.df.pivot_table(index='month', columns='year', values='Orders Count', aggfunc=sum)

year,2020,2021,2022
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,148429.0,495783.0,553265.0
2,148551.0,454416.0,480259.0
3,229134.0,473959.0,512266.0
4,395980.0,449658.0,457545.0
5,523654.0,460624.0,526508.0
6,482154.0,444776.0,514983.0
7,456886.0,444442.0,496136.0
8,440091.0,486005.0,
9,407977.0,444039.0,
10,414899.0,443272.0,


## User rank

In [20]:
rank_store = nu.user_rank(date_col='year_month', cols_to_group=['User ID', 'Store ID', 'year_month'], cols_to_sum=['Orders Count'])
rank_store.head(11)

Unnamed: 0,User ID,Store ID,year_month,Orders Count,user_rank,is_new,index,prev_date,date_diff,date_diff_group
0,6,5,2020_01,1,1,New,0,,,
78648,6,5,2020_05,2,2,Not-new,4,0.0,4.0,3.0
360765,6,7,2020_04,1,1,New,3,,,
590655,6,9,2020_09,1,1,New,8,,,
796247,6,9,2020_10,6,2,Not-new,9,8.0,1.0,1.0
1002457,6,9,2020_11,5,3,Not-new,10,9.0,1.0,1.0
1201376,6,9,2020_12,5,4,Not-new,11,10.0,1.0,1.0
1439900,6,9,2021_01,1,5,Not-new,12,11.0,1.0,1.0
1689693,6,9,2021_02,3,6,Not-new,13,12.0,1.0,1.0
1929717,6,9,2021_03,3,7,Not-new,14,13.0,1.0,1.0


# Summary

## by store

In [21]:
summary_store = nu.summary(data=rank_store, index=['Store ID', 'year_month'], columns='is_new', aggfunc={'User ID':Series.nunique, 'Orders Count':'sum'})
summary_store = summary_store.merge(nu.stores, left_on='Store ID', right_on='store_id')
summary_store.head(11)

Unnamed: 0,Store ID,year_month,Orders Count_New,Orders Count_Not-new,User ID_New,User ID_Not-new,store_id,store,store_is_active,store_category_id,store_category,store_family,store_type,store_group
0,7,2020_01,19703.0,,12466.0,,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
1,7,2020_02,7620.0,12280.0,6467.0,6343.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
2,7,2020_03,19004.0,20671.0,15116.0,9962.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
3,7,2020_04,32689.0,30283.0,24421.0,15453.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
4,7,2020_05,24141.0,48214.0,18668.0,24707.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
5,7,2020_06,12078.0,49659.0,10027.0,26778.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
6,7,2020_07,10716.0,50751.0,9016.0,27584.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
7,7,2020_08,8353.0,50023.0,7102.0,27339.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
8,7,2020_09,6989.0,46345.0,6149.0,26059.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
9,7,2020_10,6655.0,48879.0,5786.0,26614.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket


## by store-family

In [22]:
rank_storefam = nu.user_rank(date_col='year_month', cols_to_group=['User ID', 'store_family', 'year_month'], cols_to_sum=['Orders Count'])
rank_storefam.head(11)

Unnamed: 0,User ID,store_family,year_month,Orders Count,user_rank,is_new,index,prev_date,date_diff,date_diff_group
0,6,Express,2020_12,1,1,New,11,,,
161171,6,Express,2021_03,1,2,Not-new,14,11.0,3.0,3.0
339815,6,Express,2021_10,1,3,Not-new,21,14.0,7.0,3.0
511445,6,Express,2021_12,2,4,Not-new,23,21.0,2.0,2.0
708671,6,Express,2022_03,1,5,Not-new,26,23.0,3.0,3.0
929435,6,Gourmet & Vida sana,2020_04,2,1,New,3,,,
1,6,Gourmet & Vida sana,2020_12,1,2,Not-new,11,3.0,8.0,3.0
161172,6,Gourmet & Vida sana,2021_03,2,3,Not-new,14,11.0,3.0,3.0
1054004,6,Gourmet & Vida sana,2021_06,1,4,Not-new,17,14.0,3.0,3.0
1229921,6,"Mascotas, Hogar, trabajo",2020_03,2,1,New,2,,,


In [23]:
summary_storefam = nu.summary(data=rank_storefam, index=['store_family', 'year_month'], columns='is_new', aggfunc={'User ID':Series.nunique, 'Orders Count':'sum'})
summary_storefam.head(11)

Unnamed: 0,store_family,year_month,Orders Count_New,Orders Count_Not-new,User ID_New,User ID_Not-new
0,"Belleza, Moda & Deportes",2020_01,24.0,,22.0,
1,"Belleza, Moda & Deportes",2020_02,23.0,4.0,21.0,3.0
2,"Belleza, Moda & Deportes",2020_03,59.0,6.0,57.0,5.0
3,"Belleza, Moda & Deportes",2020_04,84.0,4.0,79.0,4.0
4,"Belleza, Moda & Deportes",2020_05,595.0,15.0,338.0,7.0
5,"Belleza, Moda & Deportes",2020_06,705.0,130.0,450.0,65.0
6,"Belleza, Moda & Deportes",2020_07,647.0,168.0,408.0,96.0
7,"Belleza, Moda & Deportes",2020_08,562.0,227.0,355.0,123.0
8,"Belleza, Moda & Deportes",2020_09,378.0,254.0,255.0,143.0
9,"Belleza, Moda & Deportes",2020_10,389.0,289.0,263.0,160.0


# Export

In [24]:
summary_store.to_csv(nu.base_dir.joinpath(f'{nu.file_name}_result_store.csv'), sep='\t', encoding='utf-16', index=False)
summary_storefam.to_csv(nu.base_dir.joinpath(f'{nu.file_name}_result_storefam.csv'), sep='\t', encoding='utf-16', index=False)