# Parameters

In [40]:
BASE_DIR = '/Users/efrain.flores/Desktop/EF/Corner/Requests/new_users_at_store_week'
FILE_BASE_NAME = 'nu'

# Code

In [41]:
# Control de datos
from pathlib import Path

# Ingeniería de variables
from re import search as re_search
from pandas import DataFrame, Series, read_csv, to_datetime

class NewUsers:
    def __init__(self, base_dir: str, file_base_name: str) -> None:
        # Convierte el texto a objeto tipo Path para unir directorios, buscar archivos, etc
        self.base_dir = Path(base_dir)
        # Asigna el nombre base como atributo 
        self.file_name = file_base_name
        # Encuentra todos los archivos que comiencen con el nombre base en el directorio
        self.files_list = [x for x in self.base_dir.glob('*.csv') if re_search(f'{self.file_name}_(?!result).+\.csv', str(x))]
        self.stores_file = self.base_dir.joinpath('stores.csv')

    
    def read_files(self) -> DataFrame:
        '''
        Une todos los archivos que comienzan con el nombre base
        '''
        # Tabla vacía para ir depositando los csv
        self.df = DataFrame()
        # Obtiene el número de archivos, sólo es informativo
        total_files = len(self.files_list)

        for i,file_chunk in enumerate(self.files_list):
            # Obtener sólo el nombre del archivo, no su ubicación completa
            sub_name = str(file_chunk).split('/')[-1]
            aux = read_csv(file_chunk)
            aux['file'] = sub_name
            # Une la tabla anterior con el nuevo archivo
            self.df = self.df.append(aux, ignore_index=True)
            
            # Informa al usuario del avance
            print(f'Archivo {i+1}/{total_files} con nombre: "{sub_name}" es importado exitosamente')

        self.stores = read_csv(self.stores_file)
        self.df = self.df.merge(self.stores, left_on='Store ID', right_on='store_id', how='left')


    def get_date_partitions(self, date_col: str)-> None:
        self.df[date_col] = to_datetime(self.df[date_col])
        self.df['year'] = self.df[date_col].dt.year
        self.df['month'] = self.df[date_col].dt.month
        self.df['week'] = self.df[date_col].dt.isocalendar().week
        self.df['quarter'] = self.df[date_col].dt.quarter
        self.df['year_month'] = self.df['year'].astype(str) + '_' + self.df['month'].map(lambda x: str(x).zfill(2))
        self.df['year_week'] = self.df['year'].astype(str) + '_' + self.df['week'].map(lambda x: str(x).zfill(2))
        self.df['year_quarter'] = self.df['year'].astype(str) + '_' + self.df['quarter'].map(lambda x: str(x).zfill(2))


    def user_rank(self, date_col: str, cols_to_group: list, cols_to_sum: list) -> DataFrame:
        data = nu.df.groupby(cols_to_group)[cols_to_sum].sum().reset_index()
        
        tmp = data.groupby([x for x in cols_to_group if x!=date_col]).size()
        rank = tmp.map(range)
        rank =[item for sublist in rank for item in sublist]
        data['user_rank'] = rank
        data['user_rank'] = data['user_rank'] + 1

        data['is_new'] = data['user_rank'].map(lambda x: 'New' if x==1 else 'Not-new')
        
        tot_dates = data[[date_col]].drop_duplicates().sort_values(date_col).reset_index(drop=True)
        tot_dates.reset_index(inplace=True)
        data = data.merge(tot_dates)
        data.sort_values(cols_to_group, inplace=True)
        data['prev_date'] = data.groupby([x for x in cols_to_group if x!=date_col])['index'].shift()
        data['date_diff'] = data['index'] - data['prev_date']
        data['date_diff_group'] = data['date_diff'].map(lambda x: 3 if x>=3 else x)
        return data


    def summary(self, data: DataFrame, **kwargs) -> DataFrame:
        data = data.pivot_table(**kwargs)
        data.columns = ['_'.join(x) for x in data.columns]
        return data.reset_index()

# Transform

## Read files

In [42]:
nu = NewUsers(BASE_DIR, FILE_BASE_NAME)
nu.read_files()
nu.df.head()

Archivo 1/4 con nombre: "nu_2022_07.csv" es importado exitosamente
Archivo 2/4 con nombre: "nu_2022_H1.csv" es importado exitosamente
Archivo 3/4 con nombre: "nu_2021.csv" es importado exitosamente
Archivo 4/4 con nombre: "nu_2020.csv" es importado exitosamente


Unnamed: 0.1,Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,file,store_id,store,store_is_active,store_category_id,store_category,store_family,store_type,store_group
0,1,2319865,22,2022-W27,19,nu_2022_07.csv,22.0,Chedraui,True,13.0,Supermercados,Supermercados,Groceries,Supermarket
1,2,16775877,2648,2022-W27,19,nu_2022_07.csv,2648.0,Circle K,True,14.0,Conveniencia,Express,Convenience,Express
2,3,2966456,22,2022-W26,17,nu_2022_07.csv,22.0,Chedraui,True,13.0,Supermercados,Supermercados,Groceries,Supermarket
3,4,2966456,22,2022-W27,14,nu_2022_07.csv,22.0,Chedraui,True,13.0,Supermercados,Supermercados,Groceries,Supermarket
4,5,10705824,1816,2022-W27,13,nu_2022_07.csv,1816.0,Healthy boutique,True,25.0,Vida sana,Gourmet & Vida sana,Self-service,Rest of NonG


# Filtrar audiencia

In [43]:
filtrar = read_csv(nu.base_dir.joinpath('audiencia_cg.csv'))
filtrar.head()

Unnamed: 0,Id,Name,Email,User ID,Company name
0,564d47154377bb52840007a0,Alexandra Sánchez Guerra Marquez,asguerram@gmail.com,3882,
1,564eb94b8cab3ab65f0013b6,Raul Pous Facha,raulpousf@gmail.com,30020,
2,564fa6982bca6a4bc90000fb,Lizette Camila Melendez,kmila782696@hotmail.com,36908,
3,564fc452065f0d959900064e,Andrea Tapia,tapiaandrea.at@gmail.com,42813,
4,566041c364f4e70fdb00008a,María doval,bicha75@hotmail.com,55752,


In [44]:
nu.df = nu.df[(nu.df['store_id']==7)&(nu.df['User ID'].isin(filtrar['User ID']))].copy()

## Date partition

In [45]:
# nu.get_date_partitions(date_col='Delivered at Local Time Dynamic')
# nu.df.head(11)

In [46]:
# nu.df.pivot_table(index='month', columns='year', values='Orders Count', aggfunc=sum)

## User rank

In [47]:
rank_store = nu.user_rank(date_col='Delivered at Local Time Dynamic', cols_to_group=['User ID', 'Store ID', 'Delivered at Local Time Dynamic'], cols_to_sum=['Orders Count'])
rank_store.head(11)

Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,user_rank,is_new,index,prev_date,date_diff,date_diff_group
0,367,7,2020-W12,1,1,New,11,,,
466,367,7,2020-W13,2,2,Not-new,12,11.0,1.0,1.0
950,367,7,2020-W15,1,3,Not-new,14,12.0,2.0,2.0
1535,367,7,2020-W16,1,4,Not-new,15,14.0,1.0,1.0
2234,367,7,2020-W18,1,5,Not-new,17,15.0,2.0,2.0
2955,367,7,2020-W20,1,6,Not-new,19,17.0,2.0,2.0
3671,367,7,2020-W22,1,7,Not-new,21,19.0,2.0,2.0
4411,367,7,2020-W23,1,8,Not-new,22,21.0,1.0,1.0
5093,367,7,2020-W25,1,9,Not-new,24,22.0,2.0,2.0
5765,367,7,2020-W32,1,10,Not-new,31,24.0,7.0,3.0


# Summary

## by store

In [48]:
summary_store = nu.summary(data=rank_store, index=['Store ID', 'Delivered at Local Time Dynamic'], columns='is_new', aggfunc={'User ID':Series.nunique, 'Orders Count':'sum'})
summary_store = summary_store.merge(nu.stores, left_on='Store ID', right_on='store_id')
summary_store.head(11)

Unnamed: 0,Store ID,Delivered at Local Time Dynamic,Orders Count_New,Orders Count_Not-new,User ID_New,User ID_Not-new,store_id,store,store_is_active,store_category_id,store_category,store_family,store_type,store_group
0,7,2020-W01,100.0,,97.0,,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
1,7,2020-W02,258.0,28.0,235.0,20.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
2,7,2020-W03,136.0,107.0,128.0,89.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
3,7,2020-W04,109.0,141.0,104.0,122.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
4,7,2020-W05,80.0,134.0,79.0,113.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
5,7,2020-W06,69.0,166.0,69.0,148.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
6,7,2020-W07,72.0,181.0,70.0,158.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
7,7,2020-W08,77.0,169.0,74.0,152.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
8,7,2020-W09,60.0,247.0,57.0,207.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket
9,7,2020-W10,63.0,212.0,62.0,179.0,7,Costco,True,13,Supermercados,Supermercados,Groceries,Supermarket


## by store-family

In [49]:
# rank_storefam = nu.user_rank(date_col='year_month', cols_to_group=['User ID', 'store_family', 'year_month'], cols_to_sum=['Orders Count'])
# rank_storefam.head(11)

In [50]:
# summary_storefam = nu.summary(data=rank_storefam, index=['store_family', 'year_month'], columns='is_new', aggfunc={'User ID':Series.nunique, 'Orders Count':'sum'})
# summary_storefam.head(11)

# Export

In [51]:
summary_store.to_csv(nu.base_dir.joinpath(f'{nu.file_name}_result_store.csv'), sep='\t', encoding='utf-16', index=False)
# summary_storefam.to_csv(nu.base_dir.joinpath(f'{nu.file_name}_result_storefam.csv'), sep='\t', encoding='utf-16', index=False)