# Parameters

In [1]:
# Directory that NewUsers scans for the input CSV chunks and for stores.csv;
# the export cell at the end of the notebook writes its result files here too.
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere.
BASE_DIR = '/Users/efrain.flores/Desktop/EF/Corner/Requests/new_users_at_store'
# Base name shared by the input chunks; also used as the prefix of the exported result files.
FILE_BASE_NAME = 'new_users_at_store'

# Code

In [2]:
# Data handling
from pathlib import Path

# Feature engineering
from re import escape as re_escape, search as re_search
from pandas import DataFrame, Series, concat, read_csv, to_datetime

class NewUsers:
    '''
    Loads order CSV chunks from a directory, enriches them with store data and
    derives new / not-new user rankings and pivoted summaries.

    Attributes set by ``__init__``:
        base_dir: directory (``Path``) that is scanned for input files.
        file_name: base name shared by the input chunks.
        files_list: sorted paths of the files named ``<file_name>_<suffix>.csv``
            whose suffix does not start with ``result``.
        stores_file: path of ``stores.csv`` inside ``base_dir``.
    '''
    def __init__(self, base_dir: str, file_base_name: str) -> None:
        # Path object so directories can be joined and globbed
        self.base_dir = Path(base_dir)
        self.file_name = file_base_name
        # Match against the bare file name (not the full path) so that a parent
        # directory containing the base name cannot make unrelated files match.
        # The negative lookahead skips "<file_name>_result..." files.
        pattern = rf'^{re_escape(self.file_name)}_(?!result).+\.csv$'
        # Sorted so the import order does not depend on the filesystem
        self.files_list = sorted(x for x in self.base_dir.glob('*') if re_search(pattern, x.name))
        self.stores_file = self.base_dir.joinpath('stores.csv')


    def read_files(self) -> DataFrame:
        '''
        Concatenates every file in ``files_list`` and left-joins ``stores.csv``.

        Each chunk gets a ``file`` column holding its file name. The result is
        stored in ``self.df`` (and returned); the stores table is kept in
        ``self.stores``.

        Raises:
            FileNotFoundError: when no input chunk was found in ``base_dir``.
        '''
        if not self.files_list:
            raise FileNotFoundError(
                f'No files named "{self.file_name}_*.csv" were found in "{self.base_dir}"'
            )

        # Only informative, used in the progress message
        total_files = len(self.files_list)

        chunks = []
        for i,file_chunk in enumerate(self.files_list):
            # File name only, without its directory
            sub_name = file_chunk.name
            aux = read_csv(file_chunk)
            aux['file'] = sub_name
            chunks.append(aux)

            # Progress message for the user
            print(f'Archivo {i+1}/{total_files} con nombre: "{sub_name}" es importado exitosamente')

        # Single concat instead of appending inside the loop
        self.df = concat(chunks, ignore_index=True)

        self.stores = read_csv(self.stores_file)
        self.df = self.df.merge(self.stores, left_on='Store ID', right_on='store_id', how='left')
        return self.df


    def get_date_partitions(self, date_col: str='Delivered at Local Time Dynamic')-> None:
        '''
        Parses ``date_col`` as datetime (in place on ``self.df``) and adds the
        columns ``year``, ``month``, ``quarter``, ``year_month`` and
        ``year_quarter``; the last two are text such as ``2020_06`` / ``2020_02``.
        '''
        dates = to_datetime(self.df[date_col])
        self.df[date_col] = dates
        self.df['year'] = dates.dt.year
        self.df['month'] = dates.dt.month
        self.df['quarter'] = dates.dt.quarter

        year_txt = self.df['year'].astype(str)
        # Zero-padded so the text labels sort chronologically
        self.df['year_month'] = year_txt + '_' + self.df['month'].astype(str).str.zfill(2)
        self.df['year_quarter'] = year_txt + '_' + self.df['quarter'].astype(str).str.zfill(2)


    def user_rank(self, date_col: str, cols_to_group: list, cols_to_sum: list) -> DataFrame:
        '''
        Aggregates ``self.df`` by ``cols_to_group`` and ranks the periods of
        every entity (all grouping columns except ``date_col``).

        Args:
            date_col: period column; must be one of ``cols_to_group``.
            cols_to_group: columns that define one row of the result.
            cols_to_sum: columns summed within each group.

        Returns:
            One row per group, sorted by ``cols_to_group``, with:
            ``user_rank`` (1 for the entity's first period, 2 for the next, ...),
            ``is_new`` ('New' when ``user_rank`` is 1, else 'Not-new'),
            ``index`` (position of the period among the sorted distinct
            ``date_col`` values present in the data),
            ``prev_date`` (``index`` of the entity's previous period),
            ``date_diff`` (``index - prev_date``; counts positions between
            observed periods, not calendar time) and
            ``date_diff_group`` (``date_diff`` capped at 3).
        '''
        # Everything except the period identifies the entity being ranked
        entity_cols = [x for x in cols_to_group if x!=date_col]

        data = self.df.groupby(cols_to_group)[cols_to_sum].sum().reset_index()
        # Rows come out of the groupby sorted by cols_to_group, so inside each
        # entity they are already ordered by date_col wherever it sits in the list
        data['user_rank'] = data.groupby(entity_cols).cumcount() + 1
        data['is_new'] = data['user_rank'].map(lambda x: 'New' if x==1 else 'Not-new')

        # Number the distinct periods 0..n-1 in sorted order ('index' column)
        tot_dates = data[[date_col]].drop_duplicates().sort_values(date_col).reset_index(drop=True)
        tot_dates = tot_dates.reset_index()
        data = data.merge(tot_dates, on=date_col)
        data = data.sort_values(cols_to_group)

        data['prev_date'] = data.groupby(entity_cols)['index'].shift()
        data['date_diff'] = data['index'] - data['prev_date']
        # Gaps of 3 or more periods share one bucket; missing values stay missing
        data['date_diff_group'] = data['date_diff'].clip(upper=3)
        return data


    def summary(self, data: DataFrame, **kwargs) -> DataFrame:
        '''
        Pivots ``data`` with ``DataFrame.pivot_table(**kwargs)`` and flattens
        the resulting column labels.

        Multi-level labels are joined with ``_`` (e.g. ``Orders Count_New``);
        single-level labels are kept unchanged. The pivot index is returned as
        regular columns.
        '''
        data = data.pivot_table(**kwargs)
        # Only tuples (multi-level labels) are joined; joining a plain string
        # label would insert "_" between its characters
        data.columns = ['_'.join(map(str, x)) if isinstance(x, tuple) else x for x in data.columns]
        return data.reset_index()

# Transform

## Read files

In [3]:
# Build the loader from the notebook parameters, import every chunk and preview the result
nu = NewUsers(base_dir=BASE_DIR, file_base_name=FILE_BASE_NAME)
nu.read_files()
nu.df.head()

Archivo 1/3 con nombre: "new_users_at_store_2020.csv" es importado exitosamente
Archivo 2/3 con nombre: "new_users_at_store_2022.csv" es importado exitosamente
Archivo 3/3 con nombre: "new_users_at_store_2022_05.csv" es importado exitosamente


Unnamed: 0.1,Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,file,store_id,store,store_is_active,store_category_id,store_category,store_family,store_type,store_group
0,1,970362,9,2020-06,87,new_users_at_store_2020.csv,9,City Market,True,13,Supermercados,Supermercados,Groceries,Supermarket
1,2,970362,9,2020-07,74,new_users_at_store_2020.csv,9,City Market,True,13,Supermercados,Supermercados,Groceries,Supermarket
2,3,1129947,25,2020-01,71,new_users_at_store_2020.csv,25,HEB,True,13,Supermercados,Supermercados,Groceries,Supermarket
3,4,1492255,22,2020-12,66,new_users_at_store_2020.csv,22,Chedraui,True,13,Supermercados,Supermercados,Groceries,Supermarket
4,5,970362,9,2020-08,65,new_users_at_store_2020.csv,9,City Market,True,13,Supermercados,Supermercados,Groceries,Supermarket


## Date partition

In [4]:
# Adds year / month / quarter / year_month / year_quarter columns to nu.df,
# using the method's default date column, then previews the first rows
nu.get_date_partitions()
nu.df.head(11)

Unnamed: 0.1,Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,file,store_id,store,store_is_active,store_category_id,store_category,store_family,store_type,store_group,year,month,quarter,year_month,year_quarter
0,1,970362,9,2020-06-01,87,new_users_at_store_2020.csv,9,City Market,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,6,2,2020_06,2020_02
1,2,970362,9,2020-07-01,74,new_users_at_store_2020.csv,9,City Market,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,7,3,2020_07,2020_03
2,3,1129947,25,2020-01-01,71,new_users_at_store_2020.csv,25,HEB,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,1,1,2020_01,2020_01
3,4,1492255,22,2020-12-01,66,new_users_at_store_2020.csv,22,Chedraui,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,12,4,2020_12,2020_04
4,5,970362,9,2020-08-01,65,new_users_at_store_2020.csv,9,City Market,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,8,3,2020_08,2020_03
5,6,1129947,25,2020-02-01,61,new_users_at_store_2020.csv,25,HEB,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,2,1,2020_02,2020_01
6,7,2476891,22,2020-05-01,61,new_users_at_store_2020.csv,22,Chedraui,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,5,2,2020_05,2020_02
7,8,58,22,2020-02-01,58,new_users_at_store_2020.csv,22,Chedraui,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,2,1,2020_02,2020_01
8,9,2476891,22,2020-10-01,57,new_users_at_store_2020.csv,22,Chedraui,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,10,4,2020_10,2020_04
9,10,2207520,22,2020-08-01,56,new_users_at_store_2020.csv,22,Chedraui,True,13,Supermercados,Supermercados,Groceries,Supermarket,2020,8,3,2020_08,2020_03


## User rank

In [5]:
# Rank each user's months at every store
rank_store = nu.user_rank(
    date_col='year_month',
    cols_to_group=['User ID', 'Store ID', 'year_month'],
    cols_to_sum=['Orders Count'],
)
rank_store.head(11)

Unnamed: 0,User ID,Store ID,year_month,Orders Count,user_rank,is_new,index,prev_date,date_diff,date_diff_group
0,6,5,2020_01,1,1,New,0,,,
78648,6,5,2020_05,2,2,Not-new,4,0.0,4.0,3.0
360765,6,7,2020_04,1,1,New,3,,,
590655,6,9,2020_09,1,1,New,8,,,
796247,6,9,2020_10,6,2,Not-new,9,8.0,1.0,1.0
1002457,6,9,2020_11,5,3,Not-new,10,9.0,1.0,1.0
1201376,6,9,2020_12,5,4,Not-new,11,10.0,1.0,1.0
1439900,6,9,2022_01,1,5,Not-new,12,11.0,1.0,1.0
1745150,6,9,2022_03,1,6,Not-new,14,12.0,2.0,2.0
2033450,6,9,2022_04,2,7,Not-new,15,14.0,1.0,1.0


# Summary

## by store

In [6]:
# Distinct users and total orders per store and month, split by is_new,
# then enriched with the store attributes
summary_store = nu.summary(
    data=rank_store,
    index=['Store ID', 'year_month'],
    columns='is_new',
    aggfunc={'User ID': Series.nunique, 'Orders Count': 'sum'},
).merge(nu.stores, left_on='Store ID', right_on='store_id')
summary_store.head(11)

Unnamed: 0,Store ID,year_month,Orders Count_New,Orders Count_Not-new,User ID_New,User ID_Not-new,store_id,store,store_is_active,store_category_id,store_category,store_family,store_type,store_group
0,5,2020_01,20668.0,,10295.0,,5,Superama,False,13,Supermercados,Supermercados,Groceries,Supermarket
1,5,2020_02,6028.0,14077.0,4636.0,5439.0,5,Superama,False,13,Supermercados,Supermercados,Groceries,Supermarket
2,5,2020_03,11122.0,18571.0,8484.0,7191.0,5,Superama,False,13,Supermercados,Supermercados,Groceries,Supermarket
3,5,2020_04,23221.0,23722.0,15865.0,9693.0,5,Superama,False,13,Supermercados,Supermercados,Groceries,Supermarket
4,5,2020_05,23180.0,41666.0,16074.0,16196.0,5,Superama,False,13,Supermercados,Supermercados,Groceries,Supermarket
5,5,2020_06,8211.0,33658.0,6605.0,16783.0,5,Superama,False,13,Supermercados,Supermercados,Groceries,Supermarket
6,6,2020_01,17474.0,,10710.0,,6,Walmart,False,13,Supermercados,Supermercados,Groceries,Supermarket
7,6,2020_02,7281.0,10026.0,5994.0,4521.0,6,Walmart,False,13,Supermercados,Supermercados,Groceries,Supermarket
8,6,2020_03,14921.0,13675.0,11928.0,6490.0,6,Walmart,False,13,Supermercados,Supermercados,Groceries,Supermarket
9,6,2020_04,34397.0,20980.0,24704.0,10070.0,6,Walmart,False,13,Supermercados,Supermercados,Groceries,Supermarket


## by store-family

In [7]:
# Rank each user's months within every store family
rank_storefam = nu.user_rank(
    date_col='year_month',
    cols_to_group=['User ID', 'store_family', 'year_month'],
    cols_to_sum=['Orders Count'],
)
rank_storefam.head(11)

Unnamed: 0,User ID,store_family,year_month,Orders Count,user_rank,is_new,index,prev_date,date_diff,date_diff_group
0,6,Express,2020_12,1,1,New,11,,,
163051,6,Express,2022_03,1,2,Not-new,14,11.0,3.0,3.0
383821,6,Gourmet & Vida sana,2020_04,2,1,New,3,,,
1,6,Gourmet & Vida sana,2020_12,1,2,Not-new,11,3.0,8.0,3.0
531435,6,"Mascotas, Hogar, trabajo",2020_03,2,1,New,2,,,
619678,6,"Mascotas, Hogar, trabajo",2020_06,1,2,Not-new,5,2.0,3.0,3.0
783241,6,"Mascotas, Hogar, trabajo",2022_04,1,3,Not-new,15,5.0,10.0,3.0
984542,6,Supermercados,2020_01,2,1,New,0,,,
1041368,6,Supermercados,2020_02,2,2,Not-new,1,0.0,1.0,1.0
531436,6,Supermercados,2020_03,5,3,Not-new,2,1.0,1.0,1.0


In [8]:
# Distinct users and total orders per store family and month, split by is_new
summary_storefam = nu.summary(
    data=rank_storefam,
    index=['store_family', 'year_month'],
    columns='is_new',
    aggfunc={'User ID': Series.nunique, 'Orders Count': 'sum'},
)
summary_storefam.head(11)

Unnamed: 0,store_family,year_month,Orders Count_New,Orders Count_Not-new,User ID_New,User ID_Not-new
0,"Belleza, Moda & Deportes",2020_01,37.0,,34.0,
1,"Belleza, Moda & Deportes",2020_02,48.0,10.0,40.0,6.0
2,"Belleza, Moda & Deportes",2020_03,81.0,19.0,75.0,13.0
3,"Belleza, Moda & Deportes",2020_04,89.0,11.0,84.0,8.0
4,"Belleza, Moda & Deportes",2020_05,382.0,22.0,350.0,19.0
5,"Belleza, Moda & Deportes",2020_06,507.0,108.0,470.0,78.0
6,"Belleza, Moda & Deportes",2020_07,564.0,146.0,512.0,118.0
7,"Belleza, Moda & Deportes",2020_08,464.0,176.0,428.0,143.0
8,"Belleza, Moda & Deportes",2020_09,421.0,216.0,387.0,176.0
9,"Belleza, Moda & Deportes",2020_10,592.0,300.0,535.0,241.0


# Export

In [10]:
# Write both summaries next to the inputs as tab-separated UTF-16 files;
# the "_result" infix keeps them from being picked up as input chunks
summary_store.to_csv(
    nu.base_dir / f'{nu.file_name}_result_store.csv',
    sep='\t',
    encoding='utf-16',
    index=False,
)
summary_storefam.to_csv(
    nu.base_dir / f'{nu.file_name}_result_storefam.csv',
    sep='\t',
    encoding='utf-16',
    index=False,
)