# Parameters

In [1]:
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Requests/new_users_at_store'
FILE_BASE_NAME = 'new_users_at_store'

# Code

In [2]:
# Control de datos
from pathlib import Path

# Ingeniería de variables
from re import search as re_search
from pandas import DataFrame, Series, read_csv, to_datetime

class NewUsers:
    def __init__(self, base_dir: str, file_base_name: str) -> None:
        # Convierte el texto a objeto tipo Path para unir directorios, buscar archivos, etc
        self.base_dir = Path(base_dir)
        # Asigna el nombre base como atributo 
        self.file_name = file_base_name
        # Encuentra todos los archivos que comiencen con el nombre base en el directorio
        self.files_list = [x for x in self.base_dir.glob('*') if re_search(f'{self.file_name}_(?!result).+\.csv', str(x))]

    
    def read_files(self) -> DataFrame:
        '''
        Une todos los archivos que comienzan con el nombre base
        '''
        # Tabla vacía para ir depositando los csv
        self.df = DataFrame()
        # Obtiene el número de archivos, sólo es informativo
        total_files = len(self.files_list)

        for i,file_chunk in enumerate(self.files_list):
            # Obtener sólo el nombre del archivo, no su ubicación completa
            sub_name = str(file_chunk).split('/')[-1]
            aux = read_csv(file_chunk)
            aux['file'] = sub_name
            # Une la tabla anterior con el nuevo archivo
            self.df = self.df.append(aux, ignore_index=True)
            
            # Informa al usuario del avance
            print(f'Archivo {i+1}/{total_files} con nombre: "{sub_name}" es importado exitosamente')


    def get_date_partitions(self, date_col: str='Delivered at Local Time Dynamic')-> None:
        self.df[date_col] = to_datetime(self.df[date_col])
        self.df['year'] = self.df[date_col].dt.year
        self.df['month'] = self.df[date_col].dt.month
        self.df['quarter'] = self.df[date_col].dt.quarter
        self.df['year_month'] = self.df['year'].astype(str) + '_' + self.df['month'].map(lambda x: str(x).zfill(2))
        self.df['year_quarter'] = self.df['year'].astype(str) + '_' + self.df['quarter'].map(lambda x: str(x).zfill(2))


    def user_rank(self, date_col: str, cols_to_group: list, cols_to_sum: list) -> None:
        self.df = self.df.groupby(cols_to_group)[cols_to_sum].sum().reset_index()
        tmp = self.df.groupby([x for x in cols_to_group if x!=date_col]).size()
        rank = tmp.map(range)
        rank =[item for sublist in rank for item in sublist]
        self.df['user_rank'] = rank
        self.df['user_rank'] = self.df['user_rank'] + 1

        self.df['is_new'] = self.df['user_rank'].map(lambda x: 'New' if x==1 else 'Not-new')
        
        tot_dates = self.df[[date_col]].drop_duplicates().sort_values(date_col).reset_index(drop=True)
        tot_dates.reset_index(inplace=True)
        self.df = self.df.merge(tot_dates)
        self.df.sort_values(cols_to_group, inplace=True)
        self.df['prev_date'] = self.df.groupby([x for x in cols_to_group if x!=date_col])['index'].shift()
        self.df['date_diff'] = self.df['index'] - self.df['prev_date']
        self.df['date_diff_group'] = self.df['date_diff'].map(lambda x: 3 if x>=3 else x)


# Transform

## Read files

In [3]:
nu = NewUsers(BASE_DIR, FILE_BASE_NAME)
nu.read_files()
nu.df.head(11)

Archivo 1/3 con nombre: "new_users_at_store_2021.csv" es importado exitosamente
Archivo 2/3 con nombre: "new_users_at_store_2020.csv" es importado exitosamente
Archivo 3/3 con nombre: "new_users_at_store_2022.csv" es importado exitosamente


Unnamed: 0.1,Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,file
0,1,58,22,2021-11,79,new_users_at_store_2021.csv
1,2,1276465,22,2021-10,79,new_users_at_store_2021.csv
2,3,4825433,7,2021-04,73,new_users_at_store_2021.csv
3,4,58,22,2021-12,70,new_users_at_store_2021.csv
4,5,1492255,22,2021-02,68,new_users_at_store_2021.csv
5,6,493678,22,2021-10,68,new_users_at_store_2021.csv
6,7,293276,9,2021-09,68,new_users_at_store_2021.csv
7,8,58,22,2021-10,68,new_users_at_store_2021.csv
8,9,1276465,22,2021-12,68,new_users_at_store_2021.csv
9,10,293276,9,2021-11,67,new_users_at_store_2021.csv


## Date partition

In [4]:
nu.get_date_partitions()
nu.df.head(11)

Unnamed: 0.1,Unnamed: 0,User ID,Store ID,Delivered at Local Time Dynamic,Orders Count,file,year,month,quarter,year_month,year_quarter
0,1,58,22,2021-11-01,79,new_users_at_store_2021.csv,2021,11,4,2021_11,2021_04
1,2,1276465,22,2021-10-01,79,new_users_at_store_2021.csv,2021,10,4,2021_10,2021_04
2,3,4825433,7,2021-04-01,73,new_users_at_store_2021.csv,2021,4,2,2021_04,2021_02
3,4,58,22,2021-12-01,70,new_users_at_store_2021.csv,2021,12,4,2021_12,2021_04
4,5,1492255,22,2021-02-01,68,new_users_at_store_2021.csv,2021,2,1,2021_02,2021_01
5,6,493678,22,2021-10-01,68,new_users_at_store_2021.csv,2021,10,4,2021_10,2021_04
6,7,293276,9,2021-09-01,68,new_users_at_store_2021.csv,2021,9,3,2021_09,2021_03
7,8,58,22,2021-10-01,68,new_users_at_store_2021.csv,2021,10,4,2021_10,2021_04
8,9,1276465,22,2021-12-01,68,new_users_at_store_2021.csv,2021,12,4,2021_12,2021_04
9,10,293276,9,2021-11-01,67,new_users_at_store_2021.csv,2021,11,4,2021_11,2021_04


## User rank

In [5]:
nu.user_rank(date_col='year_month', cols_to_group=['User ID', 'Store ID', 'year_month'], cols_to_sum=['Orders Count'])
nu.df.head(11)

Unnamed: 0,User ID,Store ID,year_month,Orders Count,user_rank,is_new,index,prev_date,date_diff,date_diff_group
0,6,5,2020_01,1,1,New,0,,,
78648,6,5,2020_05,2,2,Not-new,4,0.0,4.0,3.0
360765,6,7,2020_04,1,1,New,3,,,
590655,6,9,2020_09,1,1,New,8,,,
796247,6,9,2020_10,6,2,Not-new,9,8.0,1.0,1.0
1002457,6,9,2020_11,5,3,Not-new,10,9.0,1.0,1.0
1201376,6,9,2020_12,5,4,Not-new,11,10.0,1.0,1.0
1439900,6,9,2021_01,1,5,Not-new,12,11.0,1.0,1.0
1689693,6,9,2021_02,3,6,Not-new,13,12.0,1.0,1.0
1929717,6,9,2021_03,3,7,Not-new,14,13.0,1.0,1.0


# Summary

In [6]:
summary = nu.df.pivot_table(index=['Store ID', 'year_month'], columns='is_new', aggfunc={'User ID':Series.nunique, 'Orders Count':'sum'})
summary.head(11)

Unnamed: 0_level_0,Unnamed: 1_level_0,Orders Count,Orders Count,User ID,User ID
Unnamed: 0_level_1,is_new,New,Not-new,New,Not-new
Store ID,year_month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
5,2020_01,20668.0,,10295.0,
5,2020_02,6028.0,14077.0,4636.0,5439.0
5,2020_03,11122.0,18571.0,8484.0,7191.0
5,2020_04,23221.0,23722.0,15865.0,9693.0
5,2020_05,23180.0,41666.0,16074.0,16196.0
5,2020_06,8211.0,33658.0,6605.0,16783.0
6,2020_01,17474.0,,10710.0,
6,2020_02,7281.0,10026.0,5994.0,4521.0
6,2020_03,14921.0,13675.0,11928.0,6490.0
6,2020_04,34397.0,20980.0,24704.0,10070.0


In [7]:
summary.to_csv(nu.base_dir.joinpath(f'{nu.file_name}_result.csv'))