# Parameters

In [4]:
# Directory that holds the input CSV; the Excel report is written here too.
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Requests'
# Name of the CSV file (inside BASE_DIR) with the custom-product rows.
FILE_NAME = 'custom_products_convenience.csv'

# Code

In [5]:
from pathlib import Path
from IPython.display import display

from numpy import nan
from emoji import demojize
from re import sub, UNICODE
from unicodedata import normalize
from pandas import DataFrame, ExcelWriter, read_csv

from sklearn.feature_extraction.text import CountVectorizer

class CustomProducts:
    """Report on custom (free-text) products requested per store.

    Reads a CSV from ``base_dir``, prints per-store statistics and exports
    the most frequent words/n-grams to an Excel workbook placed next to
    the CSV.
    """

    def __init__(self, base_dir: str, csv_file: str, stop_words: list=None) -> None:
        """Store the paths and stop words used by the other methods.

        Args:
            base_dir: Directory that contains ``csv_file``.
            csv_file: File name of the CSV, relative to ``base_dir``.
            stop_words: Words removed from the cleaned text. ``None`` (the
                default) means no stop words.
        """
        # A fresh list per instance: a literal ``[]`` default would be
        # shared by every instance created without stop words.
        self.stopwords = [] if stop_words is None else list(stop_words)
        self.base_dir = Path(base_dir)
        self.csv_path = self.base_dir.joinpath(csv_file)
        # Drop only the final extension, keeping any other dot in the name.
        self.csv_name = str(Path(csv_file).with_suffix(''))

        # Best effort: warn now, the actual failure happens in ``summary``.
        if not self.csv_path.is_file():
            print(f'There should be a file called: {self.csv_name} at:\n{self.base_dir}\n\nAdd it and try again!')

    @staticmethod
    def _percent(numerator, denominator) -> str:
        """Format ``numerator / denominator`` as ``'12.34%'`` ('n/a' when dividing by zero)."""
        if denominator == 0:
            return 'n/a'
        return "{:.2%}".format(numerator / denominator)

    def clean_text(self, text: str, rem_stop: list, pattern: str=r"[^a-zA-Z\s]", lower: bool=True, emoji: bool=True, to_singular: bool=True) -> str:
        """Normalise a free-text value.

        Args:
            text: Value to clean; it is converted with ``str`` first.
            rem_stop: Words removed from the result. They are compared
                after the lowercase / singular steps have been applied.
            pattern: Regex of the characters replaced by a space.
            lower: Lowercase the text.
            emoji: Replace emojis by their text alias via ``demojize``.
            to_singular: Remove the trailing "s" of every word.

        Returns:
            The cleaned text, or ``nan`` when nothing is left (or the
            result is the literal string ``'nan'``).
        """
        text = str(text)
        # "Translate" emojis into their textual alias.
        if emoji:
            text = demojize(text)

        # Strip accents: decompose the characters and drop the non-ASCII marks.
        ascii_bytes = normalize('NFD', text.replace('\n', ' \n ')).encode('ascii', 'ignore')
        # Replace every character matched by ``pattern`` with a space.
        clean = sub(pattern, ' ', ascii_bytes.decode('utf-8'), flags=UNICODE)

        # Collapse runs of whitespace into a single space.
        clean = sub(r'\s{2,}', ' ', clean.strip())

        if lower:
            clean = clean.lower()
        # An "s" followed by whitespace is the last letter of a word; the
        # appended space makes the final word qualify as well.
        if to_singular:
            clean = sub(r's\s', ' ', clean + ' ')

        # Drop the requested stop words.
        clean = ' '.join([word for word in clean.split() if word not in rem_stop])

        # Empty records are reported as null.
        if clean in ('', 'nan'):
            clean = nan
        return clean

    def get_top(self, df: DataFrame, col: str='description', top_n=100, **kwargs) -> DataFrame:
        """Count the most frequent terms of a text column.

        A ``{col}_clean`` column with the cleaned text is added to ``df``
        in place.

        Args:
            df: Frame that contains ``col``.
            col: Name of the text column.
            top_n: Passed to ``CountVectorizer`` as ``max_features``.
            **kwargs: Extra ``CountVectorizer`` arguments (e.g. ``ngram_range``).

        Returns:
            Frame indexed by term with one ``word_count`` column, sorted in
            descending order. It is empty when the column yields no terms.
        """
        clean_col = f'{col}_clean'
        df[clean_col] = df[col].map(lambda x: self.clean_text(str(x), rem_stop=self.stopwords)).fillna('')

        cv = CountVectorizer(max_features=top_n, **kwargs)
        try:
            cv_fit = cv.fit_transform(df[clean_col])
        except ValueError as err:
            # No rows, or only empty/stop-word text: report an empty
            # ranking instead of aborting the whole export.
            if 'empty vocabulary' not in str(err):
                raise
            return DataFrame(columns=['word_count'])

        # ``get_feature_names`` is not available on recent scikit-learn releases.
        if hasattr(cv, 'get_feature_names_out'):
            features = cv.get_feature_names_out()
        else:
            features = cv.get_feature_names()

        counts = cv_fit.toarray().sum(axis=0)
        top = DataFrame({'word_count': counts}, index=list(features))
        return top.sort_values('word_count', ascending=False)

    def summary(self, store_type_col: str='store_type', store_col: str='store', top_n: int=50, id_col: str='order_id', name_col: str='custom_product', found_col: str='was_found', replaced_col: str='was_replaced', replaced_by_col: str='replaced_by', custom_found_col: str='quantity_found', total_requested_col: str='qty_products_found', **kwargs) -> DataFrame:
        """Print per-store statistics and export the top terms to Excel.

        The workbook is written to ``base_dir`` under the CSV's name with
        an ``.xlsx`` extension, one sheet per store and kind of ranking.

        Args:
            store_type_col, store_col, id_col, name_col, found_col,
            replaced_col, replaced_by_col, custom_found_col,
            total_requested_col: Column names read from the CSV.
            top_n: Number of terms kept per sheet.
            **kwargs: Forwarded to ``get_top`` / ``CountVectorizer``.

        Returns:
            The custom-product rows of the last store processed (an empty
            frame when the CSV has no store).
        """
        data = read_csv(self.csv_path)
        xlsx_path = self.base_dir.joinpath(f'{self.csv_name}.xlsx')
        custom_rows = DataFrame()

        # The context manager saves and closes the workbook, also on errors.
        with ExcelWriter(xlsx_path, engine='xlsxwriter') as writer:
            # ``unique`` keeps the order of appearance, so the report and
            # the sheet order are the same on every run.
            for store_type in data[store_type_col].dropna().unique():
                print(store_type.upper())
                type_rows = data[data[store_type_col] == store_type].copy()

                for store in type_rows[store_col].dropna().unique():
                    store_rows = type_rows[type_rows[store_col] == store].copy()

                    n_orders = len(store_rows[id_col].drop_duplicates())
                    # Sorting puts null names last, so the row kept for an
                    # order is null only when it has no custom product.
                    one_per_order = store_rows.sort_values(name_col).drop_duplicates(id_col)

                    non_custom = one_per_order[name_col].isnull().sum()
                    tot_custom = n_orders - non_custom
                    perc_custom = "{:.2%}".format(1 - non_custom / n_orders)
                    print('\n'*2, '-'*44, '\n'*2)
                    print(
                        f'{store_type.title()} summary from {store}:\n\n'
                        f'Total orders:\t{n_orders}\n'
                        f'Total custom:\t{tot_custom}\n'
                        f'% orders with custom products:\t{perc_custom}'
                    )

                    custom_rows = store_rows[store_rows[name_col].notnull()].copy()
                    n_custom = len(custom_rows)
                    if n_custom == 0:
                        # Nothing to break down; avoids dividing by zero below.
                        print('No custom products were requested; skipping the breakdown.\n')
                        continue

                    share = self._percent(custom_rows[custom_found_col].sum(), custom_rows[total_requested_col].sum())
                    print(f'% custom products / products requested:\t{share}\n')

                    df_replaced = custom_rows[custom_rows[replaced_col]].copy()
                    df_replaced_by_catalog = df_replaced[df_replaced[replaced_by_col].notnull()].copy()
                    df_found = custom_rows[custom_rows[found_col]].copy()
                    df_not_replaced = df_found[df_found[replaced_col].eq(False)].copy()
                    df_not_found = custom_rows[custom_rows[found_col].eq(False)].copy()

                    n_replaced = len(df_replaced)
                    n_by_catalog = len(df_replaced_by_catalog)
                    n_not_by_catalog = n_replaced - n_by_catalog

                    print(
                        f'From the {n_custom} custom products:\n'
                        f'- {n_replaced} ({self._percent(n_replaced, n_custom)}) were replaced\n'
                        f'  - {n_by_catalog} ({self._percent(n_by_catalog, n_custom)}) had at least an option in our catalog\n'
                        f'  - {n_not_by_catalog} ({self._percent(n_not_by_catalog, n_custom)}) had no options in our catalog\n'
                        f'- {len(df_not_replaced)} ({self._percent(len(df_not_replaced), n_custom)}) were not replaced but found\n'
                        f'- {len(df_not_found)} ({self._percent(len(df_not_found), n_custom)}) were not found\n'
                    )

                    # The loop variable must not be called ``data``: that
                    # name holds the full CSV used by the outer loops.
                    rankings = (
                        ('suggested', replaced_by_col, df_replaced),
                        ('custom_products', name_col, df_not_found),
                    )
                    for label, text_col, frame in rankings:
                        print(f'\nTOP 5 from {store_type}: {store} {label} products:')
                        df_top = self.get_top(frame, col=text_col, top_n=top_n, **kwargs)
                        display(df_top.head(5))
                        # Sheet names are cut to 31 characters.
                        sheet_name = f'{store_type[:4]}_{store.replace(" - ","")}_{label}'[:31]
                        df_top.to_excel(writer, sheet_name=sheet_name)

        print(f'\nAn Excel file was exported succesfully at:\n{self.base_dir}\nEach sheet contains the top {top_n} products')
        return custom_rows

# Summary

In [6]:
# Words ignored when counting terms. clean_text removes the trailing "s" of
# every word before filtering, so entries are written in that stripped form
# (e.g. 'gracia', 'do', 're').
STOPWORDS = [
    'de', 'la', 'que', 'con', 'sin', 'en', 'lo', 'el', 'un', 'una', 'si', 'no', 'se', 'do',
    'pieza', 'kilo', 'kg', 'gr',
    'para', 're', 'por', 'favor', 'hay', 'muy', 'gracia',
]

cp = CustomProducts(BASE_DIR, FILE_NAME, STOPWORDS)
# ngram_range is forwarded to CountVectorizer: count 1- to 3-word terms.
df = cp.summary(ngram_range=(1,3))

CONVENIENCE


 -------------------------------------------- 


Convenience summary from 2712 - Extra:

Total orders:	2561
Total custom:	308
% orders with custom products:	12.03%
% custom products / products requested:	49.06%

From the 374 custom products:
- 150 (40.11%) were replaced
  - 150 (40.11%) had at least an option in our catalog
  - 0 (0.00%) had no options in our catalog
- 179 (47.86%) were not replaced but found
- 45 (12.03%) were not found


TOP 5 from Convenience: 2712 - Extra suggested products:


Unnamed: 0,word_count
cerveza,41
clara,20
cerveza clara,20
sabor,14
cigarro,12



TOP 5 from Convenience: 2712 - Extra custom_products products:


Unnamed: 0,word_count
litro,4
coca cola,4
bolsa,4
rojo,4
lata,4




 -------------------------------------------- 


Convenience summary from 2648 - Circle K:

Total orders:	6589
Total custom:	588
% orders with custom products:	8.92%
% custom products / products requested:	64.81%

From the 706 custom products:
- 284 (40.23%) were replaced
  - 284 (40.23%) had at least an option in our catalog
  - 0 (0.00%) had no options in our catalog
- 335 (47.45%) were not replaced but found
- 87 (12.32%) were not found


TOP 5 from Convenience: 2648 - Circle K suggested products:


Unnamed: 0,word_count
cerveza,89
cigarro,78
red,54
filtro,53
cigarro filtro,53



TOP 5 from Convenience: 2648 - Circle K custom_products products:


Unnamed: 0,word_count
cajetilla,16
cigarro,13
marlboro,11
rojo,7
marlboro rojo,6




 -------------------------------------------- 


Convenience summary from 2902 - OXXO:

Total orders:	6300
Total custom:	822
% orders with custom products:	13.05%
% custom products / products requested:	40.34%

From the 984 custom products:
- 267 (27.13%) were replaced
  - 267 (27.13%) had at least an option in our catalog
  - 0 (0.00%) had no options in our catalog
- 550 (55.89%) were not replaced but found
- 167 (16.97%) were not found


TOP 5 from Convenience: 2902 - OXXO suggested products:


Unnamed: 0,word_count
sabor,40
chocolate,29
cerveza,25
fresa,14
clara,12



TOP 5 from Convenience: 2902 - OXXO custom_products products:


Unnamed: 0,word_count
cajetilla,44
marlboro,44
rojo,30
cigarro,26
marlboro rojo,24



An Excel file was exported succesfully at:
/Users/efraflores/Desktop/EF/Corner/Requests
Each sheet contains the top 50 products
