# Parameters

In [1]:
# Directory that holds the input CSV; CustomProducts.summary also writes its
# Excel report into this directory.
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Requests'
# Name of the CSV file, resolved relative to BASE_DIR.
FILE_NAME = 'custom_products.csv'

# Code

In [2]:
from pathlib import Path
from IPython.display import display

from numpy import nan
from emoji import demojize
from re import sub, UNICODE
from unicodedata import normalize
from pandas import DataFrame, ExcelWriter, read_csv

from sklearn.feature_extraction.text import CountVectorizer

class CustomProducts:
    """Summarise custom-product requests stored in a CSV file.

    The class reads ``<base_dir>/<csv_file>``, prints a few ratios about how
    custom products were handled (replaced / found / not found) and exports
    the most frequent words or n-grams of each group to an Excel workbook
    saved next to the CSV.
    """

    def __init__(self, base_dir: str, csv_file: str, stop_words: "list | None" = None) -> None:
        """Store the paths and the stop-word list.

        Args:
            base_dir: Directory containing the CSV; the Excel report is
                written here as well.
            csv_file: CSV file name, relative to ``base_dir``.
            stop_words: Words removed from the cleaned text before counting.
                Defaults to no stop words.

        A missing CSV is only reported with a printed message (nothing is
        raised here); ``summary`` will fail later when it tries to read it.
        """
        # A fresh list per instance: a literal ``[]`` default would be shared
        # by every instance created without an explicit list.
        self.stopwords = [] if stop_words is None else stop_words
        self.base_dir = Path(base_dir)
        self.csv_path = self.base_dir.joinpath(csv_file)
        # Drop only the final extension so dotted names ("a.b.csv" -> "a.b")
        # and extension-less names keep a usable stem for the Excel file.
        self.csv_name = str(Path(csv_file).with_suffix(''))

        if not self.csv_path.is_file():
            print(f'There should be a file called: {csv_file} at:\n{self.base_dir}\n\nAdd it and try again!')

    def clean_text(self, text: str, rem_stop: list, pattern: str = r"[^a-zA-Z\s]", lower: bool = True, emoji: bool = True, to_singular: bool = True) -> str:
        """Normalise a free-text value for word counting.

        Args:
            text: Value to clean; it is converted with ``str`` first.
            rem_stop: Words to drop from the cleaned text.
            pattern: Regex of characters replaced by a space (by default
                everything except ASCII letters and whitespace).
            lower: Lower-case the text.
            emoji: Replace emojis by their textual name via ``demojize``.
            to_singular: Strip one trailing "s" from every word (a crude
                singular form, e.g. "tortillas" -> "tortilla").

        Returns:
            The cleaned text, or ``nan`` when nothing is left (or the input
            was the string "nan").
        """
        # Coerce first so non-string cells (numbers, NaN) reach demojize as text.
        text = str(text)

        # "Translate" emojis into their textual name.
        if emoji:
            text = demojize(text)

        # Decompose accented characters and drop the non-ASCII marks
        # (e.g. "á" -> "a"); newlines are padded so they separate words.
        clean = normalize('NFD', text.replace('\n', ' \n ')).encode('ascii', 'ignore')
        # Replace special characters such as !"#$%&/()= with a space.
        clean = sub(pattern, ' ', clean.decode('utf-8'), flags=UNICODE)

        # Collapse runs of whitespace into a single space.
        clean = sub(r'\s{2,}', ' ', clean.strip())

        if lower:
            clean = clean.lower()
        # The extra trailing space lets the last word match "s" + whitespace too.
        if to_singular:
            clean = sub(r's\s', ' ', clean + ' ')

        # Remove the requested stop words (this also trims leftover spaces).
        clean = ' '.join(word for word in clean.split() if word not in rem_stop)

        # An empty record is reported as a null value.
        if clean in ('', 'nan'):
            return nan
        return clean

    def get_top(self, df: DataFrame, col: str = 'description', top_n: int = 100, **kwargs) -> DataFrame:
        """Count the most frequent terms of a text column.

        Adds a ``<col>_clean`` column to ``df`` (in place) holding the cleaned
        text, then counts terms with ``CountVectorizer``.

        Args:
            df: Data containing ``col``.
            col: Name of the text column to analyse.
            top_n: Passed to ``CountVectorizer`` as ``max_features``.
            **kwargs: Extra keyword arguments forwarded to ``CountVectorizer``.

        Returns:
            A DataFrame indexed by term with a single ``word_count`` column,
            sorted from most to least frequent.
        """
        clean_col = f'{col}_clean'
        # Null results from clean_text become empty documents for the vectorizer.
        df[clean_col] = df[col].map(lambda x: self.clean_text(str(x), rem_stop=self.stopwords)).fillna('')

        cv = CountVectorizer(max_features=top_n, **kwargs)
        cv_fit = cv.fit_transform(df[clean_col])

        # get_feature_names() was removed in scikit-learn 1.2; fall back to it
        # only on versions that predate get_feature_names_out().
        get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
        top = dict(zip(get_names(), cv_fit.toarray().sum(axis=0)))
        top = DataFrame(top, index=['word_count']).T.sort_values('word_count', ascending=False)
        return top

    def summary(
        self,
        top_n: int = 50,
        id_col: str = 'order_id',
        name_col: str = 'custom_product',
        found_col: str = 'was_found',
        replaced_col: str = 'was_replaced',
        replaced_by_col: str = 'replaced_by',
        custom_found_col: str = 'quantity_found',
        total_requested_col: str = 'qty_products_found',
        **kwargs
    ) -> DataFrame:
        """Print the custom-product report and export the top terms to Excel.

        Reads the CSV, prints order-level and product-level ratios, and writes
        ``<csv_name>.xlsx`` in ``base_dir`` with one sheet per group
        ("replaced", "not replaced", "not found") holding its top terms.

        Args:
            top_n: Number of terms kept per group.
            id_col: Column identifying the order.
            name_col: Column with the custom product text (null when the row
                is not a custom product).
            found_col: Boolean column used to select found products.
            replaced_col: Boolean column used to select replaced products.
            replaced_by_col: Column with the replacement text, analysed for
                the "replaced" group.
            custom_found_col: Numerator column of the custom/requested ratio.
            total_requested_col: Denominator column of that ratio.
            **kwargs: Forwarded to ``get_top`` (and on to ``CountVectorizer``).

        Returns:
            The rows of the CSV whose ``name_col`` is not null.
        """
        df = read_csv(self.csv_path)

        # One row per order. Sorting by name puts null names last, so the row
        # kept for an order is a custom product whenever the order has one.
        no_dupli_orders = df.sort_values(name_col).drop_duplicates(id_col)
        n_orders = len(no_dupli_orders)

        non_custom = no_dupli_orders[name_col].isnull().sum()
        tot_custom = n_orders - non_custom
        perc_custom = f'{1 - non_custom / n_orders:.2%}'
        print(f'\nSummary:\n\nTotal orders:\t{n_orders}\nTotal custom:\t{tot_custom}\n% orders with custom products:\t{perc_custom}')

        # From here on only custom-product rows are considered.
        df = df[df[name_col].notnull()].copy()
        n_custom = len(df)

        custom_ratio = df[custom_found_col].sum() / df[total_requested_col].sum()
        print(f'% custom products / products requested:\t{custom_ratio:.2%}\n')

        # Copies, because get_top adds a column to the frame it receives.
        df_replaced = df[df[replaced_col]].copy()
        df_found = df[df[found_col]].copy()
        # Explicit comparison with False is kept on purpose: it is an
        # element-wise test on the column, not a Python truthiness check.
        df_not_replaced = df_found[df_found[replaced_col] == False].copy()
        df_not_found = df[df[found_col] == False].copy()

        perc_replaced = f'{len(df_replaced) / n_custom:.2%}'
        perc_not_replaced = f'{len(df_not_replaced) / n_custom:.2%}'
        perc_not_found = f'{len(df_not_found) / n_custom:.2%}'

        print(f'From the {n_custom} custom products:\n- {len(df_replaced)} ({perc_replaced}) were replaced\n- {len(df_not_replaced)} ({perc_not_replaced}) were not replaced but found\n- {len(df_not_found)} ({perc_not_found}) were not found\n')

        # (sheet name, column to analyse, rows of the group)
        groups = (
            ('replaced', replaced_by_col, df_replaced),
            ('not replaced', name_col, df_not_replaced),
            ('not found', name_col, df_not_found),
        )

        # The context manager saves and closes the workbook, even on error;
        # ExcelWriter.save() no longer exists in pandas 2.x.
        excel_path = self.base_dir.joinpath(f'{self.csv_name}.xlsx')
        with ExcelWriter(excel_path, engine='xlsxwriter') as writer:
            for name, col, data in groups:
                print(f'\nTOP 5 from {name} products:')
                df_top = self.get_top(data, col=col, top_n=top_n, **kwargs)
                display(df_top.head(5))
                df_top.to_excel(writer, sheet_name=name)

        print(f'\nAn Excel file was exported succesfully at:\n{self.base_dir}\nEach sheet contains the top {top_n} products')
        return df

# Summary

In [3]:
# Words removed from the cleaned text before the terms are counted.
# Several entries are written without a trailing "s" ('gracia', 'do', 're'),
# presumably because clean_text strips the final "s" of every word before it
# applies this filter, so the list must match the stripped form.
# NOTE(review): 'la' is listed twice; the duplicate is harmless.
STOPWORDS = [
    'de', 'la', 'que', 'con', 'sin', 'en', 'lo', 'el', 'la', 'un', 'una', 'si', 'no', 'se', 'do',
    'pieza', 'kilo', 'kg', 'gr',
    'para', 're', 'por', 'favor', 'hay', 'muy', 'gracia',
]

# Build the analyser and run the whole report. ngram_range is forwarded
# through summary/get_top to CountVectorizer, so terms of one to three words
# are counted. The returned frame holds only the custom-product rows.
cp = CustomProducts(BASE_DIR, FILE_NAME, STOPWORDS)
df = cp.summary(ngram_range=(1,3))


Summary:

Total orders:	432305
Total custom:	100316
% orders with custom products:	23.20%
% custom products / products requested:	5.90%

From the 158656 custom products:
- 79449 (50.08%) were replaced
- 51907 (32.72%) were not replaced but found
- 27300 (17.21%) were not found


TOP 5 from replaced products:


Unnamed: 0,word_count
pollo,4366
pechuga,3167
organico,2476
manzana,2130
tortilla,2091



TOP 5 from not replaced products:


Unnamed: 0,word_count
pechuga,2471
pollo,1994
pavo,1931
pechuga pavo,1501
tortilla,1483



TOP 5 from not found products:


Unnamed: 0,word_count
marca,1733
paquete,1658
pechuga,1499
bolsa,1328
pavo,1257



An Excel file was exported succesfully at:
/Users/efraflores/Desktop/EF/Corner/Requests
Each sheet contains the top 50 products
