## Parámetros

In [1]:
# Directory that holds the query results (the CSV files to scan).
# NOTE(review): this is an absolute path on one specific machine — adjust it before running elsewhere.
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/SinBolsa/chat'
# Base name: there may be several files called "name_0", "name_1", etc., and the base name would be "name"
FILE_BASE_NAME = 'chat'
# Words to search for in the chat (ChatNotFound.find_words splits this string on commas)
WORDS = 'sin bolsa'

## Code

In [2]:
from json import loads
from pathlib import Path
from re import UNICODE, escape, findall, search, sub
from typing import Dict
from unicodedata import normalize

from pandas import DataFrame, concat, cut, json_normalize, read_csv, to_datetime

class ChatNotFound:
    """Search chat transcripts stored in CSV chunks for a set of words.

    The pipeline reads every ``<file_base_name>_*.csv`` chunk found in ``base_dir``
    (skipping previously exported ``<file_base_name>_found*.csv`` files), derives
    date features, splits each JSON chat into one text column per user role,
    counts the occurrences of the search words and exports the result.
    """

    def __init__(self, base_dir: str, file_base_name: str, words: str) -> None:
        """Store the search settings and collect the CSV chunks to read.

        Args:
            base_dir: Directory that contains the CSV chunks.
            file_base_name: Base name of the chunks ("name" for "name_0.csv", "name_1.csv", ...).
            words: Comma-separated words/expressions to look for in the chats.
        """
        self.base_dir = Path(base_dir)
        self.words = words
        self.file_name = file_base_name
        # Match against the file name only: matching the whole path would also
        # hit parent directories named "<base>_...". The base name is escaped so
        # it is always treated literally, and the list is sorted so chunks are
        # read in a deterministic order.
        chunk_pattern = f'{escape(self.file_name)}_(?!found)'
        self.files_list = sorted(
            csv_path for csv_path in self.base_dir.glob('*.csv')
            if search(chunk_pattern, csv_path.name)
        )

    def __len__(self) -> int:
        """Return the number of CSV chunks that will be read."""
        return len(self.files_list)

    def __str__(self) -> str:
        """Return a short summary: directory, number of files and search words."""
        return f'Directorio: {self.base_dir}\nCon {len(self)} archivo(s) para buscar las palabras:\n"{self.words}"'

    def read_files(self, **kwargs) -> DataFrame:
        """Read every chunk in ``files_list`` and stack them into one DataFrame.

        Args:
            **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.

        Returns:
            An empty DataFrame when there are no files, the single frame as read
            when there is one file, otherwise the row-wise concatenation of all
            chunks with a fresh 0..n-1 index.
        """
        # 'warn' skips malformed rows while reporting them; it replaces the
        # removed `error_bad_lines=False` option. Callers may override it.
        kwargs.setdefault('on_bad_lines', 'warn')
        frames = [read_csv(file_chunk, **kwargs) for file_chunk in self.files_list]
        if not frames:
            return DataFrame()
        if len(frames) == 1:
            return frames[0]
        # Single concat instead of the removed (and quadratic) DataFrame.append loop.
        return concat(frames, ignore_index=True)

    def date_vars(self, df: DataFrame, cols: list=None) -> DataFrame:
        """Parse date columns and add calendar features next to each of them.

        For every column ``c`` the frame gains ``c_year``, ``c_month`` (YYYY-MM),
        ``c_week`` (ISO year-ISO week), ``c_dayname`` (3-letter day name),
        ``c_hour`` (zero-padded) and ``c_hour_range``.

        Args:
            df: Frame to enrich; it is modified in place and also returned.
            cols: Names of the date columns. Defaults to ``['date']``.

        Returns:
            The same DataFrame with the new columns.
        """
        hour_bins = [-1, 8, 12, 16, 20, 23]
        hour_labels = ['00 to 08', '09 to 12', '13 to 16', '17 to 20', '21 to 23']
        for col in (['date'] if cols is None else cols):
            df[col] = to_datetime(df[col], yearfirst=True)
            iso = df[col].dt.isocalendar()
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[f'{col}_year'].astype(str)+'-'+df[col].dt.month.astype(str).str.zfill(2)
            # ISO weeks belong to the ISO year, which differs from the calendar
            # year around New Year (2021-01-01 is week 53 of ISO year 2020).
            df[f'{col}_week'] = iso['year'].astype(str)+'-'+iso['week'].astype(str).str.zfill(2)
            df[f'{col}_dayname'] = df[col].dt.day_name().str[:3]
            df[f'{col}_hour'] = df[col].dt.hour.astype(str).str.zfill(2)
            # Label the bins directly: same categories as mapping each interval to
            # "<left+1> to <right>", but missing hours simply stay missing.
            df[f'{col}_hour_range'] = cut(df[col].dt.hour, bins=hour_bins, labels=hour_labels)
        return df

    def get_chat(self, x: str) -> Dict:
        """Turn one JSON chat into ``{role: 'msg1--msg2--...'}``.

        The JSON is flattened with ``json_normalize`` and the ``message`` values
        are joined with ``--`` per ``user.metadata.role``.

        Args:
            x: JSON text of the chat. Missing values (non-text or blank) are accepted.

        Returns:
            A dict keyed by role; empty when the chat is missing or the messages
            cannot be grouped by role.
        """
        # Missing cells arrive as NaN/None; treat them as "no chat" instead of crashing.
        if not isinstance(x, (str, bytes, bytearray)) or not x.strip():
            return {}
        expanded_json = json_normalize(loads(x))
        try:
            separated_roles = expanded_json.pivot_table(columns='user.metadata.role', aggfunc={'message': '--'.join})
        except Exception:
            # Deliberate best effort: chats without the expected fields (or with
            # non-text messages) yield no roles. `Exception` rather than a bare
            # except so KeyboardInterrupt/SystemExit still propagate.
            return {}
        return {
            role: values['message']
            for role, values in separated_roles.to_dict().items()
            if 'message' in values
        }

    def clean_text(self, text: str, pattern: str=r"[^a-zA-Z\s]", lower: bool=False, lemma: bool=False, rem_stopw: bool=False, unique: bool=False) -> str:
        """Strip accents and unwanted characters and normalise whitespace.

        Args:
            text: Text to clean (converted with ``str``).
            pattern: Regex of the characters to replace with a space.
            lower: Lower-case the result when True.
            lemma: Accepted for interface compatibility; currently not used.
            rem_stopw: Accepted for interface compatibility; currently not used.
            unique: Keep only the first occurrence of each word when True.

        Returns:
            The cleaned words joined by single spaces.
        """
        # NFD splits accented letters into base letter + combining mark; the
        # ASCII encode with 'ignore' then drops the marks and any non-ASCII char.
        ascii_bytes = normalize('NFD', str(text).replace('\n', ' \n ')).encode('ascii', 'ignore')
        cleaned_text = sub(pattern, ' ', ascii_bytes.decode('utf-8'), flags=UNICODE)
        if lower:
            cleaned_text = cleaned_text.lower()
        words = cleaned_text.split()
        if unique:
            # dict.fromkeys de-duplicates while keeping first-seen order
            # (a set would return the words in an arbitrary order).
            words = list(dict.fromkeys(words))
        return ' '.join(words)

    def find_words(self, x: str, to_find: str) -> list:
        """Return every occurrence in ``x`` of the comma-separated terms in ``to_find``.

        Terms are stripped and lower-cased, then joined into one regex alternation,
        so ``x`` is expected to be lower-cased already by the caller.

        Args:
            x: Text to search in.
            to_find: Comma-separated terms.

        Returns:
            List of matched substrings; empty when there are no usable terms.
        """
        terms = [term.strip().lower() for term in to_find.split(',')]
        # Drop blank terms: an empty alternative would match at every position
        # and flag every text as containing the words.
        terms = [term for term in terms if term]
        if not terms:
            return []
        return findall('|'.join(terms), x)

    def var_msg(self, df: DataFrame, cols: list=None, **kwargs) -> DataFrame:
        """Add message statistics and word-search results for each role column.

        For every role column ``c`` the frame gains ``n_msg_c``, ``n_words_c``,
        ``clean_c``, ``found_c`` and ``n_found_c``; ``found_pattern`` is 1 when any
        ``n_found_*`` column is positive.

        Args:
            df: Frame with one text column per role (messages joined by ``--``);
                it is modified in place and also returned.
            cols: Role columns to process. Defaults to ``['customer', 'shopper']``.
            **kwargs: Forwarded to ``clean_text``.

        Returns:
            The same DataFrame with the new columns.
        """
        for col in (['customer', 'shopper'] if cols is None else cols):
            if col not in df.columns:
                # The role never appeared in any chat: treat it as all-missing.
                df[col] = None
            n_msg, n_words = f'n_msg_{col}', f'n_words_{col}'
            df[n_msg] = df[col].str.split('--').str.len()
            df[n_words] = df[col].str.replace('--', ' ', regex=False).str.split().str.len()
            # Missing chats become '' rather than the literal text "nan".
            present_text = df[col].where(df[col].notna(), '')
            df[f'clean_{col}'] = present_text.map(lambda x: self.clean_text(str(x), **kwargs))
            df[f'found_{col}'] = df[f'clean_{col}'].map(lambda x: self.find_words(x, self.words))
            df[f'n_found_{col}'] = df[f'found_{col}'].map(len)
            # Only the two counters can be NaN (missing chat); fill exactly those
            # instead of every column whose name happens to contain "_<col>".
            df[[n_msg, n_words]] = df[[n_msg, n_words]].fillna(0)
        df['found_pattern'] = (df.filter(like='n_found_').sum(axis=1) > 0)*1
        return df

    def export_csv(self, df: DataFrame, name_suffix=None, **kwargs) -> None:
        """Write ``df`` to ``base_dir`` as ``<file_name>.csv`` or ``<file_name>_<suffix>.csv``.

        Args:
            df: Frame to export.
            name_suffix: Optional suffix appended to the file name.
            **kwargs: Forwarded to ``DataFrame.to_csv``.
        """
        export_name = f'{self.file_name}.csv' if name_suffix is None else f'{self.file_name}_{name_suffix}.csv'
        df.to_csv(self.base_dir.joinpath(export_name), **kwargs)

    def full_pipeline(self) -> DataFrame:
        """Run read -> date features -> chat parsing -> word search -> export.

        Expects the chunks to provide ``order_date`` and ``messages`` columns.
        The result is written to ``<file_name>_found.csv`` (tab-separated,
        UTF-16) inside ``base_dir`` and also returned.
        """
        df = self.read_files()
        df = self.date_vars(df, cols=['order_date'])
        found = DataFrame(df['messages'].map(self.get_chat).tolist(), index=df.index)
        df = df.join(found).drop('messages', axis=1)
        df = self.var_msg(df, pattern=r"[^a-zA-Z0-9\s\-]", lower=True)
        self.export_csv(df, name_suffix='found', index=False, sep='\t', encoding='utf-16')
        return df

## Transform

In [3]:
# Build the searcher from the notebook parameters and print its summary
# (directory, number of matching CSV files and the words to look for).
cnf = ChatNotFound(BASE_DIR, FILE_BASE_NAME, WORDS)
print(cnf)

Directorio: /Users/efraflores/Desktop/EF/Corner/SinBolsa/chat
Con 1 archivo(s) para buscar las palabras:
"sin bolsa"


In [4]:
# Run the whole pipeline; besides returning the frame it also writes
# "<FILE_BASE_NAME>_found.csv" into BASE_DIR.
df = cnf.full_pipeline()
# Show one random row where the words were found. sample() is unseeded, so the
# displayed row changes between runs.
df[df['found_pattern']>0].sample()

Unnamed: 0,id,order_date,store_bags,cornershop_bags,total_bags,order_date_year,order_date_month,order_date_week,order_date_dayname,order_date_hour,...,n_words_customer,clean_customer,found_customer,n_found_customer,n_msg_shopper,n_words_shopper,clean_shopper,found_shopper,n_found_shopper,found_pattern
51905,53676548,2022-03-15 15:32:16.179499,0,1,1,2022,2022-03,2022-11,Tue,15,...,6.0,noo muchas gracias--con bolsa porfavor,[],0,8,66,mario eduardo esta en camino con tu pedido--ma...,[sin bolsa],1,1
