# Aguacate quality

## Parameters

In [1]:
# Directory that is scanned for the input files and that receives the exported result.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Catalog/Search_without_results'
# Input files are selected by comparing this value with their file name minus
# its last underscore-separated part (e.g. 'aguacate_210801.csv').
FILE_BASE_NAME = 'aguacate'
# Comma-separated terms that are searched for in the cleaned chat messages.
WORDS = 'maduro,verde,echado,perder,feo,crudo'

## Import

In [2]:
from pathlib import Path

# Normalise the configured directory to a Path (safe to re-run: Path(Path) is a no-op).
BASE_DIR = Path(BASE_DIR)
# Keep only files named '<FILE_BASE_NAME>_<suffix>': everything before the LAST
# underscore of the file name must equal the base name.
#  - `Path.name` replaces splitting on '/', which only worked with POSIX separators.
#  - `rpartition` keeps inner underscores intact, so a base name that itself
#    contains '_' can match, and 'agua_cate_x.csv' no longer matches 'aguacate'.
#  - The result is sorted because glob() yields entries in arbitrary order.
FILE_LIST = sorted(
    path for path in BASE_DIR.glob('*')
    if path.is_file() and path.name.rpartition('_')[0] == FILE_BASE_NAME
)
print(FILE_LIST)

[PosixPath('/Users/efraflores/Desktop/EF/Corner/Catalog/Search_without_results/aguacate_210802.csv'), PosixPath('/Users/efraflores/Desktop/EF/Corner/Catalog/Search_without_results/aguacate_210801.csv')]


In [3]:
import pandas as pd

# Read every matching file first, then stack them once. DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
frames = [pd.read_csv(file_chunk) for file_chunk in FILE_LIST]
# pd.concat raises on an empty list, so fall back to an empty frame as before.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.sample()

Unnamed: 0,order_id,date,city,store_id,store,user_id,messages
3126,39083917,2021-08-03T13:57:52.466798,Aguascalientes,25,HEB,7888620,"[{""data"": ""{\""type\"": \""EVENT\"", \""content\"": ..."


## Functions

### Date variables

In [4]:
def date_vars(data, cols=('date',)):
    """Derive calendar features from one or more date columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified.
    cols : iterable of str or str
        Names of the columns to parse with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` where every column in ``cols`` is converted to
        datetime and, for each such column ``c``, these columns are added:
        ``c_year``, ``c_month`` ('YYYY-MM'), ``c_week`` ('YYYY-WW', ISO year
        and ISO week), ``c_dayname`` (first three letters of the day name),
        ``c_hour`` ('HH') and ``c_hour_range`` (e.g. '09 to 12').
    """
    if isinstance(cols, str):
        # A bare column name would otherwise be iterated character by character.
        cols = [cols]
    df = data.copy()
    # Right-closed hour buckets: (-1, 8], (8, 12], (12, 16], (16, 20], (20, 23].
    hour_bins = [-1, 8, 12, 16, 20, 23]
    # Human-readable label per bucket, e.g. (-1, 8] -> '00 to 08'.
    hour_labels = [str(low + 1).zfill(2) + ' to ' + str(high).zfill(2)
                   for low, high in zip(hour_bins, hour_bins[1:])]
    for col in cols:
        df[col] = pd.to_datetime(df[col], yearfirst=True)
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[f'{col}_year'].astype(str)+'-'+df[col].dt.month.astype(str).str.zfill(2)
        # The ISO week number belongs to the ISO year, which differs from the
        # calendar year around New Year (2021-01-01 is week 53 of 2020).
        iso_calendar = df[col].dt.isocalendar()
        df[f'{col}_week'] = iso_calendar['year'].astype(str)+'-'+iso_calendar['week'].astype(str).str.zfill(2)
        df[f'{col}_dayname'] = df[col].dt.day_name().str[:3]
        df[f'{col}_hour'] = df[col].dt.hour.astype(str).str.zfill(2)
        # Passing the labels to pd.cut avoids mapping a lambda over Interval
        # objects, which has nothing to read `.left` from when an hour is missing.
        df[f'{col}_hour_range'] = pd.cut(df[col].dt.hour, bins=hour_bins, labels=hour_labels)
    return df

### Expand

In [5]:
import json

def get_chat(x):
    """Collapse one serialized chat log into a single string per sender role.

    Parameters
    ----------
    x : str
        JSON text that ``json.loads`` turns into records which
        ``pd.json_normalize`` can flatten.
        NOTE(review): assumes the flattened records expose the columns
        'message' and 'user.metadata.role' - confirm against the real payload.

    Returns
    -------
    dict
        ``{role: messages}`` where the messages of each role are joined with
        '--' in their original order. Roles without a usable 'message' entry
        are left out.
    """
    expanded_json = pd.json_normalize(json.loads(x))
    separated_roles = expanded_json.pivot_table(columns='user.metadata.role', aggfunc={'message':'--'.join})
    # Convert once instead of once per role.
    try:
        messages_by_role = separated_roles.to_dict()
    except Exception:
        # Best effort, as before: an unusable table yields no chat columns.
        return {}
    correct_dict = {}
    for col in separated_roles.columns:
        try:
            correct_dict[col] = messages_by_role[col]['message']
        except Exception:
            # Skip this role only; unlike a bare `except`, this does not
            # swallow KeyboardInterrupt/SystemExit while mapping many rows.
            continue
    return correct_dict

### Clean text

In [6]:
#Uncomment the following lines if this is the first time you run these packages
'''
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
'''
import re
import unicodedata
from emoji import demojize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance; clean_text uses it when lemma=True.
lem = WordNetLemmatizer()

def clean_text(text, language='english', pattern=r"[^a-zA-Z\s]", add_stopw=None,
                lower=False, lemma=False, rem_stopw=False, unique=False, emoji=False):
    """Normalise a piece of text and return it as space-separated words.

    Parameters
    ----------
    text : object
        Text to clean; it is converted with ``str`` first.
    language : str
        Language passed to ``stopwords.words`` when ``rem_stopw`` is set.
    pattern : str
        Regular expression; every match is replaced by a space.
    add_stopw : iterable of str, optional
        Extra words to drop together with the stop words.
    lower : bool
        Lower-case the text before splitting it into words.
    lemma : bool
        Lemmatize every word as a verb with the shared ``lem`` instance.
    rem_stopw : bool
        Drop stop words (and ``add_stopw``).
    unique : bool
        Keep only the first occurrence of every word.
    emoji : bool
        Replace emojis by their textual name with ``demojize``.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Coerce first so that demojize never receives a non-string value.
    text = str(text)
    if emoji: text = demojize(text)
    # NFD splits accented letters into base letter + combining mark; the ASCII
    # round trip then drops the marks (and any other non-ASCII character).
    ascii_bytes = unicodedata.normalize('NFD',text.replace('\n',' \n ')).encode('ascii', 'ignore')
    replaced = re.sub(pattern,' ',ascii_bytes.decode('utf-8'),flags=re.UNICODE)
    words = (replaced.lower() if lower else replaced).split()
    if lemma:
        words = [lem.lemmatize(word,pos='v') for word in words]
    if rem_stopw:
        # Build the lookup once; it used to be rebuilt for every single word.
        excluded = set(stopwords.words(language)).union(add_stopw or ())
        words = [word for word in words if word not in excluded]
    if unique:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is reproducible (a set has no stable order across runs).
        words = list(dict.fromkeys(words))
    return ' '.join(words)

#Ex
# Demonstration of clean_text: the same sample sentence with different options.
ex = "I am going to run!!! I ran while I was running??? ..."
print('\nOriginal:\t\t',ex)
print('Basic cleaning:\t\t',clean_text(ex))
# Raw string: '\.' inside a normal string literal is an invalid escape sequence.
print('Changing the pattern:\t',clean_text(ex,pattern=r"[^a-zA-Z!\.]"))
print('Without stopwords:\t',clean_text(ex,rem_stopw=True))
print('Lower and lemma:\t',clean_text(ex,lower=True,lemma=True))
print('Super cleaning:\t\t',clean_text(ex,add_stopw=['go'],lower=True,rem_stopw=True,lemma=True,unique=True))
# Clean exactly the text shown on the FROM line; the literal that used to be
# cleaned differed from the displayed one by one accent.
accented = "ThÈ ÉfrâïsMã's?..."
print(f"\nIt actually corrects the weird accents, example\n\tFROM:\t {accented}\n\tTO:\t",clean_text(accented,lower=True))
print("\nAnd now, it can translate emojis!!! 😍",clean_text('😍', emoji=True))


Original:		 I am going to run!!! I ran while I was running??? ...
Basic cleaning:		 I am going to run I ran while I was running
Changing the pattern:	 I am going to run!!! I ran while I was running ...
Without stopwords:	 I going run I ran I running
Lower and lemma:	 i be go to run i run while i be run
Super cleaning:		 run

It actually corrects the weird accents, example
	FROM:	 ThÈ ÉfrâïsMã's?...
	TO:	 the efraisma s

And now, it can translate emojis!!! 😍 smiling face with heart eyes


### Find words

In [7]:
def find_words(x, to_find=WORDS):
    """Return every occurrence of the searched terms inside ``x``.

    Parameters
    ----------
    x : str
        Text to scan. Terms are lower-cased before searching, the text is not.
    to_find : str
        Comma-separated terms; surrounding whitespace is ignored.

    Returns
    -------
    list of str
        The matched terms in order of appearance (substring matches count,
        no word boundaries are applied). Empty when there is nothing to search.
    """
    stripped = (term.strip().lower() for term in to_find.split(','))
    # Drop empty terms: an empty alternative (e.g. from a trailing comma) would
    # match at every position and report spurious '' hits. Escaping makes
    # terms be matched literally even when they contain regex metacharacters.
    alternatives = [re.escape(term) for term in stripped if term]
    if not alternatives:
        return []
    return re.findall('|'.join(alternatives), x)

### Message variables

In [8]:
import re

def var_msg(data, cols=('customer', 'shopper'), **kwargs):
    """Add message statistics and searched-term hits for each chat column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified.
    cols : iterable of str
        Columns holding the messages of one role joined with '--'.
    **kwargs
        Forwarded unchanged to ``clean_text``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with, for every column ``c`` in ``cols``:
        ``n_msg_c`` (number of '--'-separated messages), ``n_words_c``,
        ``clean_c`` (cleaned text), ``found_c`` (list returned by
        ``find_words``) and ``n_found_c``; plus a boolean ``found`` column
        that is True when any ``n_found_*`` column is positive.
    """
    df = data.copy()
    for col in cols:
        raw = df[col]
        df[f'n_msg_{col}'] = raw.str.split('--').str.len()
        df[f'n_words_{col}'] = raw.str.replace('--',' ',regex=False).str.split().str.len()
        # A missing chat is cleaned as empty text. Stringifying it produced the
        # literal word 'nan', which was then stored and searched as if a
        # person had written it.
        df[f'clean_{col}'] = raw.where(raw.notna(), '').map(lambda x: clean_text(str(x), **kwargs))
        df[f'found_{col}'] = df[f'clean_{col}'].map(find_words)
        df[f'n_found_{col}'] = df[f'found_{col}'].map(len)
        # Counts are NaN where the chat is missing; report them as 0.
        derived = df.filter(like=f'_{col}').columns
        df = df.fillna({name: 0 for name in derived})
    df['found'] = df.filter(like='n_found_').sum(axis=1) > 0
    return df

## Transform

### Date variables

In [9]:
# Add the calendar features derived from the default 'date' column.
df = date_vars(df)
# Display one random row as a quick visual check.
df.sample()

Unnamed: 0,order_id,date,city,store_id,store,user_id,messages,date_year,date_month,date_week,date_dayname,date_hour,date_hour_range
4365,39561681,2021-08-11 10:00:00,Ciudad de México,22,Chedraui,680686,"[{""data"": ""{\""content\"":\""Buenos días Rocío: Y...",2021,2021-08,2021-32,Wed,10,09 to 12


### Expand

In [10]:
# Parse every raw chat into a {role: joined messages} dict, then spread those
# dicts into one column per role, aligned on the original index.
chat_by_role = pd.DataFrame([get_chat(raw_chat) for raw_chat in df['messages']], index=df.index)
# Attach the role columns and discard the raw JSON they were parsed from.
df = df.join(chat_by_role).drop(columns='messages')
df.sample()

Unnamed: 0,order_id,date,city,store_id,store,user_id,date_year,date_month,date_week,date_dayname,date_hour,date_hour_range,customer,shopper
3075,39072569,2021-08-03 12:00:00,Ciudad de México,22,Chedraui,263044,2021,2021-08,2021-31,Tue,12,09 to 12,,Victor Manuel is on their way with your order-...


### Message variables

In [11]:
# Keep letters, digits, whitespace and '-' (so the '--' message separator
# survives cleaning). Raw string: '\s' and '\-' are invalid escape sequences
# inside a normal string literal.
df = var_msg(df, pattern=r"[^a-zA-Z0-9\s\-]", lower=True, emoji=True)
df.sample()

Unnamed: 0,order_id,date,city,store_id,store,user_id,date_year,date_month,date_week,date_dayname,...,n_words_customer,clean_customer,found_customer,n_found_customer,n_msg_shopper,n_words_shopper,clean_shopper,found_shopper,n_found_shopper,found
2285,40807533,2021-08-30 16:42:49.958737,Ciudad de México,9,City Market,1764320,2021,2021-08,2021-35,Mon,...,74.0,esta despues del sams--el edificio es mirador ...,[],0,24,277,hola ya estoy en tu domicilio--marco antonio e...,[verde],1,True


### 

## Export

In [12]:
# Write the enriched frame next to the input files as tab-separated UTF-16 text.
export_path = BASE_DIR / f'found_{FILE_BASE_NAME}.csv'
df.to_csv(export_path, sep='\t', encoding='utf-16', index=False)