# Half orders

In [304]:
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Catalog/data'
FILE_NAME = 'text_by_order.csv'

## Import

In [305]:
import sys
sys.path.append('/Users/efraflores/Desktop/hub/cornershop/venv/lib/python3.9/site-packages')

In [306]:
import os
import pandas as pd

df = pd.read_csv(os.path.join(BASE_DIR,FILE_NAME)).set_index('order_id')
print(len(df))
display(df.sample())

18253


Unnamed: 0_level_0,custom_product,custom_request
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36126284,cuernito_ pan oreja,


## Functions

### Timing and tone

In [307]:
import time
import numpy as np
from IPython.lib.display import Audio

start = time.time()
def time_exp(x):
    minutes, seconds = np.floor(x/60), 60*(x/60-np.floor(x/60))
    print(f"{'{:.0f}'.format(minutes)} minutos con {'{:.2f}'.format(seconds)} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate=framerate, autoplay=True)

### Clean text

In [308]:
#Uncomment the following lines if it's the first time you run this packages
'''
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
'''
import re
import unicodedata
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

def clean_text(text,
               language='english',pattern="[^a-zA-Z' ]",
               lower=False,lemma=False,rem_stopw=False,unique=False,
               add_stopw=[]):
    #It clean and can remove stopwords or even lemmatize words if specified in params
    cleaned_text = unicodedata.normalize('NFD',str(text).replace('\n','')).encode('ascii', 'ignore')
    cleaned_text = re.sub(pattern,' ',cleaned_text.decode('utf-8'),flags=re.UNICODE)
    cleaned_text = [(lem.lemmatize(word,pos='v') if lemma else word) for word in 
                    (cleaned_text.lower().split() if lower else cleaned_text.split())]
    if rem_stopw: cleaned_text = [word for word in cleaned_text if word.lower() not in 
                                  stopwords.words(language)+add_stopw]
    return ' '.join((set(cleaned_text) if unique else cleaned_text))

#Ex
ex = "I am going to run!!! I ran while I was running??? ..."
print('\nOriginal:\t\t',ex)
print('Basic cleaning:\t\t',clean_text(ex))
print('Changing the pattern:\t',clean_text(ex,pattern="[^a-zA-Z!\.]"))
print('Without stopwords:\t',clean_text(ex,rem_stopw=True))
print('Lower and lemma:\t',clean_text(ex,lower=True,lemma=True))
print('Super cleaning:\t\t',clean_text(ex,add_stopw=['go'],lower=True,rem_stopw=True,lemma=True,unique=True))
print("\nIt actually corrects the weird accents, example\n\tFROM:\t ThÈ ÉfrâïsMã's?...\n\tTO:\t",clean_text("ThÈ ÉfrâïsMa's?...",lower=True))


Original:		 I am going to run!!! I ran while I was running??? ...
Basic cleaning:		 I am going to run I ran while I was running
Changing the pattern:	 I am going to run!!! I ran while I was running ...
Without stopwords:	 going run ran running
Lower and lemma:	 i be go to run i run while i be run
Super cleaning:		 run

It actually corrects the weird accents, example
	FROM:	 ThÈ ÉfrâïsMã's?...
	TO:	 the efraisma's


### Multiple sub

In [309]:
import re
def multisub(dict, text):
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
  return regex.sub(lambda x: dict[x.string[x.start():x.end()]], text).strip().lower()

### Custom product

In [310]:
def found_pattern(data,text_col,pattern,request=False):
    df = data.copy()
    col = text_col.split('_')[-1]
    df[f'found_{col}'] = df[text_col].apply(lambda x: len(re.findall(pattern,str(x)))>0)*1
    if request:
        pattern = '(?:'+pattern+')[\b\w+\b\.\s]*(?=\:+)\:+([\s\b\w+\b]*)(?=\_)'
        df[f'prod_{col}'] = df[text_col].apply(lambda x: '_ '.join(re.findall(pattern,str(x))))
    return df

## Transform

### Clean text

In [311]:
omit = ['a','al','el','los','la','las','en','de','un',
        'una','uno','unos','y','con','sin','por','favor']
clean_pattern = "[^a-zA-Z0-9\/\s\:\_\.]"
for col in df.columns:
    df[f'clean_{col}'] = df[col].apply(lambda text: clean_text(text,lower=True,
                                                               add_stopw=omit,
                                                               pattern=clean_pattern)+'_ ')
df.sample(4)

Unnamed: 0_level_0,custom_product,custom_request,clean_custom_product,clean_custom_request
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35577292,cinta de empaque uso general Scotch,,cinta de empaque uso general scotch_,nan_
36621032,Don Pedro Reserva especial,amarillos sin manchas negras::Plátano Chiapas_...,don pedro reserva especial_,amarillos sin manchas negras::platano chiapas_...
36762887,chips ahoy galletas,,chips ahoy galletas_,nan_
36264409,coliflor Blanca,,coliflor blanca_,nan_


### Found patterns

In [312]:
product_pattern = r'\d+\/\d+|(?:250|500)\s*(?:gramos|gr)|\d+\.(?:5|25)\s*(?:kilo|kg)|medi[oa](?!n[ao])|cuarto|mitad'

In [313]:
pre = found_pattern(df,'clean_custom_product',product_pattern)
pre = found_pattern(pre,'clean_custom_request',product_pattern,request=True)
pre[['found_product','found_request']].mean()

found_product    0.017367
found_request    0.070673
dtype: float64

In [323]:
prod = pre[pre['found_request']==1][['custom_request','clean_custom_request','prod_request']].copy()
prod.sample(4)

Unnamed: 0_level_0,custom_request,clean_custom_request,prod_request
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
35496566,Por favor que las uvas sean sin semilla y que ...,por favor que las uvas sean sin semilla y que ...,apio
36234440,Solo 1/4kg::Chile serrano_ 1 Kg por favor::Tom...,solo 1/4kg::chile serrano_ 1 kg por favor::tom...,chile serrano
35808806,1 kg::Plátano Chiapas_ 1kg inmaduros::Aguacate...,1 kg::platano chiapas_ 1kg inmaduros::aguacate...,limon colima
35907581,1/2 melón::Melón chino_ 1/4 de sandía::Sandía ...,1/2 melon::melon chino_ 1/4 de sandia::sandia ...,melon chino_ sandia blanca rayada


In [326]:
prod.loc[35907581,'custom_request']

'1/2 melón::Melón chino_ 1/4 de sandía::Sandía blanca rayada'

In [325]:
ejemplo = ['mitad de una sandía acabada de cortar que este para mañana::Sandía blanca rayada_',
 '1/2::Papaya maradol_ 1/2::Melón chino',
 '2 kg inmaduros::Jitomate saladet_ 500 gr::Limón Colima_ 1 kg::Chayote sin espinas_ ',
 'que no esté muy verde::Plátano Chiapas_ una mitad o un cuarto que no se vea bofa o porosa::Sandía blanca rayada_ que no esté golpeada::Manzana roja mediana_']

[re.findall(f'(?:{product_pattern})[\b\w+\b\.\s]*(?=\:+)\:+([\s\b\w+\b]*)(?=\_)',x) for x in ejemplo]

[['Sandía blanca rayada'],
 ['Papaya maradol'],
 ['Limón Colima'],
 ['Sandía blanca rayada']]

In [316]:
prod['prod_request'].value_counts().to_frame().head(22)

Unnamed: 0,prod_request
papaya maradol,360
sandia blanca rayada,239
melon chino,107
aguacate hass,79
platano chiapas,67
col blanca,65
pina gota miel,33
col morada,20
jitomate saladet,18
melon chino_ papaya maradol,17


In [317]:
aux[aux['prod_request']=='']['clean_custom_request'].tolist()[3]

'que se vea fresca::lechuga escarola_ de preferencia una mitad para que pueda checar que esta buena. gracias::sandia blanca rayada_ verdes que aun no esten maduros ::aguacate hass_ mediana que esten sin manchas::calabaza italiana_ jugosas::naranja_ '

## End

In [318]:
time_exp(time.time()-start)
tono()

0 minutos con 0.59 segundos
