# Half orders

In [1]:
# Data location. The directory can be overridden with the HALF_ORDERS_BASE_DIR
# environment variable so the notebook is not tied to one machine; the original
# local path is kept as the default, so existing runs behave the same.
import os

BASE_DIR = os.environ.get('HALF_ORDERS_BASE_DIR',
                          '/Users/efraflores/Desktop/EF/Corner/Catalog/data')
FILE_NAME = 'op_custom_request.csv'

## Import

In [2]:
import sys

# Make the project's virtualenv packages importable from this kernel.
# Guarded so that re-running the cell does not keep appending duplicate entries.
_venv_site_packages = '/Users/efraflores/Desktop/hub/cornershop/venv/lib/python3.9/site-packages'
if _venv_site_packages not in sys.path:
    sys.path.append(_venv_site_packages)

In [3]:
import os
import pandas as pd

# Load the CSV found at BASE_DIR/FILE_NAME and use order_id as the index
df = pd.read_csv(os.path.join(BASE_DIR,FILE_NAME)).set_index('order_id')
# Quick sanity check: number of rows, then one random row
print(len(df))
display(df.sample())

208450


Unnamed: 0_level_0,product_name,custom_request
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36030402,Chayote sin espinas,


## Functions

### Timing and tone

In [4]:
import time
import numpy as np
from IPython.lib.display import Audio

# Reference timestamp; the last cell calls time_exp(time.time()-start) to report the elapsed time
start = time.time()
def time_exp(x):
    """Print a duration given in seconds as whole minutes plus the remaining seconds.

    Parameters
    ----------
    x : number
        Elapsed time in seconds.

    The message is printed (nothing is returned), with minutes shown without
    decimals and seconds with two decimals.
    """
    total_minutes = x/60
    whole_minutes = np.floor(total_minutes)
    # The fractional part of the minutes is converted back to seconds
    leftover_seconds = 60*(total_minutes-whole_minutes)
    print(f"{whole_minutes:.0f} minutos con {leftover_seconds:.2f} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    """Return an autoplaying IPython ``Audio`` object made of two summed sine waves.

    Parameters
    ----------
    a, b : number
        Multipliers of the two sine components; the signal is
        ``sin(a*t) + sin(b*t)`` with ``t`` running from 0 to ``play_time_seconds*pi``.
    play_time_seconds : number
        Length of the time axis; fractional values are accepted.
    framerate : int
        Sampling rate handed to ``Audio`` and used to size the sample grid.

    Returns
    -------
    IPython.lib.display.Audio
        Created with ``autoplay=True``.
    """
    # np.linspace needs an integer sample count; framerate*play_time_seconds is a
    # float whenever play_time_seconds is fractional (e.g. 0.5), so cast explicitly.
    n_samples = int(framerate*play_time_seconds)
    t = np.linspace(0, play_time_seconds, n_samples)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate=framerate, autoplay=True)

### Clean text

In [5]:
#Uncomment the following lines the first time you use these packages
'''
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
'''
import re
import unicodedata
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance; clean_text uses it when lemma=True
lem = WordNetLemmatizer()

def clean_text(text,
               language='english',pattern="[^a-zA-Z' ]",
               lower=False,lemma=False,rem_stopw=False,unique=False,
               add_stopw=[]):
    """Normalise a piece of text and return it as a single space-separated string.

    Steps, in order: cast to ``str``, turn newlines into spaces, strip accents
    (NFD decomposition + ASCII encoding), replace every match of ``pattern``
    with a space, optionally lower-case, split on whitespace, optionally
    lemmatize, optionally drop stopwords, optionally de-duplicate.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first (so ``None`` becomes ``'None'``).
    language : str
        Language passed to ``stopwords.words`` when ``rem_stopw`` is True.
    pattern : str
        Regular expression of characters to replace with a space.
    lower : bool
        Lower-case the text before splitting.
    lemma : bool
        Lemmatize every word as a verb with the module-level ``lem``.
    rem_stopw : bool
        Drop words whose lower-cased form is an NLTK stopword or is in ``add_stopw``.
    unique : bool
        Keep only the first occurrence of each word.
    add_stopw : list of str
        Extra stopwords. Only used when ``rem_stopw`` is True. Never mutated.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    # Newlines become spaces (not empty strings) so that words on adjacent lines
    # are not glued together into one token.
    raw = str(text).replace('\n',' ')
    # NFD splits accented letters into base letter + combining mark; the ASCII
    # encode with errors='ignore' then drops the marks and any non-ASCII symbol.
    ascii_text = unicodedata.normalize('NFD',raw).encode('ascii', 'ignore').decode('utf-8')
    ascii_text = re.sub(pattern,' ',ascii_text,flags=re.UNICODE)
    if lower:
        ascii_text = ascii_text.lower()
    words = ascii_text.split()
    if lemma:
        words = [lem.lemmatize(word,pos='v') for word in words]
    if rem_stopw:
        # Build the stopword set once; the corpus list used to be re-read and
        # re-concatenated for every single word.
        stop_set = set(stopwords.words(language)).union(add_stopw)
        words = [word for word in words if word.lower() not in stop_set]
    if unique:
        # dict.fromkeys de-duplicates keeping first-occurrence order, so the
        # result is reproducible across runs (set ordering is not).
        words = list(dict.fromkeys(words))
    return ' '.join(words)

#Examples of what each clean_text option does
ex = "I am going to run!!! I ran while I was running??? ..."
print('\nOriginal:\t\t',ex)
print('Basic cleaning:\t\t',clean_text(ex))
# Raw string: "\." is not a valid escape in a normal string literal
print('Changing the pattern:\t',clean_text(ex,pattern=r"[^a-zA-Z!\.]"))
print('Without stopwords:\t',clean_text(ex,rem_stopw=True))
print('Lower and lemma:\t',clean_text(ex,lower=True,lemma=True))
print('Super cleaning:\t\t',clean_text(ex,add_stopw=['go'],lower=True,rem_stopw=True,lemma=True,unique=True))
# Clean exactly the string that is displayed as "FROM" (they used to differ by one accent)
accent_ex = "ThÈ ÉfrâïsMã's?..."
print(f"\nIt actually corrects the weird accents, example\n\tFROM:\t {accent_ex}\n\tTO:\t",clean_text(accent_ex,lower=True))


Original:		 I am going to run!!! I ran while I was running??? ...
Basic cleaning:		 I am going to run I ran while I was running
Changing the pattern:	 I am going to run!!! I ran while I was running ...
Without stopwords:	 going run ran running
Lower and lemma:	 i be go to run i run while i be run
Super cleaning:		 run

It actually corrects the weird accents, example
	FROM:	 ThÈ ÉfrâïsMã's?...
	TO:	 the efraisma's


### Multiple sub

In [6]:
import re
def multisub(dict, text):
    """Apply several literal substitutions in one pass, then strip and lower-case.

    Parameters
    ----------
    dict : mapping of str -> str
        Each key is a literal substring to look for (it is regex-escaped) and
        its value is the replacement.
    text : str
        Text to transform.

    Returns
    -------
    str
        ``text`` with every key replaced by its value, stripped of surrounding
        whitespace and lower-cased.
    """
    mapping = dict  # the parameter name shadows the builtin; alias it locally
    if not mapping:
        # An empty alternation would match the empty string everywhere and the
        # lookup of '' would raise KeyError; nothing to substitute instead.
        return text.strip().lower()
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key must not pre-empt a longer key it is a prefix of.
    longest_first = sorted(mapping.keys(), key=len, reverse=True)
    regex = re.compile("|".join(map(re.escape, longest_first)))
    return regex.sub(lambda match: mapping[match.group(0)], text).strip().lower()

### Custom product

In [7]:
def found_pattern(data,text_col,pattern,request=False):
    """Flag rows whose text matches ``pattern`` and optionally extract the products asked for.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified.
    text_col : str
        Column holding the text. The part after its last ``_`` is used as the
        suffix of the new columns (``clean_custom_request`` -> ``request``).
    pattern : str
        Regular expression to look for. With ``request=True`` it must not
        contain capturing groups, because the extraction relies on
        ``findall`` returning the single group defined here.
    request : bool
        When True, also add ``prod_<suffix>``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``found_<suffix>`` (1 if ``pattern`` occurs in the
        text, else 0) and, if requested, ``prod_<suffix>``: for every
        ``<pattern> ... :: <product>_`` occurrence, the text captured between
        the colons and the underscore, joined with ``'_ '``.
    """
    out = data.copy()
    col = text_col.split('_')[-1]
    flag_re = re.compile(pattern)
    out[f'found_{col}'] = out[text_col].apply(lambda x: flag_re.search(str(x)) is not None)*1
    if request:
        # Raw string: the old literal relied on invalid escapes ("\w", "\s", ...)
        # and its "\b" was a backspace character inside the character classes,
        # not a word boundary. The classes are word chars, '+', '.' and whitespace.
        # The former "(?=\:+)" lookahead was redundant with the ":+" that follows it.
        product_re = re.compile(r'(?:' + pattern + r')[\w+.\s]*:+([\s\w+]*)(?=_)')
        out[f'prod_{col}'] = out[text_col].apply(lambda x: '_ '.join(product_re.findall(str(x))))
    return out

## Transform

### Union

In [8]:
# Keep an untouched copy of the full dataset; df itself is filtered and overwritten in the next cells
op = df.copy()

In [9]:
# Keep only the rows that have a custom request
df = df[df['custom_request'].notnull()].copy()

In [10]:
# Build "<request>::<product>_ " for every row that has a custom request.
# The frame is rebuilt from `op` (the untouched copy) instead of mutating `df` in
# place, so the cell can be re-run: the previous version dropped product_name
# and raised KeyError on a second execution.
df = (
    op.loc[op['custom_request'].notnull(), ['custom_request', 'product_name']]
      .assign(custom_request=lambda d: d['custom_request']+'::'+d['product_name']+'_ ')
      [['custom_request']]
      .copy()
)
df.sample()

Unnamed: 0_level_0,custom_request
order_id,Unnamed: 1_level_1
36784856,Jugosos y blandos::Limón Colima_


### Clean text

In [11]:
# Spanish filler words meant to be passed to clean_text as extra stopwords
omit = ['a','al','el','los','la','las','en','de','un',
        'una','uno','unos','y','con','sin','por','favor']
# Everything except letters, digits, '/', whitespace, ':', '_' and '.' is replaced.
# Raw string: "\/", "\s", "\:", "\_" and "\." are invalid escapes in a normal
# literal (a warning today, an error in future Python); the regex text is unchanged.
clean_pattern = r"[^a-zA-Z0-9\/\s\:\_\.]"

In [12]:
# Normalise the request text with clean_text: accents stripped, lower-cased, and every
# character matched by clean_pattern replaced with a space.
# NOTE(review): add_stopw=omit has no effect here. clean_text only reads add_stopw when
# rem_stopw=True, which is not passed, so the words in `omit` are kept in the text.
# Confirm whether stopword removal was intended (enabling it would also remove the
# stopwords of clean_text's default language, 'english').
df[f'clean_custom_request'] = df['custom_request'].apply(lambda text: clean_text(text,lower=True,
                                                                               add_stopw=omit,
                                                                               pattern=clean_pattern))
df.sample(4)

Unnamed: 0_level_0,custom_request,clean_custom_request
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36337918,Amarillos::Plátano Chiapas_,amarillos::platano chiapas_
36309451,1 pieza::Cebolla morada_,1 pieza::cebolla morada_
36561992,Medianos::Nopal_,medianos::nopal_
36180411,papas grandes::Papa blanca_,papas grandes::papa blanca_


### Found patterns

In [13]:
# Expressions treated as a request for a fraction of a product. Alternatives, in order:
#   \d+\/\d+                         a fraction such as 1/2 or 1/4
#   (?:250|500)\s*(?:gramos|gr)      250 or 500 followed by "gramos"/"gr"
#   \d+\.(?:5|25)\s*(?:kilo|kg)      a decimal ending in .5 or .25 followed by "kilo"/"kg"
#   medi[oa](?!n[ao])                "medio"/"media", but not when followed by "no"/"na"
#                                    (so "mediano"/"mediana" do not match)
#   cuarto | mitad                   the literal words
product_pattern = r'\d+\/\d+|(?:250|500)\s*(?:gramos|gr)|\d+\.(?:5|25)\s*(?:kilo|kg)|medi[oa](?!n[ao])|cuarto|mitad'

In [14]:
# Flag the cleaned requests that match product_pattern (found_request) and extract the
# product text that follows each match (prod_request)
pre = found_pattern(df,'clean_custom_request',product_pattern,request=True)
# Mean of the 0/1 flag = share of requests that match the pattern
pre[['found_request']].mean()

found_request    0.077441
dtype: float64

In [15]:
# Keep only the flagged rows, with the original request text and the extracted products
prod = pre[pre['found_request']==1][['custom_request','prod_request']].copy()
prod.sample(4)

Unnamed: 0_level_0,custom_request,prod_request
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36683373,"Una pieza, la más pequeña y si hay muy grande ...",pina gota miel
36320311,De preferencia una pieza pequeña 1/4 o 1/2 de ...,col blanca
36332693,un cuarto de sandia::Sandía blanca rayada_,sandia blanca rayada
36810379,1/4 de sandía::Sandía blanca rayada_,sandia blanca rayada


In [16]:
# Hand-written requests (one or several "<request>::<product>_" segments) used to
# check the extraction regex
ejemplo = ['mitad de una sandía acabada de cortar que este para mañana::Sandía blanca rayada_',
 '1/2::Papaya maradol_ 1/2::Melon chino_ ',
 '2 kg inmaduros::Jitomate saladet_ 500 gr::Limón Colima_ 1 kg::Chayote sin espinas_ ',
 'que no esté muy verde::Plátano Chiapas_ una mitad o un cuarto que no se vea bofa o porosa::Sandía blanca rayada_ que no esté golpeada::Manzana roja mediana_']

# Mirrors the extraction regex built in found_pattern. Written as a raw f-string: the
# previous normal literal relied on invalid escapes ("\w", "\s", "\:", ...) and its "\b"
# was a backspace character inside the character classes, not a word boundary.
[re.findall(rf'(?:{product_pattern})[\w+.\s]*:+([\s\w+]*)(?=_)',x) for x in ejemplo]

[['Sandía blanca rayada'],
 ['Papaya maradol', 'Melon chino'],
 ['Limón Colima'],
 ['Sandía blanca rayada']]

In [17]:
# One row per extracted product: split the '_'-joined products into columns, melt them
# into a single 'value' column and drop the padding NaNs
total = prod['prod_request'].str.split('_',expand=True).melt(ignore_index=False).dropna().iloc[:,1:]
# Keep only the first word of each product name
total['value'] = total['value'].str.split().str[0]
# Name the counts column explicitly: value_counts().to_frame() yields a column called
# 'value' on pandas < 2 but 'count' on pandas >= 2, and later cells read top['value'].
top = total['value'].value_counts().rename('value').rename_axis(None).to_frame().head(15)
top

Unnamed: 0,value
papaya,313
sandia,196
melon,136
platano,70
col,69
aguacate,68
pina,44
jitomate,24
mango,10
limon,9


In [18]:
# (rows with a custom request, rows in `total`, ratio between the two)
len(df),len(total),len(total)/len(df)

(13081, 1013, 0.0774405626481156)

### From total

In [19]:
# Peek at one random row of the full (unfiltered) dataset
op.sample()

Unnamed: 0_level_0,product_name,custom_request
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36223880,Calabaza italiana,


In [20]:
# First word of every product name in the full dataset, passed through clean_text
# (default options) and lower-cased, so it can be compared with the words in `top`
cat = op['product_name'].str.split().str[0].apply(clean_text).str.lower().to_frame()
cat.sample(3)

Unnamed: 0_level_0,product_name
order_id,Unnamed: 1_level_1
36542227,limon
36739939,cebolla
36176960,bits


In [21]:
# Total rows per product word, joined onto the request counts in `top` (matched on the
# index). The counts are named explicitly because value_counts().to_frame() yields a
# 'product_name' column on pandas < 2 but 'count' on pandas >= 2.
compara = top.join(cat['product_name'].value_counts().rename('product_name').rename_axis(None).to_frame())
# Share of rows of each product that carry a matching request (1e-10 guards against a zero denominator)
compara['perc'] = compara['value']/(compara['product_name']+1e-10)

## End

In [22]:
# Final table, highest share first. ascending is passed by keyword: the old
# sort_values('perc',0,0) passed axis and ascending positionally, which pandas >= 2 rejects.
compara.rename(columns={'value':'solicitud','product_name':'total_orders'}).sort_values('perc',ascending=False).style.format(formatter={'perc': "{:.1%}",})

Unnamed: 0,solicitud,total_orders,perc
sandia,196,765,25.6%
col,69,420,16.4%
papaya,313,3735,8.4%
melon,136,1996,6.8%
pina,44,3017,1.5%
aguacate,68,10197,0.7%
platano,70,12154,0.6%
ejote,4,1401,0.3%
mango,10,5422,0.2%
jitomate,24,13480,0.2%


In [23]:
# Report the time elapsed since `start` was set, then play a tone to signal the end of the run
time_exp(time.time()-start)
tono()

0 minutos con 4.20 segundos
