# Supply cleaning

In [1]:
import os

# Directory that holds the input CSV. Set the SUPPLY_CLEANING_DATA_DIR environment
# variable to point the notebook at another location; the fallback is the original
# hard-coded local path, so existing runs keep working unchanged.
BASE_DIR = os.environ.get(
    'SUPPLY_CLEANING_DATA_DIR',
    '/Users/efraflores/Desktop/EF/Corner/Catalog/Supply_cleaning/data',
)
# Name of the CSV file (inside BASE_DIR) that the loading cell reads.
FILE_NAME = 'supply_test.csv'

In [2]:
# Regex that finds "<number><optional spaces><unit>" in product text.
#   number: digits with at most ONE decimal point ("5", "5.", "4.54"), so that every
#           captured value can be parsed by float(); sequences such as "1.2.3" are
#           no longer captured as a single number.
#   unit:   one of the Spanish/abbreviated unit spellings below, optionally followed
#           by a plural "s", and terminated by whitespace, "_", ".", "/" or the end
#           of the string.
nu_pattern = r"(?P<number>\d+(?:\.\d*)?)\s*(?P<unit>litro|lt*|mlt*|kilo(?:gramo*)*|kg|g(?:ramo*)*|miligramo*|mg|pieza|pz|onza|oz|libra|lb)s*(?:\s|_|\.|\/|$)"

# Multiplier applied to the captured number for each unit spelling. The spellings
# with factor 1 (litro, kilo, pieza and their abbreviations) act as base units;
# the onza/libra factors are numerically kilograms per ounce / per pound.
conversion = {
    'litro': 1,
    'l': 1,
    'lt': 1,
    'ml': 0.001,
    'mlt': 0.001,
    'kilo': 1,
    'kilogram': 1,
    'kilogramo': 1,
    'kg': 1,
    'g': 0.001,
    'gram': 0.001,
    'gramo': 0.001,
    'miligram': 0.000001,
    'miligramo': 0.000001,
    'mg': 0.000001,
    'pieza': 1,
    'pz': 1,
    'onza': 0.0283495,
    'oz': 0.0283495,
    'libra': 0.453592,
    'lb': 0.453592,
}
# Kept as separate lists for any cell that still refers to them; both are derived
# from the dict so a unit and its factor can no longer drift out of alignment.
units = list(conversion)
factors = list(conversion.values())
conversion

{'litro': 1,
 'l': 1,
 'lt': 1,
 'ml': 0.001,
 'mlt': 0.001,
 'kilo': 1,
 'kilogram': 1,
 'kilogramo': 1,
 'kg': 1,
 'g': 0.001,
 'gram': 0.001,
 'gramo': 0.001,
 'miligram': 1e-06,
 'miligramo': 1e-06,
 'mg': 1e-06,
 'pieza': 1,
 'pz': 1,
 'onza': 0.0283495,
 'oz': 0.0283495,
 'libra': 0.453592,
 'lb': 0.453592}

## Import

In [3]:
import sys
# Adds a specific virtualenv's site-packages directory to the module search path.
# NOTE(review): the absolute path is tied to one machine; presumably the kernel
# could instead be started from that environment — confirm before sharing.
sys.path.append('/Users/efraflores/Desktop/hub/cornershop/venv/lib/python3.9/site-packages')

In [4]:
import os
import pandas as pd

# Read the CSV from BASE_DIR/FILE_NAME and index the rows by supply_product_id.
df = pd.read_csv(os.path.join(BASE_DIR,FILE_NAME)).set_index('supply_product_id')
# Keep a copy of the frame exactly as loaded, before any transformation.
backup = df.copy()
# Quick sanity check: number of rows and one randomly sampled row.
print(len(df))
display(df.sample())

1672


Unnamed: 0_level_0,catalog_product_id,name,img_url,category_id,buy_unit,barcode,package,description,brand,unit_conversion_rate,weight,sku,stock,row_num
supply_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
18492371,,Music Box Round Box Llama,http://imagenes.sanborns.com.mx/240/2003282735...,102,UN,2003282735293,,#N/D,Zhejiang Sunworld Trade Co. Ltd,,,8273529,28,1


## Functions

### Timing and tone

In [5]:
import time
import numpy as np
from IPython.lib.display import Audio

# Reference timestamp taken when this cell runs; used to report elapsed time later.
start = time.time()
def time_exp(x):
    """Print a duration given in seconds as whole minutes plus leftover seconds.

    x: elapsed time in seconds.
    Prints the message; returns None.
    """
    elapsed_minutes = x/60
    minutes = np.floor(elapsed_minutes)
    # Fractional part of the minute count, converted back to seconds
    seconds = 60*(elapsed_minutes-minutes)
    print(f"{minutes:.0f} minutos con {seconds:.2f} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    """Build an auto-playing two-tone sound, e.g. to signal that a run finished.

    a, b: multipliers of the time axis inside the two sine waves. Because the time
        axis is scaled by pi, each wave has a frequency of roughly a/2 and b/2 Hz.
    play_time_seconds: duration of the sound; fractional values are accepted.
    framerate: samples per second, also passed to Audio as the playback rate.

    Returns an IPython Audio object with autoplay enabled.
    """
    # np.linspace requires an integer sample count; truncate so that a fractional
    # duration (e.g. 0.5 s) does not raise. Integer inputs are unaffected.
    n_samples = int(framerate*play_time_seconds)
    t = np.linspace(0, play_time_seconds, n_samples)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate=framerate, autoplay=True)

### Strip and capitalize

In [6]:
import re

def strip_capitalize(text):
    """Normalise the spacing and casing of a piece of text.

    The value is converted with str(), trimmed, lower-cased except for its first
    character, runs of whitespace are collapsed to one space, and the first letter
    after every dot is upper-cased (including accented letters such as "á" or "ñ").

    text: any value; non-strings are converted with str() first.
    Returns the cleaned string.
    """
    # Trim both ends; capitalize() upper-cases the first character and lower-cases the rest
    clean = str(text).strip().capitalize()
    # Collapse any run of two or more whitespace characters into a single space
    clean = re.sub(r'\s{2,}',' ',clean)
    # Upper-case the letter after a dot. [^\W\d_] matches any Unicode letter, so
    # accented letters are handled too (a plain [a-z] class would skip them)
    clean = re.sub(r'\.\s*([^\W\d_])',lambda word: word.group(0).upper(),clean)
    return clean

In [7]:
# Sample strings with stray spaces, mixed casing and dots to exercise strip_capitalize.
test = [' Esta es  Una. prueba','esta tambiÉn   .es.  una Prueba ']
# Display the cleaned version of each sample.
[strip_capitalize(x) for x in test]

['Esta es una. Prueba', 'Esta también .Es. Una prueba']

### Preprocessing

In [8]:
def preproc(data, package_col='package', cols=['name','description']):
    """Select the rows that lack a package value and clean their text columns.

    data: input DataFrame; it is not modified.
    package_col: column whose null entries mark the rows to keep.
    cols: text columns to keep and to clean with strip_capitalize.

    Returns a new DataFrame containing only `cols` for the selected rows.
    """
    missing_package = data[package_col].isnull()
    cleaned = data.loc[missing_package, cols].copy()
    for column in cols:
        cleaned[column] = cleaned[column].apply(strip_capitalize)
    return cleaned

### Package

In [9]:
def quantity(data, pattern=nu_pattern, cols=['name','description']):
    """Extract the first "<number> <unit>" mention from the text columns of each row.

    data: input DataFrame; it is not modified.
    pattern: regex (string or compiled) whose named groups become the new columns;
        the default provides the groups `number` and `unit`.
    cols: columns whose values are joined, in order, into the text that is searched.

    Returns a copy of `data` joined with one column per named group. Rows with no
    match get number=1 and unit='pz'. Matched numbers are returned as the captured
    strings, not converted to numeric values.
    """
    df = data.copy()
    regex = re.compile(pattern)
    default = {'number':1,'unit':'pz'}
    # Build the searched text per row without adding a temporary column, so an
    # input column that happens to be called 'full' is left untouched.
    texts = ['_ '.join(map(str, row)) for row in df[cols].itertuples(index=False, name=None)]
    records = []
    for text in texts:
        # Search once per row and reuse the match object
        found = regex.search(text)
        records.append(found.groupdict() if found is not None else dict(default))
    if records:
        qty = pd.DataFrame(records, index=df.index)
    else:
        # Empty input: still expose the default columns so later steps find them
        qty = pd.DataFrame(columns=list(default), index=df.index)
    return df.join(qty)

### Weight

In [10]:
def weight(data, number_col='number', unit_col='unit', conv_dict=conversion):
    """Add a `weight` column equal to the quantity times its unit's conversion factor.

    data: input DataFrame; it is not modified.
    number_col: column with the quantity; it is cast to float in the result.
    unit_col: column with the unit label, looked up in `conv_dict`.
    conv_dict: mapping from unit label to multiplier.

    Returns a copy of `data` with `number_col` as float and a new `weight` column.
    """
    result = data.copy()
    amounts = result[number_col].astype(float)
    result[number_col] = amounts
    unit_factors = result[unit_col].map(conv_dict)
    result['weight'] = amounts*unit_factors
    return result

### Brand

## Transform

- Remove blank spaces at the beginning and end (like a trim). ***Resp: Lau***
- Proper lower/upper cases. Upper case only at the beginning or after a dot. ***Resp: Lau***
- Extract package from name. ***Resp: Efra***
- Estimate weight from package. ***Resp: Efra***
- Extract brand from name if brand field is null
- Remove redundant information in name such as brand (when received also in brand field)
- Flag images with broken links. ***Resp: Matías***
- Flag SKUs sharing the same barcode
- Input unit and unit size from package (applies more to Colombia)
- Automatically suggest search terms (** desirable)
- Detect duplicated products by their attributes, not only by barcode

### Preprocessing

In [11]:
# Start from the untouched `backup` copy instead of `df` itself, so this cell can be
# re-run after `df` has been overwritten by the later steps (which drop `package`).
df = preproc(backup)
df.sample(4)

Unnamed: 0_level_0,name,description
supply_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
18491724,Tall words stacking tea set of,#n/d
26107559,Solar 0hc7079 90056g58 mml4c,Luminosa y radiante forma piloto; con un armaz...
26107208,Solar mj h789-16m,Los lentes maui jim con tecnologia polarized p...
18488863,Globo jumbo verde bosque 1m,Globo verde bosque jumbo de 1m para adornar pa...


### Quantity

In [12]:
# Add the `number` and `unit` columns extracted from name/description.
# NOTE(review): running this cell twice in a row is expected to fail, because
# quantity() joins columns that would already exist — re-run preprocessing first.
df = quantity(df)
df.sample(3)

Unnamed: 0_level_0,name,description,number,unit
supply_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35635602,Marco digital voltak dhq 8 maderach,No dejes de recordar los bellos momentos de tu...,1,pz
35634665,Audifonos freelace pro negro,#n/d,1,pz
35635354,Navidad portarretrato cat ornament,Ilumina la navidad con este bello ornamento qu...,1,pz


### Weight

In [13]:
# Add the `weight` column: number multiplied by the factor of its unit.
df = weight(df)
df.sample(3)

Unnamed: 0_level_0,name,description,number,unit,weight
supply_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26107655,Solar 0hc7112 90011156 m,Forma irregular que brinda el espíritu fresco ...,1.0,pz,1.0
26104348,Anfora 5 onzas azul marino,"Ánfora de acero inoxidable azul marino, ideal ...",5.0,onza,0.141747
35635876,Flor artificial orquidea blanca 5 f,"Ramo de 5 orquídeas artificiales color blanco,...",1.0,pz,1.0


In [14]:
# Show the first rows whose unit is anything other than 'pz'.
df[df['unit']!='pz'].head()

Unnamed: 0_level_0,name,description,number,unit,weight
supply_product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18487991,Carbon para hooka tubo c/10 piezas,"Carbón para hooka , tubo con 10 carbones cada ...",10.0,pieza,10.0
18491488,Termo acero inoxidable color gris,Termo de acero inoxidable ideal para tus bebid...,600.0,ml,0.6
18491494,Termo acero inoxidable color negro,"Termo de acero inoxidable, ideal para tus bebi...",600.0,ml,0.6
18491632,Mickey mouse tea for one red,Acompaña tu tarde de té con este set de tea fo...,470.0,ml,0.47
18491658,Laurel burch mug yellow,Agrega un toque de estilo a tu mesa con este m...,430.0,ml,0.43


### Brand

### Test

In [15]:
# Sample product texts covering several unit spellings, plurals, decimals,
# units glued to the number and different terminators ("_", "/", ".", end of text).
test = ['Piñas s&w en trozos 234 g_ 7503021632166',
        'Granola dulcerel dorada 1 kilos_ 7501485800084',
        'Cerveza brewdog punk ipa 2 piezas 330 ml',
        'Tortilla ochoa harinas trigo 12 pz',
        'Janumet t 56 50mg/1000mg',
        'Concentrado platano cubeta 4.54 kgs',
        'Semillas 250gramos',
        'Vino blanco cono sur chardonnay chi750ml',
        'Botella disney 01g9656 minnie 10 oz_ 71463096565',
        'Pintura home line clest 3.8 lt vinil_ 75012536',
        'Rosca brioche queso crema 1.12 kg pz',
        'Botella 2.2litros y 1litro',
        'Lavadora daewoo 14kg. Semiautomática dmw',
        'Bucket 11.4lb',
        'Caja de 30 libras']

## End

In [16]:
# Apply the pattern directly to every sample string and convert the result to weights.
# Every string must match: re.search returns None otherwise and .groupdict() would fail.
weight(pd.DataFrame([re.search(nu_pattern,x).groupdict() for x in test]))

Unnamed: 0,number,unit,weight
0,234.0,g,0.234
1,1.0,kilo,1.0
2,2.0,pieza,2.0
3,12.0,pz,12.0
4,50.0,mg,5e-05
5,4.54,kg,4.54
6,250.0,gramo,0.25
7,750.0,ml,0.75
8,10.0,oz,0.283495
9,3.8,lt,3.8


In [17]:
# Print the time elapsed since `start` was set, then play the completion tone.
time_exp(time.time()-start)
tono()

0 minutos con 0.26 segundos
