In [1]:
import sys
sys.path.append('/Users/efraflores/Desktop/hub/cornershop/venv/lib/python3.9/site-packages')

# Supply cleaning

### Parameters

In [2]:
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Catalog/Supply_cleaning/data'
FILE_NAME = 'supply_test.csv'

In [3]:
nu_pattern = r"(?P<number>\d+[\.\d]*)\s*(?P<buy_unit>litro|lt*|mlt*|kilo(?:gramo*)*|kg|g(?:ramo*)*|gr|miligramo*|mg|pieza|pza*|onza|oz|libra|lb|m(?:etro)*|meter|mt|cent[íi]metro|centimeter|cm)s*(?:\s|_|\.|\/|$)"
unit_dict = {'lt': 'LT','litro': 'LT','l': 'LT',
             'mlt': 'MLT','ml': 'MLT',
             'kg': 'KG','kilo': 'KG','kilogram': 'KG','kilogramo': 'KG',
             'gr': 'GR','g': 'GR','gram': 'GR','gramo': 'GR',
             'mg': 'MG','miligram': 'MG','miligramo': 'MG',
             'pz': 'PZ','pieza': 'PZ','pza': 'PZ',
             'oz': 'OZ','onza': 'OZ',
             'lb': 'LB','libra': 'LB',
             'mt': 'MT','m': 'MT','metro': 'MT',
             'cm': 'CM','centimeter':'CM','centimetro':'CM','centímetro': 'CM'}
units = sorted(list(set(unit_dict.values())))
units

['CM', 'GR', 'KG', 'LB', 'LT', 'MG', 'MLT', 'MT', 'OZ', 'PZ']

In [4]:
factors = [0.001,0.001,1,0.453592,1,0.000001,0.001,1,0.0283495,1]
conversion = dict(zip(units,factors))
conversion

{'CM': 0.001,
 'GR': 0.001,
 'KG': 1,
 'LB': 0.453592,
 'LT': 1,
 'MG': 1e-06,
 'MLT': 0.001,
 'MT': 1,
 'OZ': 0.0283495,
 'PZ': 1}

## Functions

### Timing and tone

In [5]:
import time
import numpy as np
from IPython.lib.display import Audio

start = time.time()
def time_exp(x):
    minutes, seconds = np.floor(x/60), 60*(x/60-np.floor(x/60))
    print(f"{'{:.0f}'.format(minutes)} minutos con {'{:.2f}'.format(seconds)} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate=framerate, autoplay=True)

### Strip and capitalize

In [6]:
import re

def strip_capitalize(text):
    clean = str(text).strip().capitalize()
    clean = re.sub(r'\s{2,}',' ',clean)
    clean = re.sub(r'\.\s*([a-z])',lambda word: word.group(0).upper(),clean)
    clean = re.sub(r'^nan$','',clean)
    return clean

In [7]:
test = [' Esta es  Una. prueba','esta tambiÉn   .es.  una Prueba ']
[strip_capitalize(x) for x in test]

['Esta es una. Prueba', 'Esta también .Es. Una prueba']

### Preprocessing

In [8]:
def preproc(data, cols=['package','name','description']):
    df = data.copy()
    for col in cols:
        df[col] = df[col].astype(str).apply(strip_capitalize)
    return df

### Quantity

In [9]:
def quantity(data, pattern=nu_pattern, cols=['package','name','description']):
    df = data.copy()
    df['full'] = df[cols].apply(lambda x:'_ '.join(x.astype(str)),axis=1)
    df['full'] = df['full'].apply(lambda x: re.sub(r'(\d)$',r'\1 pz',x))
    qty = pd.DataFrame([re.search(pattern,x).groupdict() if re.search(pattern,x)!=None
                        else {'number':1,'buy_unit':'UN'} for x in df['full']],
                       index=df.index)
    df = df.drop(columns=['buy_unit']).join(qty).drop(columns=['full'])
    return df

### Weight

In [10]:
def weight(data, number_col='number', unit_col='buy_unit', unit_conv=unit_dict, conv_dict=conversion):
    df = data.copy()
    df[number_col] = df[number_col].astype(float)
    df[unit_col] = df[unit_col].map(unit_conv)
    df['package'] = (df[number_col].astype(str)
                     + ' ' + 
                     df[unit_col].astype(str).str.lower())
    df['package'] = df['package'].apply(lambda x: re.sub(r'\.0\s',' ',x))
    df['weight'] = df[number_col]*df[unit_col].map(conv_dict)
    mili_dict = {'MG':'GR','MLT':'LT','CM':'MT'}
    aux = df[df[unit_col].isin(mili_dict.keys())]
    df = df[~(df[unit_col].isin(mili_dict.keys()))].copy()
    for prod in aux.index:
        div_unit = 100 if aux.loc[prod,unit_col] == 'CM' else 1000
        aux.loc[prod,number_col] = aux.loc[prod,number_col]/div_unit
    aux[unit_col] = aux[[unit_col]].replace(mili_dict)
    df = df.replace({'PZ':'UN'}).append(aux)
    return df

In [11]:
test = ['Piñas s&w en trozos 234 g',
        '1 kilo de granola dulcerel dorada',
        'Cerveza brewdog punk ipa 2 piezas',
        'Tortilla ochoa harinas trigo 12 pz',
        'Janumet de 50mg t 56',
        'Concentrado platano cubeta 4.54 onzas',
        'Semillas 250gramos',
        '2 metros de tela',
        'libreta 13cm hoja rayada',
        'cortinas 3m color blanco',
        'Vino blanco cono sur chardonnay chi750ml',
        'Botella disney 10 oz 01g9656 minnie',
        'Pintura home line clest 3.8 lt vinil',
        '1.12 kg rosca brioche queso crema',
        'Botella 2.2litros disney',
        'Bucket 11.4lb',
        'Caja de 30 libras']

import pandas as pd
aux = weight(pd.DataFrame([re.search(nu_pattern,x).groupdict() for x in test]))
aux = pd.DataFrame(test,columns=['text']).join(aux)
aux.sort_values('buy_unit')

Unnamed: 0,text,number,buy_unit,package,weight
0,Piñas s&w en trozos 234 g,234.0,GR,234 gr,0.234
4,Janumet de 50mg t 56,0.05,GR,50 mg,5e-05
6,Semillas 250gramos,250.0,GR,250 gr,0.25
1,1 kilo de granola dulcerel dorada,1.0,KG,1 kg,1.0
13,1.12 kg rosca brioche queso crema,1.12,KG,1.12 kg,1.12
15,Bucket 11.4lb,11.4,LB,11.4 lb,5.170949
16,Caja de 30 libras,30.0,LB,30 lb,13.60776
10,Vino blanco cono sur chardonnay chi750ml,0.75,LT,750 mlt,0.75
12,Pintura home line clest 3.8 lt vinil,3.8,LT,3.8 lt,3.8
14,Botella 2.2litros disney,2.2,LT,2.2 lt,2.2


### Pipeline

In [12]:
import os
import pandas as pd

def full_pipeline(base_dir,file_name,id_col='supply_product_id', package_col='package'):
    df = pd.read_csv(os.path.join(base_dir,file_name)).set_index(id_col)
    df = weight(quantity(preproc(df)))
    return df

### Brand

## Transform

- Remove blank spaces at the beginning and end (like a trim). ***Resp: Lau***
- Proper lower/upper cases. Upper case only at the beginning or after a dot. ***Resp: Lau***
- Extract package from name. ***Resp: Efra***
- Estimate weight from package. ***Resp: Efra***
- Extract brand from name if brand field is null
- Remove redundant information in name such as brand (when received also in brand field)
- Flag images with broken link: ***Resp: Matías***
- Flag SKUs sharing the same barcode
- Input unit and unit size from package (applies more to Colombia)
- Automatically suggest search terms (** desirable)
- Duplicated products not by barcode, but by their attributes

In [13]:
df = full_pipeline(BASE_DIR,FILE_NAME)
df['buy_unit'].value_counts(1)

MT    0.462642
UN    0.320388
GR    0.115661
LT    0.089911
OZ    0.010131
KG    0.001266
Name: buy_unit, dtype: float64

## End

In [15]:
time_exp(time.time()-start)
tono()

0 minutos con 1.16 segundos
