# Limpiar barcodes

## Parámetros

In [1]:
# Directory that contains the input workbook; the cleaned export is written here too.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Catalog/data'
# File name (inside BASE_DIR) of the Excel workbook to process.
FILE_NAME = 'Barcodes Catalog MX.xlsx'

## Código

In [2]:
from numpy import nan
from pathlib import Path
from re import sub, UNICODE
from unicodedata import normalize
from pandas import ExcelFile

# Wrap the directory string in a Path so joinpath() can be used to build file paths.
BASE_DIR = Path(BASE_DIR)

def clean_text(text: str, pattern: str=r"[^a-zA-Z0-9\s,]", lower: bool=False) -> 'str | float':
    '''
    Clean a text value.

    Steps, in order: convert ``text`` to ``str``, strip accents and any other
    non-ASCII character, replace every match of ``pattern`` with a space,
    trim the ends, collapse runs of whitespace into one space and optionally
    lower-case the result.

    Args:
        text: Value to clean; it is passed through ``str()`` first, so
            non-string inputs are accepted.
        pattern: Regular expression whose matches are replaced by a space.
            The default keeps only ASCII letters, digits, whitespace and commas.
        lower: When true, the cleaned text is converted to lower case.

    Returns:
        The cleaned string, or ``nan`` when the cleaned text is empty or is
        the literal ``'nan'`` (i.e. the record was empty).
    '''
    # Pad line breaks with spaces so the words around them stay separated.
    padded = str(text).replace('\n', ' \n ')
    # Replace accents: NFD splits each accented letter into base letter +
    # combining mark, and the ASCII round-trip drops the marks (áàäâã --> a).
    ascii_text = normalize('NFD', padded).encode('ascii', 'ignore').decode('ascii')
    # Blank out the unwanted characters (by default specials such as !"#$%&/()=...).
    clean = sub(pattern, ' ', ascii_text, flags=UNICODE)
    # Trim the ends and keep a single space between words.
    clean = sub(r'\s{2,}', ' ', clean.strip())
    # Lower-case only when requested.
    if lower:
        clean = clean.lower()
    # An empty record (or a stringified NaN) is reported as a null.
    if clean in ('', 'nan'):
        return nan
    return clean

## Transformar

In [7]:
# Open the workbook inside a context manager so its file handle is closed
# as soon as the sheet has been read (the bare ExcelFile(...) call left it open).
with ExcelFile(BASE_DIR.joinpath(FILE_NAME)) as workbook:
    # Read the first sheet and cast every column to str so that all values
    # are handled as text from here on.
    data = workbook.parse(sheet_name=0).astype(str)
# Show one random row as a quick sanity check.
data.sample()

Unnamed: 0,Currency,Product ID,Name,Barcodes
401041,MXN,4659225,Pluma ibérica,{8436561591756}


In [8]:
# Remove the braces and quotes that wrap each raw 'Barcodes' value.
barcodes = data['Barcodes'].map(lambda raw: clean_text(raw, pattern=r'[\{\}\"\']'))
# One column per comma-separated barcode.
barcodes = barcodes.str.split(',', expand=True)
# Keep the rows aligned with the source frame.
barcodes = barcodes.set_index(data.index)
# Rename the positional columns 0, 1, 2... to barcodes_1, barcodes_2, barcodes_3...
barcodes.columns = [f'barcodes_{position+1}' for position in barcodes.columns]
# Show one random row as a quick sanity check.
barcodes.sample()

Unnamed: 0,barcodes_1,barcodes_2,barcodes_3,barcodes_4,barcodes_5,barcodes_6,barcodes_7,barcodes_8,barcodes_9,barcodes_10
256622,7501227200660,,,,,,,,,


In [9]:
# Attach the per-position barcode columns to the catalog rows (aligned on the index).
df = data.join(barcodes)
# Peek at a random product that has a seventh barcode.
df.loc[df['barcodes_7'].notna()].sample()

Unnamed: 0,Currency,Product ID,Name,Barcodes,barcodes_1,barcodes_2,barcodes_3,barcodes_4,barcodes_5,barcodes_6,barcodes_7,barcodes_8,barcodes_9,barcodes_10
71891,MXN,2013307,Marinador sabor hawaiian,"{21500048064,2000075180773,2000070895719,20000...",21500048064,2000075180773,2000070895719,2000073390273,2000070132326,2000071053866,2000071141136,2000073800420,2000073892692,2000070869185


In [13]:
# Go from wide to long: one row per (product, barcode position) pair,
# then discard the rows that carry a null.
df = (
    df
    .melt(
        id_vars=['Currency', 'Product ID', 'Name', 'Barcodes'],
        var_name='barcode_n',
        value_name='barcode',
    )
    .dropna()
)
# Show one random row as a quick sanity check.
df.sample()

Unnamed: 0,Currency,Product ID,Name,Barcodes,barcode_n,barcode
137352,MXN,4502677,Caja plegable 2 cajones gris,{4516743159326},barcodes_1,4516743159326


In [28]:
# Collapse to one row per barcode: the Product IDs that share a barcode are
# concatenated into a single string, using ', ' as the separator.
df = df.pivot_table(index='barcode', values='Product ID', aggfunc=', '.join)
# Show one random row as a quick sanity check.
df.sample()

Unnamed: 0_level_0,Product ID
barcode,Unnamed: 1_level_1
8420378000259,138431


In [33]:
# Spread the concatenated Product IDs over one column per ID.
df = df['Product ID'].astype(str).str.split(',', expand=True)
# The IDs may be separated by a comma followed by a space; strip each cell so
# no ID keeps stray leading/trailing whitespace (missing cells stay null).
df = df.apply(lambda ids: ids.str.strip())
# Rename the positional columns 0, 1, 2... to product_id_1, product_id_2, product_id_3...
df.columns = [f'product_id_{col+1}' for col in df.columns]
# Show one random row as a quick sanity check.
df.sample()

Unnamed: 0_level_0,product_id_1,product_id_2,product_id_3,product_id_4,product_id_5,product_id_6,product_id_7,product_id_8,product_id_9,product_id_10,product_id_11,product_id_12
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7503026576557,4563538,,,,,,,,,,,


In [41]:
# Sort using the product_id columns from last to first as keys, then move
# the barcode index back into a regular column.
df = df.sort_values(by=df.columns[::-1].tolist()).reset_index()
# Show the first three rows.
df.head(3)

Unnamed: 0,barcode,product_id_1,product_id_2,product_id_3,product_id_4,product_id_5,product_id_6,product_id_7,product_id_8,product_id_9,product_id_10,product_id_11,product_id_12
0,7501055361199,4599926,4594342,4718214,4720588,4599927,4638703,4601649,4601215,4528681,4601212,4597388,336093.0
1,7501055368440,4601171,4638683,1256963,4718142,4597367,4601174,4601554,4599885,4599887,4720571,336764,
2,7501055368464,4720567,4638679,4601168,4601521,4599876,4597364,1256965,4599879,4601163,4718138,337428,


In [43]:
# Prefix every barcode with an apostrophe.
# NOTE(review): presumably so spreadsheet software keeps the value as text -- confirm.
df['barcode'] = "'" + df['barcode']
# Show one random row as a quick sanity check.
df.sample()

Unnamed: 0,barcode,product_id_1,product_id_2,product_id_3,product_id_4,product_id_5,product_id_6,product_id_7,product_id_8,product_id_9,product_id_10,product_id_11,product_id_12
279098,'875160016341,331178,,,,,,,,,,,


In [45]:
# Base name of the input file without its extension. Path.stem removes only
# the final suffix, so dots inside the name are preserved and a name without
# an extension is kept whole (splitting on '.' lost both).
new_name = Path(FILE_NAME).stem
# Export next to the input file as tab-separated, UTF-16 encoded text, without the index.
df.to_csv(BASE_DIR.joinpath(f'clean_{new_name}.csv'), sep='\t', encoding='utf-16', index=False)