# Información de catálogo

## Parámetros

In [None]:
# Base directory that contains the catalog PDF files
BASE_DIR = '/content/drive/MyDrive/catalogos/PDF'
# Name of the PDF file to process (it is looked up inside BASE_DIR)
FILE_NAME = '2021_09.pdf'

## Entorno

### Instalación

#### Para leer PDF

In [None]:
!apt-get install poppler-utils
!pip install pdf2image

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 40 not upgraded.
Need to get 154 kB of archives.
After this operation, 613 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 poppler-utils amd64 0.62.0-2ubuntu2.12 [154 kB]
Fetched 154 kB in 0s (2,341 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 148489 files and directories currently installed.)
Preparing to unpack .../poppler-utils_0.62.0-2ubuntu2.12_amd64.deb ...
Unpacking poppler-utils (0.62.0-2ubuntu2.12) ...
Setting up poppler-utils (0.62.0-2ubuntu2.12) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Collecting pdf2image
  Downloading pdf2image

#### Para reconocer texto de imágenes

In [None]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.4-py3-none-any.whl (63.6 MB)
[K     |████████████████████████████████| 63.6 MB 8.8 kB/s 
Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: python-bidi, easyocr
Successfully installed easyocr-1.4 python-bidi-0.4.2


### Librerías

In [None]:
import re
import easyocr
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from pdf2image import convert_from_path

## Transformar

### PDF a tabla

#### Crear directorio para imágenes

In [None]:
# Resolve the working paths derived from the notebook parameters.
BASE_DIR = Path(BASE_DIR)
# Path.stem removes only the final extension, so a name without an extension
# no longer yields an empty folder name and interior dots are preserved.
FOLDER_NAME = Path(FILE_NAME).stem
# Page images live in <parent of BASE_DIR>/images/<FOLDER_NAME>.
FOLDER_DIR = BASE_DIR.parent / 'images' / FOLDER_NAME

In [None]:
# Create the folder for the page images. parents=True also creates the
# intermediate 'images' directory, which would otherwise make a first run
# fail with FileNotFoundError.
try:
    FOLDER_DIR.mkdir(parents=True, exist_ok=False)
except FileExistsError:
    # The folder already exists: it is kept as-is, so images written later
    # with the same names overwrite the ones already there.
    print('noooo!')

#### Cada página = una imagen

In [None]:
# Render every page of the PDF at 300 dpi.
# (convert_from_path also accepts first_page/last_page to render only a range.)
pages = convert_from_path(BASE_DIR.joinpath(FILE_NAME), dpi=300)
# Zero-pad the page index to a common width (at least 2 digits, as before) so
# the generated file names sort in page order even with 100 or more pages.
pad = max(2, len(str(len(pages) - 1)))
for i, page in enumerate(pages):
    page.save(FOLDER_DIR.joinpath(f'{FOLDER_NAME}_page{i:0{pad}d}.jpg'), 'JPEG')

#### Cada imagen = 8 recuadros

In [None]:
def split_img(img, n_row, n_col, resize_to=None):
    """Split an image into a grid of ``n_row`` x ``n_col`` equally sized tiles.

    Args:
        img: Image-like object exposing ``size`` (width, height), ``resize``
            and ``crop`` (a PIL image in this notebook).
        n_row: Number of grid rows.
        n_col: Number of grid columns.
        resize_to: Optional scale factor applied to both dimensions before
            splitting; ``None`` keeps the original size.

    Returns:
        List with the cropped tiles in row-major order (left to right, then
        top to bottom). Tile sizes use floor division, so leftover pixels on
        the right and bottom edges do not belong to any tile.
    """
    if resize_to is not None:
        # Pass a real (width, height) tuple rather than a one-shot generator.
        img = img.resize(tuple(int(side * resize_to) for side in img.size))
    width, height = img.size
    tile_w = width // n_col
    tile_h = height // n_row
    return [
        img.crop((col * tile_w, row * tile_h, (col + 1) * tile_w, (row + 1) * tile_h))
        for row in range(n_row)
        for col in range(n_col)
    ]

#### Texto en imagen

In [None]:
# OCR reader created with the language list ['en']. As the cell output shows,
# constructing it downloads the detection and recognition models when needed.
# NOTE(review): the catalog text looks Spanish; adding 'es' might improve
# recognition, but accented characters would not match the [a-zA-Z\s] name
# groups of the product regexes used later -- confirm before changing.
reader = easyocr.Reader(['en'])

Downloading detection model, please wait. This may take several minutes depending upon your network connection.




Downloading recognition model, please wait. This may take several minutes depending upon your network connection.




#### Imagen a lista

In [None]:
# Collect the page images of this catalog.
# - sorted(): glob() order depends on the filesystem, so sort to process the
#   pages in a deterministic (file name) order.
# - suffix.lower(): also accepts upper-case extensions such as '.JPG'.
image_suffixes = {'.jpg', '.jpeg', '.png', '.tiff'}
image_files = sorted(
    x for x in FOLDER_DIR.glob('*')
    if x.is_file() and x.suffix.lower() in image_suffixes
)
image_files[:3]

[PosixPath('/content/drive/MyDrive/catalogos/images/2021_09/2021_09_page00.jpg'),
 PosixPath('/content/drive/MyDrive/catalogos/images/2021_09/2021_09_page01.jpg'),
 PosixPath('/content/drive/MyDrive/catalogos/images/2021_09/2021_09_page02.jpg')]

In [None]:
# Run OCR on every grid box of every page image.
# Each entry of `data` is (file name without extension, box index, OCR strings).
data = []
for image_path in image_files:
    # Path.stem is the file name without directory or extension, on any OS.
    file_name = image_path.stem
    # The context manager closes the image file once the page is processed.
    with Image.open(image_path) as image_file:
        # 2 rows x 4 columns = 8 boxes per page, after scaling the page to 44 %.
        grid = split_img(image_file, n_row=2, n_col=4, resize_to=0.44)
        for i, box_img in enumerate(grid):
            gray_image = box_img.convert('L')  # 'L' = 8-bit grayscale
            # detail=0 makes readtext return only the recognised strings.
            image_text = reader.readtext(np.asarray(gray_image), detail=0)
            data.append((file_name, i, image_text))

In [None]:
# NOTE(review): this cell repeats the OCR loop of the cell directly above it;
# running both performs the whole OCR pass twice. Consider keeping only one.
#
# Run OCR on every grid box of every page image.
# Each entry of `data` is (file name without extension, box index, OCR strings).
data = []
for image_path in image_files:
    # Path.stem is the file name without directory or extension, on any OS.
    file_name = image_path.stem
    # The context manager closes the image file once the page is processed.
    with Image.open(image_path) as image_file:
        # 2 rows x 4 columns = 8 boxes per page, after scaling the page to 44 %.
        grid = split_img(image_file, n_row=2, n_col=4, resize_to=0.44)
        for i, box_img in enumerate(grid):
            gray_image = box_img.convert('L')  # 'L' = 8-bit grayscale
            # detail=0 makes readtext return only the recognised strings.
            image_text = reader.readtext(np.asarray(gray_image), detail=0)
            data.append((file_name, i, image_text))

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


### Ingeniería

#### Lista a tabla

In [None]:
# One row per grid box: source image name, box index within the page, and the
# list of strings recognised by the OCR in that box.
df = pd.DataFrame(data, columns=['file_name', 'n_box', 'text_list'])
df.head()

Unnamed: 0,file_name,n_box,text_list
0,2021_09_page00,0,[Be]
1,2021_09_page00,1,[tter]
2,2021_09_page00,2,"[rWZ, iDe, si, Hogar e]"
3,2021_09_page00,3,"[n Armonia, re, liciosoy, 1aceitel, Better Fry..."
4,2021_09_page00,4,"[iAHORA, Catalogos, mensuales]"


#### Separar nombre de archivo

In [None]:
# Flatten the OCR strings of each box into one space-separated text.
box_tokens = df['text_list']
df['text'] = box_tokens.map(lambda tokens: ' '.join(tokens))

# File names look like '<catalog_id>_<page>': cut them at the last underscore.
file_names = df['file_name']
df['catalog_id'] = file_names.map(lambda name: name.rpartition('_')[0])
df['page'] = file_names.map(lambda name: name.rpartition('_')[2])
df.head()

Unnamed: 0,file_name,n_box,text_list,text,catalog_id,page
0,2021_09_page00,0,[Be],Be,2021_09,page00
1,2021_09_page00,1,[tter],tter,2021_09,page00
2,2021_09_page00,2,"[rWZ, iDe, si, Hogar e]",rWZ iDe si Hogar e,2021_09,page00
3,2021_09_page00,3,"[n Armonia, re, liciosoy, 1aceitel, Better Fry...",n Armonia re liciosoy 1aceitel Better Fryer {1...,2021_09,page00
4,2021_09_page00,4,"[iAHORA, Catalogos, mensuales]",iAHORA Catalogos mensuales,2021_09,page00


#### Buscar info de producto

In [None]:
# Regex for a regular product entry inside the OCR text of a box. Named groups:
#   product_id          - 4 or more digits, followed by one whitespace character
#   product_name        - letters and whitespace only
#   product_price       - 's', 'S' or '$' plus 2+ digits; captured inside a
#                         lookahead and then consumed by the same sub-pattern
#   product_price_promo - 's', 'S', '$' or '5' plus 2+ digits; captured by a
#                         lookahead after a greedy '.*', so the last such token
#                         in the remaining text is preferred
# NOTE(review): 's', 'S' and '5' presumably stand for a '$' misread by the OCR
# -- confirm against real catalog pages.
basic_pattern = r'(?P<product_id>\d{4,})\s(?P<product_name>[a-zA-Z\s]+)(?=(?P<product_price>[sS\$]\d{2,}))[sS\$]\d{2,}(?=.*(?P<product_price_promo>[sS\$5]\d{2,}))'

In [None]:
# Extract the regular product fields from the text of every box.
null_dict = {'product_id': np.nan}  # placeholder row for boxes without a match
basic_regex = re.compile(basic_pattern)
# Named groups in definition order; used as explicit columns so that they all
# exist even when no box matches.
basic_groups = sorted(basic_regex.groupindex, key=basic_regex.groupindex.get)
# Search each text once and reuse the match object.
basic_matches = [basic_regex.search(text) for text in df['text']]
found_basic = pd.DataFrame(
    [m.groupdict() if m is not None else null_dict for m in basic_matches],
    index=df.index,
    columns=basic_groups,
)
df = df.join(found_basic)
df[df['product_id'].notnull()][[x for x in df.columns if x not in ['text_list', 'text']]].head()

Unnamed: 0,file_name,n_box,catalog_id,page,product_id,product_name,product_price,product_price_promo
8,2021_09_page01,0,2021_09,page01,21529,Cuchillos Che NUEVO,S799,$399
16,2021_09_page02,0,2021_09,page02,21725,Lampara Touc,$599,S379
24,2021_09_page03,0,2021_09,page03,21758,Asiento Memory Confort NUEVO,$699,S359
34,2021_09_page04,2,2021_09,page04,16315,Closet A Solutions A,s949,599
35,2021_09_page04,3,2021_09,page04,16316,Closet C Solutions A,s879,559


#### Ofertas

In [None]:
# Regex for an "Oferta" entry inside the OCR text of a box. A price token here
# is one of 's', 'S', '$', '5' followed by digits/commas and ending in 2-3
# digits. After the literal 'Oferta' the named groups are, in order:
#   product_name        - letters and whitespace only
#   product_price       - price token (captured in a lookahead, then consumed)
#   product_id          - digits
#   product_price_promo - price token
#   (one or more non-digit characters)
#   product_id_H        - digits
#   product_price_H     - price token
# NOTE(review): the leading 's'/'S'/'5' presumably cover a '$' misread by the
# OCR -- confirm against real catalog pages.
promo_pattern = r'Oferta(?P<product_name>[a-zA-Z\s]+)(?=(?P<product_price>[sS\$5][\d\,]*\d{2,3}))[sS\$5][\d\,]*\d{2,3}\s*(?P<product_id>\d+)\s*(?P<product_price_promo>[sS\$5][\d\,]*\d{2,3})[^\d]+\s*(?P<product_id_H>\d+)\s*(?P<product_price_H>[sS\$5][\d\,]*\d{2,3})'

In [None]:
# Extract "Oferta" entries; besides the regular fields they carry a second
# product id / price pair (the *_H groups).
promo_regex = re.compile(promo_pattern)
promo_groups = sorted(promo_regex.groupindex, key=promo_regex.groupindex.get)
# Select columns by name instead of by position so the result does not depend
# on which row happens to match first (or on whether any row matches at all).
extra_cols = ['product_id_H', 'product_price_H']
shared_cols = [c for c in promo_groups if c not in extra_cols]

# Search each text once and reuse the match object.
promo_matches = [promo_regex.search(text) for text in df['text']]
promo_found = pd.DataFrame(
    [m.groupdict() if m is not None else null_dict for m in promo_matches],
    index=df.index,
    columns=promo_groups,
).dropna()  # keeps only the boxes where the promo pattern matched

# Promo matches overwrite the fields shared with the basic pattern ...
if not promo_found.empty:
    df.loc[promo_found.index, shared_cols] = promo_found[shared_cols]
# ... and contribute the two *_H columns (NaN for every other box).
df = df.join(promo_found[extra_cols])

df['is_product_H'] = df['text'].map(lambda x: re.search('oferta', x, re.I) is not None)
# flags must be passed by keyword: the 4th positional parameter of re.sub is
# `count`, so passing re.I there made the substitution case-sensitive and
# limited it to 2 replacements.
df['text'] = df['text'].map(lambda x: re.sub('oferta', '', x, flags=re.I))

df[df['product_id'].notnull()][[x for x in df.columns if x not in ['text_list', 'text']]].head()

Unnamed: 0,file_name,n_box,catalog_id,page,product_id,product_name,product_price,product_price_promo,product_id_H,product_price_H,is_product_H
8,2021_09_page01,0,2021_09,page01,21529,Cuchillos Che NUEVO,S799,$399,,,False
16,2021_09_page02,0,2021_09,page02,21725,Lampara Touc,$599,S379,,,False
24,2021_09_page03,0,2021_09,page03,21758,Asiento Memory Confort NUEVO,$699,S359,,,False
26,2021_09_page03,2,2021_09,page03,22942,Ventimax,"s1,499","S1,099",21733.0,s899,True
34,2021_09_page04,2,2021_09,page04,16315,Closet A Solutions A,s949,599,,,False


#### ¿Nuevo?

In [None]:
# Flag boxes whose text mentions 'nuevo' (any case), anywhere in the box text.
df['is_new'] = df['text'].map(lambda x: re.search('nuevo', x, re.I) is not None)
# Remove the 'nuevo' tag from the product names. Only real strings are
# cleaned, so missing names stay NaN instead of becoming the string 'nan'.
df['product_name'] = df['product_name'].map(
    lambda x: re.sub('nuevo', '', x, flags=re.I).strip() if isinstance(x, str) else x
)
df[df['product_id'].notnull()][[x for x in df.columns if x not in ['text_list', 'text']]].head()

Unnamed: 0,file_name,n_box,catalog_id,page,product_id,product_name,product_price,product_price_promo,product_id_H,product_price_H,is_product_H,is_new
8,2021_09_page01,0,2021_09,page01,21529,Cuchillos Che,S799,$399,,,False,True
16,2021_09_page02,0,2021_09,page02,21725,Lampara Touc,$599,S379,,,False,False
24,2021_09_page03,0,2021_09,page03,21758,Asiento Memory Confort,$699,S359,,,False,True
26,2021_09_page03,2,2021_09,page03,22942,Ventimax,"s1,499","S1,099",21733.0,s899,True,False
34,2021_09_page04,2,2021_09,page04,16315,Closet A Solutions A,s949,599,,,False,False


#### Corregir precios

In [None]:
# Keep only the boxes where a product id was found, renumber the rows and
# discard the raw OCR columns.
df = (
    df[df['product_id'].notna()]
    .reset_index(drop=True)
    .drop(columns=['text_list', 'text'])
)

In [None]:
def _drop_misread_prefix(value, reference):
    """Return ``value`` without its first digit when it is not below ``reference``.

    A discounted price is expected to be lower than the price it is compared
    with. When it is not, its first digit is dropped.
    NOTE(review): presumably that digit is a currency symbol the OCR read as
    '5' -- confirm. Missing values (NaN) on either side leave ``value`` as is.
    """
    if pd.isna(value) or pd.isna(reference) or value < reference:
        return value
    return float(str(value)[1:])


# Turn every price column into floats: strip the currency-like prefixes
# ('s', 'S', '$') and thousands separators, keeping missing values as NaN.
for col in df.filter(like='price').columns:
    df[col] = (
        df[col]
        .map(lambda x: re.sub(r'[sS\$\,]', '', str(x)))
        .replace('nan', np.nan)
        .astype(float)
    )

correct_price_promo = []
correct_price_H = []
for price, promo, price_h in zip(df['product_price'], df['product_price_promo'], df['product_price_H']):
    promo = _drop_misread_prefix(promo, price)
    correct_price_promo.append(promo)
    # Compare the H price with the *corrected* promo price; comparing with the
    # raw value would let a misread promo price mask a misread H price.
    correct_price_H.append(_drop_misread_prefix(price_h, promo))

df['product_price_promo'] = correct_price_promo
df['product_price_H'] = correct_price_H

df.head()

Unnamed: 0,file_name,n_box,catalog_id,page,product_id,product_name,product_price,product_price_promo,product_id_H,product_price_H,is_product_H,is_new
0,2021_09_page01,0,2021_09,page01,21529,Cuchillos Che,799.0,399.0,,,False,True
1,2021_09_page02,0,2021_09,page02,21725,Lampara Touc,599.0,379.0,,,False,False
2,2021_09_page03,0,2021_09,page03,21758,Asiento Memory Confort,699.0,359.0,,,False,True
3,2021_09_page03,2,2021_09,page03,22942,Ventimax,1499.0,1099.0,21733.0,899.0,True,False
4,2021_09_page04,2,2021_09,page04,16315,Closet A Solutions A,949.0,599.0,,,False,False


## Exportar

In [None]:
# Export file name: the PDF name with its extension replaced by '.xlsx'.
# Path.stem removes only the final extension, so names without an extension
# (or with interior dots) are handled correctly.
EXPORT_NAME = f"{Path(FILE_NAME).stem}.xlsx"
print(EXPORT_NAME)

2021_09.xlsx


In [None]:
# Order the products by page and by box within the page, then renumber the
# rows and expose that numbering as an explicit 'index' column.
ordered = df.sort_values(by=['page', 'n_box'])
renumbered = ordered.reset_index(drop=True)
export = renumbered.reset_index()
export.head()

Unnamed: 0,index,file_name,n_box,catalog_id,page,product_id,product_name,product_price,product_price_promo,product_id_H,product_price_H,is_product_H,is_new
0,0,2021_09_page01,0,2021_09,page01,21529,Cuchillos Che,799.0,399.0,,,False,True
1,1,2021_09_page02,0,2021_09,page02,21725,Lampara Touc,599.0,379.0,,,False,False
2,2,2021_09_page03,0,2021_09,page03,21758,Asiento Memory Confort,699.0,359.0,,,False,True
3,3,2021_09_page03,2,2021_09,page03,22942,Ventimax,1499.0,1099.0,21733.0,899.0,True,False
4,4,2021_09_page04,2,2021_09,page04,16315,Closet A Solutions A,949.0,599.0,,,False,False


In [None]:
# Write the table to <parent of BASE_DIR>/<EXPORT_NAME>; index=False leaves the
# DataFrame index out of the sheet (the explicit 'index' column is still written).
export.to_excel(BASE_DIR.parent.joinpath(EXPORT_NAME), index=False)