# Leer el texto de una imagen en un PDF

## Parámetros

In [None]:
# Folder that holds the input PDF; it is joined with FILE_NAME to locate the file.
BASE_DIR = '/Users/efraflores/Desktop/EF/EF/Practice'
# Name of the PDF file, looked up inside BASE_DIR.
FILE_NAME = 'peritos.pdf'

## Clases

### Geolocalización

In [None]:
# Control de datos
from io import BytesIO
from typing import Dict, Optional
from zipfile import ZipFile
from requests import get as get_req

# Ingeniería de variables
from pandas import DataFrame

class GeoLoc:
    '''
    Download and tabulate the GeoNames postal-code dump of one country.

    The data is read from <http://download.geonames.org/export/zip>, using the
    ``<ISO code>.zip`` archive and the ``<ISO code>.txt`` member inside it.
    '''

    # Replacements applied by ``get_geodata`` when the caller supplies none.
    # Kept at class level (and never mutated) instead of a mutable default argument.
    DEFAULT_REPLACEMENTS = {
        'México': 'Estado de México',
        'Distrito Federal': 'Ciudad de México',
    }

    def __init__(self, iso_country_code: str='MX') -> None:
        '''
        Store the country code, the archive URL derived from it and the
        column names used for the tab-separated rows.

        Args:
            iso_country_code: Country code used to build the archive URL and
                the name of the text file inside the archive.
        '''
        self.country = iso_country_code
        self.zip_url = f'http://download.geonames.org/export/zip/{self.country}.zip'
        self.cols = [
            'country_code',
            'postal_code',
            'place_name',
            'state_name',
            'state_code',
            'province_name',
            'province_code',
            'community_name',
            'community_code',
            'lat',
            'lon',
            'accuracy',
        ]

    def get_geodata(self, decode_to: str='utf-8', replace_dict: Optional[Dict]=None) -> DataFrame:
        '''
        Download the country archive and return its rows as a DataFrame.

        Args:
            decode_to: Text encoding used to decode every line of the file.
            replace_dict: Mapping passed to ``DataFrame.replace``. When
                omitted, ``DEFAULT_REPLACEMENTS`` is used.

        Returns:
            DataFrame with one row per line of the file and the columns
            listed in ``self.cols``; every value is kept as text.

        Raises:
            requests.HTTPError: If the server answers with an error status
                (for instance, an unknown country code).
        '''
        if replace_dict is None:
            replace_dict = self.DEFAULT_REPLACEMENTS

        response = get_req(self.zip_url)
        # Fail with the HTTP error itself rather than a confusing "not a zip file" later on
        response.raise_for_status()

        df = self._parse_geodata(response.content, decode_to, replace_dict)
        print(f'Códigos postales de {self.country} importados desde {self.zip_url}')
        return df

    def _parse_geodata(self, raw_zip: bytes, decode_to: str, replace_dict: Dict) -> DataFrame:
        '''
        Turn the raw bytes of the zip archive into the final DataFrame.

        Args:
            raw_zip: Content of the downloaded ``.zip`` archive.
            decode_to: Text encoding used to decode every line.
            replace_dict: Mapping passed to ``DataFrame.replace``.

        Returns:
            DataFrame with the columns in ``self.cols`` after the replacements.
        '''
        rows = []
        # Read the archive straight from memory and make sure both handles get closed
        with ZipFile(BytesIO(raw_zip)) as archive, archive.open(f'{self.country}.txt') as txt_file:
            for raw_line in txt_file:
                # Drop only the line terminator (LF or CRLF); trailing tabs mark empty fields
                text = raw_line.decode(decode_to).rstrip('\r\n')
                # A blank line would yield a one-field row and break the DataFrame constructor
                if text:
                    rows.append(text.split('\t'))

        df = DataFrame(rows, columns=self.cols)
        return df.replace(replace_dict)

### Imagen desde PDF

In [None]:
from PIL import Image
from pathlib import Path
from easyocr import Reader
from pandas import DataFrame

from re import findall
from numpy import asarray
from pdf2image import convert_from_path

class ImageFromPDF(GeoLoc):
    '''
    Render PDF pages as images, read their text with an OCR reader and
    attach geodata to the postal codes found in that text.
    '''

    def __init__(self, base_dir: str, file_name:str) -> None:
        '''
        Resolve the PDF path and create the folder for the page images.

        Args:
            base_dir: Folder that contains the PDF; must already exist,
                because the image folder is created directly inside it.
            file_name: Name of the PDF file inside ``base_dir``.
        '''
        super().__init__('MX')
        self.base_dir = Path(base_dir)
        self.file_name = file_name
        self.file_path = self.base_dir.joinpath(self.file_name)
        if not self.file_path.is_file():
            # Only a warning: the object is still built so the file can be added afterwards
            print(f'There should be a file called: {self.file_name} at path:\n{self.base_dir}\n\nAdd this file and try again!\n')
        # ``stem`` keeps inner dots ('a.b.pdf' -> 'a.b') and is never empty for extension-less names
        self.just_name = Path(self.file_name).stem
        self.img_folder = self.base_dir.joinpath(f'{self.just_name}_images')
        self.img_folder.mkdir(exist_ok=True)
        # Defined up front so ``img_to_table`` works (on zero images) before ``pages_to_img`` runs
        self.images_list = []

    def pages_to_img(self, **kwargs) -> None:
        '''
        Convert the PDF into JPEG files and remember their paths in
        ``self.images_list``.

        Args:
            **kwargs: Forwarded to ``convert_from_path``. ``dpi`` defaults to
                300 but may be overridden by the caller.
        '''
        # setdefault avoids a duplicate-keyword TypeError when the caller passes its own dpi
        kwargs.setdefault('dpi', 300)
        pages = convert_from_path(self.file_path, **kwargs)

        self.images_list = []
        # Numbering starts at 1 for the first converted page, whatever page range was requested
        for page_number, page in enumerate(pages, start=1):
            img_path = self.img_folder.joinpath(f'{self.just_name}_page{page_number:02d}.jpg')
            page.save(img_path, 'JPEG')
            self.images_list.append(img_path)

    def read_img(self, img_path: Path, reader_obj, resize_to: float=1.0) -> list:
        '''
        Read the text of one image.

        Args:
            img_path: Path of the image file.
            reader_obj: Object exposing ``readtext(array, detail=0)``
                (the notebook passes an ``easyocr.Reader``).
            resize_to: Scale factor applied to both image dimensions before reading.

        Returns:
            Whatever ``reader_obj.readtext`` returns for the grayscale,
            resized image.
        '''
        # The context manager releases the file handle once the pixels have been copied
        with Image.open(img_path) as img:
            new_size = tuple(int(side * resize_to) for side in img.size)
            gray_img = img.convert('L').resize(new_size)
        return reader_obj.readtext(asarray(gray_img), detail=0)

    def img_to_table(self, **kwargs) -> DataFrame:
        '''
        Read every image in ``self.images_list`` and tabulate the results.

        Args:
            **kwargs: Forwarded to ``read_img`` (``reader_obj`` is required).

        Returns:
            DataFrame with the columns ``path``, ``text_list`` (result of
            ``read_img``) and ``text`` (the list joined with ``' _'``).
        '''
        data = [(img, self.read_img(img, **kwargs)) for img in self.images_list]
        df = DataFrame(data, columns=['path','text_list'])
        df['text'] = df['text_list'].map(' _'.join)
        return df

    def full_pipeline(self, reader_obj, first_page: int=2, last_page: int=2, resize_to: float=0.3) -> DataFrame:
        '''
        Run the whole process: PDF pages -> images -> text -> postal codes -> geodata.

        Args:
            reader_obj: OCR reader handed to ``read_img``.
            first_page: First PDF page to convert (forwarded to ``pages_to_img``).
            last_page: Last PDF page to convert (forwarded to ``pages_to_img``).
            resize_to: Scale factor handed to ``read_img``.

        Returns:
            DataFrame with one row per postal code found, left-joined with the
            geodata of that postal code (first occurrence per code).
        '''
        self.pages_to_img(first_page=first_page, last_page=last_page)
        df = self.img_to_table(reader_obj=reader_obj, resize_to=resize_to)

        # Raw string: '\s' and '\d' are invalid escapes in a normal literal.
        # Captures the digits that follow "ostal" plus one whitespace character
        # (presumably "Postal"/"postal" regardless of how the first letter was read).
        df['postal_code'] = df['text'].map(lambda x: ' - '.join(findall(r'ostal\s(\d+)', x)))

        # One column per match, then stacked into a single ``postal_code`` column
        codes_wide = df['postal_code'].str.split(' - ', expand=True)
        pc = codes_wide.melt(value_name='postal_code').dropna().iloc[:,-1:]

        # Keep a single geodata row per postal code so the join does not multiply rows
        geo = self.get_geodata().drop_duplicates('postal_code')
        return pc.merge(geo, how='left', on='postal_code')

# Bind the pipeline to the configured PDF; the constructor also creates the folder for the page images.
ip = ImageFromPDF(BASE_DIR, FILE_NAME)

## Transformar

### Lector de texto en imágenes

In [None]:
# OCR reader built once with the language list ['es'] and handed to the pipeline below.
reader = Reader(['es'])

### Tabla final

In [None]:
# Run the whole process: PDF page -> image -> OCR text -> postal codes joined with geodata.
df = ip.full_pipeline(reader)
# Report the (rows, columns) size of the result.
print(df.shape)
# Show one random row as a quick spot check.
df.sample()