# ¿Cuántas órdenes hay por código postal?

## Parámetros

In [1]:
# Notebook parameters, passed positionally to OrdersByZipcode(...) further down.
# BASE_DIR: folder that every other file name below is joined onto.
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Requests/Orders_by_zipcode'
# SHP_FILE: polygon source inside BASE_DIR (read with geopandas.read_file when
# the pipeline is run with is_folder=True, otherwise read as a CSV with a WKT column).
SHP_FILE = 'SHP_all_CS'
# CSV_FILE: orders table inside BASE_DIR; the pipeline reads its 'lat'/'long' columns.
CSV_FILE = 'raw_MX_orders.csv'
# NAMES_FILE: lookup table inside BASE_DIR, left-joined to the result on its 'CP' column.
NAMES_FILE = 'MX_zipcodes_names.csv'

## Código

In [2]:
from pathlib import Path
from time import time
from warnings import filterwarnings

filterwarnings('ignore')

import matplotlib.pyplot as plt
from geopandas import GeoDataFrame, GeoSeries, read_file, points_from_xy, sjoin
from numpy import floor, ceil
from pandas import DataFrame, concat, read_csv

class OrdersByZipcode:
    """Count orders per zipcode polygon, then export and plot the result.

    The pipeline spatially joins a table of order coordinates against a layer
    of polygons, counts orders per polygon, enriches the counts with a names
    lookup table and optionally writes a CSV and draws a choropleth.

    Attributes set by the methods:
        shp: polygon layer loaded by ``merge_files``.
        df: working frame; its content changes after each pipeline step.
    """

    def __init__(self, base_dir: str, shp_file: str, csv_file: str, names_file: str) -> None:
        """Store ``base_dir`` and resolve the three input paths relative to it."""
        self.base_dir = Path(base_dir)
        self.shp_path = self.base_dir / shp_file
        self.csv_path = self.base_dir / csv_file
        self.names_path = self.base_dir / names_file

    def time_exp(self, start_time: float, end_time: float) -> str:
        """Format the elapsed time between two timestamps (in seconds) in Spanish.

        Returns e.g. ``'34.60 segundos'`` or ``'6 minutos con 13.39 segundos'``;
        the minutes part is omitted when it is zero.
        """
        minutes, seconds = divmod(end_time - start_time, 60)
        minutes = int(minutes)
        if minutes > 0:
            # Pick the singular/plural noun explicitly instead of stripping
            # characters from an already formatted string.
            unit = 'minuto' if minutes == 1 else 'minutos'
            prefix = f"{minutes} {unit} con "
        else:
            prefix = ""
        return f"{prefix}{seconds:.2f} segundos"

    def merge_files(self, lat_col: str, lng_col: str, is_shp_folder: bool=False) -> None:
        """Load polygons and orders, then spatially join them into ``self.df``.

        Args:
            lat_col: name of the latitude column in the orders CSV.
            lng_col: name of the longitude column in the orders CSV.
            is_shp_folder: when True ``shp_path`` is read with
                ``geopandas.read_file`` and reprojected to WGS84 lon/lat;
                otherwise it is read as a CSV whose ``geo`` column holds WKT.
        """
        if is_shp_folder:
            self.shp = read_file(self.shp_path).to_crs(crs="+proj=longlat +datum=WGS84 +no_defs")
        else:
            geo = read_csv(self.shp_path)
            geo['geometry'] = GeoSeries.from_wkt(geo['geo'])
            # NOTE(review): the WKT is labelled EPSG:6372 without reprojection;
            # confirm that the stored coordinates really are in that CRS.
            self.shp = GeoDataFrame(geo, crs='EPSG:6372').drop('geo', axis=1)
            del geo  # release the raw table before loading the orders

        orders = read_csv(self.csv_path)
        # Tag the points with the polygons' CRS so both sides of the join
        # agree. The previous hard-coded 'EPSG:6372' label contradicted the
        # WGS84 polygons of the shapefile branch; coordinates are unchanged.
        points = points_from_xy(orders[lng_col], orders[lat_col], crs=self.shp.crs)
        orders = GeoDataFrame(orders, geometry=points, crs=self.shp.crs)
        self.df = sjoin(self.shp, orders)

    def group_by_chunks(self, df: DataFrame, group_cols: list, value_cols: list, chunk_size: int=50000, verbose: bool=True) -> DataFrame:
        """Count ``value_cols`` per ``group_cols`` processing ``df`` in row chunks.

        After each chunk the accumulated partial counts are written to
        ``<base_dir>/batches/acum_XX.csv`` as a checkpoint.

        Args:
            df: table to aggregate.
            group_cols: columns to group by.
            value_cols: columns whose non-null values are counted.
            chunk_size: number of rows per chunk; must be positive.
            verbose: print a progress line after every chunk.

        Returns:
            One row per group with the total counts; ``group_cols`` are
            regular columns. An empty frame with those columns when ``df``
            has no rows.

        Raises:
            ValueError: if ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError(f'chunk_size must be positive, got {chunk_size}')

        batch_folder = self.base_dir / 'batches'
        batch_folder.mkdir(exist_ok=True)

        # Size the loop from the frame actually being processed (not self.df).
        n_chunks = int(ceil(len(df) / chunk_size))
        if n_chunks == 0:
            return DataFrame(columns=[*group_cols, *value_cols])

        partials = []
        for i in range(n_chunks):
            start = i * chunk_size
            # Positional slicing: the index coming out of a spatial join is
            # neither unique nor 0..n-1, so label-based .loc slicing would
            # select the wrong rows or raise.
            chunk = df.iloc[start:start + chunk_size]
            partials.append(chunk.groupby(group_cols)[value_cols].count())
            concat(partials).to_csv(batch_folder / f'acum_{i + 1:02d}.csv')
            if verbose: print(f'Batch {i+1}/{n_chunks} agrupado correctamente')

        # A group can appear in several chunks; add its partial counts up.
        return concat(partials).reset_index().groupby(group_cols)[value_cols].sum().reset_index()

    def group_by_polygon(self, id_col: str, chunks: bool=False) -> None:
        """Replace ``self.df`` with order counts per polygon, indexed by ``id_col``.

        Geometries are compared through their string (WKT) representation.

        Args:
            id_col: column of ``self.shp`` used as index of the result.
            chunks: aggregate with ``group_by_chunks`` instead of a single
                ``groupby``.
        """
        as_text = self.df.astype(str)
        if chunks:
            counts = self.group_by_chunks(as_text, group_cols=['geometry'], value_cols=['order_id'], verbose=True)
        else:
            counts = as_text.groupby('geometry')[['order_id']].count().reset_index()
        counts = counts.merge(self.shp.astype(str)).set_index(id_col)
        # Turn the WKT text back into geometry objects.
        counts['geometry'] = GeoSeries.from_wkt(counts['geometry'])
        self.df = GeoDataFrame(counts)

    def orders_heatmap(self, frame=None, **kwargs) -> None:
        """Plot a frame with its ``plot`` method, hide the axes and show it.

        Args:
            frame: object to plot; defaults to ``self.df``.
            **kwargs: forwarded unchanged to ``frame.plot``.
        """
        target = self.df if frame is None else frame
        target.plot(**kwargs)
        plt.axis('off')
        plt.show()

    def full_pipeline(self, is_folder: bool=False, export_result: bool=True) -> DataFrame:
        """Run join, count, enrich, export and plot; return the enriched table.

        Args:
            is_folder: forwarded to ``merge_files`` as ``is_shp_folder``.
            export_result: write ``orders_by_zipcode.csv`` into ``base_dir``.

        Returns:
            ``self.df``: string-typed table with one row per ``d_cp`` holding
            the order count, the polygon and the columns of the names file.
        """
        start = time()
        self.merge_files(lat_col='lat', lng_col='long', is_shp_folder=is_folder)
        print(f'Unido en {self.time_exp(start, time())}!')

        # Only the counts are kept, so a plain groupby replaces
        # dissolve(by='d_cp', aggfunc='count'): same numbers without unioning
        # every polygon's geometry just to discard it.
        counts = self.df.groupby('d_cp')[['order_id']].count()
        counts = (
            counts.reset_index()
            .merge(self.shp[['d_cp', 'geometry']])
            .drop_duplicates('d_cp')
            .set_index('d_cp')
        )
        # Keep a geometry-aware, numeric copy for plotting: the exported
        # table below is converted to strings and can no longer be drawn.
        geo_counts = GeoDataFrame(counts, geometry='geometry')

        # NOTE(review): both keys are compared as strings; if the names file
        # stores CP as integers, zipcodes with leading zeros will not match.
        names = read_csv(self.names_path).astype(str)
        self.df = counts.astype(str).merge(names, left_on='d_cp', right_on='CP', how='left')
        print(f'Agrupado en {self.time_exp(start, time())}!')

        if export_result:
            self.df.to_csv(self.base_dir / 'orders_by_zipcode.csv')
            print(f'Exportado en {self.time_exp(start, time())}!')

        # Plotting stays best-effort, but a failure is reported, not hidden.
        try:
            self.orders_heatmap(frame=geo_counts, column='order_id', cmap='coolwarm', legend=False)
            print(f'Graficado en {self.time_exp(start, time())}!')
        except Exception as err:
            print(f'No se pudo graficar: {err}')
        return self.df


## Transformar

In [3]:
# Build the pipeline object; the constructor only resolves file paths under BASE_DIR.
oz = OrdersByZipcode(BASE_DIR, SHP_FILE, CSV_FILE, NAMES_FILE)

In [4]:
# Run every step; is_folder=True makes the polygons load through geopandas.read_file.
df = oz.full_pipeline(is_folder=True)
# Show four random rows of the returned table.
df.sample(4)

Unido en 34.60 segundos!
Agrupado en 6 minutos con 13.39 segundos!
Exportado en 6 minutos con 13.76 segundos!


Unnamed: 0,order_id,geometry,Pais,CP,Colonia,Estado,Estado_ID,Municipio,Municipio_ID,latitud,longitud
790,110,"POLYGON ((-99.1246144618746 19.30386344354566,...",MX,14300,Residencial Miramontes,Distrito Federal,9,Tlalpan,12,19.3016,-99.1232
946,7,POLYGON ((-102.2925901477603 21.90066511422294...,MX,20030,El Sol,Aguascalientes,1,Aguascalientes,1,21.8115,-102.2957
2486,12,POLYGON ((-98.99143659672308 19.36743524728848...,MX,57940,Loma Bonita,México,15,Nezahualcóyotl,58,19.4226,-99.0227
3617,2,POLYGON ((-100.9642877832251 22.15973584603759...,MX,78306,Deportivo FFCC,San Luis Potosí,24,San Luis Potosí,28,22.1647,-100.9662
