In [105]:
import pickle
from datetime import date, timedelta, datetime

import luigi
import pandas as pd

from src.utils import general as gral
from src.pipeline import ingesta_almacenamiento as ing
import src.utils.constants as cte

In [149]:
%%file LuigiIngestionTasks.py

import luigi

from src.utils import general as gral
from src.pipeline import ingesta_almacenamiento as ing
import src.utils.constants as cte

from datetime import date, timedelta, datetime
import pickle


class IngestionTask(luigi.Task):
    """Download inspection records from the API and pickle them to a local file.

    Parameters
    ----------
    path_cred : str
        Path to the credentials YAML; handed to ``gral.get_api_token``.
    initial : bool
        ``True`` runs ``ing.ingesta_inicial`` and writes under
        ``cte.BUCKET_PATH_HIST``; ``False`` runs ``ing.ingesta_consecutiva``
        and writes under ``cte.BUCKET_PATH_CONS``.
    limit : int
        Forwarded as ``limit`` to the ingestion function.
    date : datetime.date or None
        Used to name the output file and, in consecutive mode, forwarded as
        ``fecha`` (formatted ``YYYY-MM-DD``). When ``None`` the output file is
        named after today's date.
    """

    path_cred = luigi.Parameter(default = 'credentials.yaml')
    initial = luigi.BoolParameter(default=True, parsing = luigi.BoolParameter.EXPLICIT_PARSING)
    limit = luigi.IntParameter(default = 300000)
    date = luigi.DateParameter(default = None)

    def _file_date(self):
        """Return the ``YYYY-MM-DD`` label used in the output file name."""
        # Prefer the task parameter so the target is a pure function of the
        # task's parameters; fall back to today only when no date was given.
        if self.date is not None:
            return self.date.strftime('%Y-%m-%d')
        return datetime.today().strftime('%Y-%m-%d')

    def output(self):
        """Return the local pickle target for this ingestion run."""
        prefix = cte.BUCKET_PATH_HIST if self.initial else cte.BUCKET_PATH_CONS
        file_name = prefix + '{}.pkl'.format(self._file_date())

        # Nop format: the pickle payload is written as raw bytes.
        return luigi.local_target.LocalTarget(file_name, format = luigi.format.Nop)

    def run(self):
        """Fetch the records and pickle them into ``output()``.

        Raises
        ------
        ValueError
            If ``initial`` is false and no ``date`` was supplied.
        """
        # Fail early with a clear message instead of an AttributeError on
        # ``None.strftime`` after the client has already been created.
        if not self.initial and self.date is None:
            raise ValueError(
                "IngestionTask requires --date when --initial is false"
            )

        my_token = gral.get_api_token(self.path_cred)
        cliente = ing.get_client(data_url = cte.DATA_URL, token = my_token)

        if self.initial:
            datos = ing.ingesta_inicial(
                cliente,
                data_set = cte.DATA_SET,
                limit = self.limit
            )
        else:
            datos = ing.ingesta_consecutiva(
                client = cliente,
                data_set = cte.DATA_SET,
                fecha = self.date.strftime('%Y-%m-%d'),
                limit = self.limit
            )

        with self.output().open('wb') as f:
            pickle.dump(datos, f)
        

            
# Script entry point: hand control to luigi when this file is executed directly.
if __name__ == '__main__':
    luigi.run()

Overwriting LuigiIngestionTasks.py


In [None]:
%%bash
# Run IngestionTask with the local scheduler in initial mode (--initial true, --limit 100).
PYTHONPATH='.' luigi \
--module src.pipeline.LuigiIngestionTasks IngestionTask \
--local-scheduler \
--path-cred ./conf/local/credentials.yaml \
--initial true \
--limit 100 \
--date '2021-03-15' 

In [None]:
%%bash
# Run IngestionTask with the local scheduler in consecutive mode (--initial false, --limit 1000).
PYTHONPATH='.' luigi \
--module src.pipeline.LuigiIngestionTasks IngestionTask \
--local-scheduler \
--path-cred ./conf/local/credentials.yaml \
--initial false \
--limit 1000 \
--date '2021-01-15'

In [110]:
# Load the pickled historical ingestion for 2021-03-15.
# NOTE(review): the result is bound to `bd_consecutiva` even though the file is
# the historical one — confirm whether the name is intentional.
with open("../../results/food_inspections/2021-03-15/historical-inspections-2021-03-15.pkl", 'rb') as handle:
    bd_consecutiva = pickle.load(handle)

In [None]:
# Preview the first rows of the records loaded in the previous cell.
pd.DataFrame(bd_consecutiva).head()

In [112]:
# Load the pickled consecutive ingestion for 2021-03-15 (rebinds `bd_consecutiva`).
with open("../../results/food_inspections/2021-03-15/consecutive-inspections-2021-03-15.pkl", 'rb') as handle:
    bd_consecutiva = pickle.load(handle)

In [None]:
# Preview the first rows of the consecutive-ingestion records.
pd.DataFrame(bd_consecutiva).head()

In [166]:
%%file LuigiTasks2.py

import luigi
from src.utils import general as gral
from src.pipeline import ingesta_almacenamiento as ing
from src.pipeline.LuigiIngestionTasks import IngestionTask
import src.utils.constants as cte
from datetime import date, timedelta, datetime
import pickle

class ExportFileTask(luigi.Task):
    """Upload the records produced by ``IngestionTask`` via ``ing.guardar_ingesta``.

    Parameters
    ----------
    path_cred : str
        Path to the credentials YAML; forwarded to ``IngestionTask`` and to
        ``ing.guardar_ingesta``.
    initial : bool
        Selects ``cte.BUCKET_PATH_HIST`` (``True``) or
        ``cte.BUCKET_PATH_CONS`` (``False``) as the destination prefix.
    limit : int
        Forwarded to ``IngestionTask``.
    date : datetime.date or None
        Forwarded to ``IngestionTask``.
    bucket_path : str
        Passed to ``ing.guardar_ingesta`` as ``bucket``.

    NOTE(review): this task defines no ``output()``, so luigi has no target to
    decide that it already ran — confirm whether re-uploading on every
    invocation is intended.
    """

    path_cred = luigi.Parameter(default = 'credentials.yaml')
    initial = luigi.BoolParameter(default=True, parsing = luigi.BoolParameter.EXPLICIT_PARSING)
    limit = luigi.IntParameter(default = 300000)
    date = luigi.DateParameter(default = None)
    bucket_path = luigi.Parameter(default = 'data-product-architecture-equipo-n')

    def requires(self):
        """Depend on the ingestion run with the same parameters."""
        # Keyword arguments keep this correct even if IngestionTask's
        # parameter declaration order changes.
        return IngestionTask(
            path_cred = self.path_cred,
            initial = self.initial,
            limit = self.limit,
            date = self.date
        )

    def input(self):
        """Load and return the records pickled by the required ``IngestionTask``.

        This overrides ``luigi.Task.input`` and returns the unpickled data
        rather than a target. The path comes from the upstream task's own
        ``output()`` so both tasks always agree on the file location.
        """
        file_name = self.requires().output().path

        # The pickle is produced by our own upstream task; do not point this
        # at files from untrusted sources.
        with open(file_name, 'rb') as f:
            data = pickle.load(f)

        return data

    def run(self):
        """Send the ingested records to the bucket under the mode's prefix."""
        file_type = cte.BUCKET_PATH_HIST if self.initial else cte.BUCKET_PATH_CONS

        ing.guardar_ingesta(
            path_cred = self.path_cred,
            bucket = self.bucket_path,
            bucket_path = file_type,
            data = self.input()
        )

    
            
# Script entry point: hand control to luigi when this file is executed directly.
if __name__ == '__main__':
    luigi.run()

Overwriting LuigiTasks2.py


In [None]:
%%bash
# Run ExportFileTask with the local scheduler in consecutive mode.
# Every option line except the last ends with a backslash so that all flags
# belong to the single luigi invocation.
PYTHONPATH='.' luigi \
--module src.pipeline.LuigiTasks2 ExportFileTask \
--local-scheduler \
--path-cred ./conf/local/credentials.yaml \
--initial false \
--limit 1000 \
--date '2021-01-15' \
--bucket-path 'data-product-architecture-equipo-n'


In [None]:
%%bash
# Run ExportFileTask with the local scheduler in initial (historical) mode.
# Every option line except the last ends with a backslash so that all flags
# belong to the single luigi invocation.
PYTHONPATH='.' luigi \
--module src.pipeline.LuigiTasks2 ExportFileTask \
--local-scheduler \
--path-cred ./conf/local/credentials.yaml \
--initial true \
--limit 300000 \
--date '2021-03-15' \
--bucket-path 'data-product-architecture-equipo-n'


In [None]:
%%bash
# Run ExportFileTask in consecutive mode.
# NOTE(review): unlike the other cells, --local-scheduler is not passed here —
# confirm that running against a central scheduler is intended.
PYTHONPATH='.' luigi \
--module src.pipeline.LuigiTasks2 ExportFileTask \
--path-cred ./conf/local/credentials.yaml \
--initial false \
--limit 1000 \
--date '2021-03-15' \
--bucket-path 'data-product-architecture-equipo-n'

In [159]:
# Path to the historical pickle for 2021-03-15, two directory levels above the notebook.
file_name = '../../' + cte.BUCKET_PATH_HIST + '2021-03-15.pkl'

In [160]:
# Unpickle the records stored at `file_name` into `data`.
with open(file_name, 'rb') as f:
            data = pickle.load(f)

In [161]:
# Display the loaded records (this renders the whole object, not a slice).
data

[{'inspection_id': '2492713',
  'dba_name': 'VENTEUX',
  'license_': '2762644',
  'risk': 'Risk 1 (High)',
  'address': '224-230 N MICHIGAN AVE FL ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60601',
  'inspection_date': '2021-03-10T00:00:00.000',
  'inspection_type': 'License',
  'results': 'Not Ready',
  'latitude': '41.88637481521078',
  'longitude': '-87.6246754356595',
  'location': {'latitude': '-87.6246754356595',
   'longitude': '41.88637481521078'}},
 {'inspection_id': '2484866',
  'dba_name': 'POKE BAR',
  'aka_name': 'POKE BAR',
  'license_': '2595017',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '10 S LA SALLE ST ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60603',
  'inspection_date': '2021-02-17T00:00:00.000',
  'inspection_type': 'Canvass',
  'results': 'Out of Business',
  'latitude': '41.88167475683708',
  'longitude': '-87.63250418895821',
  'location': {'latitude': '-87.63250418895821',
   'longitude': '41.88167475683708'}},
 