In [1]:
import os
import json
import requests
import pandas as pd

import pytz
import time
from datetime import datetime

from dotenv import load_dotenv

In [2]:
load_dotenv()
TOKEN = os.getenv('JUSBRASIL_TOKEN')

In [3]:
def get_datetime_now(timezone="America/Sao_Paulo"):
    return datetime.now(tz=pytz.timezone("America/Sao_Paulo"))

In [4]:
def background_check(cpf, pagination=None, route_path='background_check/search', token=TOKEN):
    headers = {'Content-Type': 'application/json'}
    
    url = f'https://op.digesto.com.br/api/{route_path}'
    
    payload = {'document': f'{cpf.zfill(11)}'}
    if pagination and 'size' in pagination.keys() and 'cursor' in pagination.keys():
        payload.update(pagination)

    response = requests.request('POST', url, headers=headers, json=payload, auth=(f'{token}', ''))
    
    parsed = None
    
    if response.status_code == 200:
        parsed = json.loads(response.text)
    
    return parsed

In [5]:
def get_execution_time(parsed):
    execution_time, extensions_execution_time = -1, -1
    
    if parsed and parsed.get('extensions'):
        execution_time, extensions_execution_time = parsed.get('extensions').get('executionTime', -1), parsed.get('extensions').get('extensionsExecutionTime', -1)
        
    return execution_time, extensions_execution_time

In [6]:
def get_cursor(parsed):
    return parsed.get('data').get('root').get('crmLawsuitsByPersonId').get('pageInfo').get('endCursor')

In [7]:
def get_has_next_page(parsed):
    return parsed.get('data').get('root').get('crmLawsuitsByPersonId').get('pageInfo').get('hasNextPage')

In [8]:
def get_next_page_info(parsed, n_results_per_page = 10):
    assert(n_results_per_page <= 500)
    dict_pagination = None
    
    if parsed:
        cursor = get_cursor(parsed)

        if get_has_next_page(parsed):
            dict_pagination = {'pagination': {'size': n_results_per_page, 'cursor': cursor}}
        
    return dict_pagination

In [9]:
def normalize_driver_data(parsed):
    df = None
    if parsed:
        df = pd.json_normalize(parsed.get('data').get('root').get('crmLawsuitsByPersonId').get('edges'), sep='_', max_level=2)
    return df

In [10]:
def pipeline_search_driver_lawsuits(cpf, sleep_time_among_requests = 0.5):
    total_execution_time = 0
    total_extensions_execution_time = 0
    
    has_next_page = True
    last_cursor = None
    cursor = None
    dict_pagination = None
    
    df_all_data_driver = pd.DataFrame()

    page = 1
    while has_next_page:
        parsed = background_check(f'{cpf.zfill(11)}', pagination=dict_pagination)
        
        execution_time, extensions_execution_time = get_execution_time(parsed)
        total_execution_time += execution_time
        extensions_execution_time += extensions_execution_time
        
        df_page_data_driver = normalize_driver_data(parsed)
        
        df_page_data_driver['dt_search_for_lawsuits'] = pd.to_datetime(get_datetime_now().replace(tzinfo=None))
        df_page_data_driver['page'] = page
        df_page_data_driver['execution_time'] = execution_time
        df_page_data_driver['extensions_execution_time'] = extensions_execution_time
        
        df_all_data_driver = pd.concat([df_all_data_driver, df_page_data_driver], axis=0)

        dict_pagination = get_next_page_info(parsed, n_results_per_page=10)

        last_cursor = cursor
        cursor = get_cursor(parsed)
        has_next_page = get_has_next_page(parsed)

        if last_cursor == cursor:
            break
        
        page += 1
        time.sleep(sleep_time_among_requests)

    df_all_data_driver['total_execution_time'] = total_execution_time
    df_all_data_driver['total_extensions_execution_time'] = extensions_execution_time
    
    return df_all_data_driver.drop_duplicates()

In [11]:
def get_lawsuit_details(cnj, tipo_numero=5, route_path='tribproc/', token=TOKEN):
    headers = {'Content-Type': 'application/json'}
    
    url = f'https://op.digesto.com.br/api/{route_path}/{cnj}?tipo_numero={tipo_numero}'

    response = requests.request('GET', url, headers=headers, auth=(f'{token}', ''))
    
    parsed = None
    
    if response.status_code == 200:
        parsed = json.loads(response.text)
    
    return parsed

In [12]:
def normalize_lawsuit_data(parsed):
    df = None
    if parsed:
        df = pd.json_normalize(parsed, sep='_')
    return df

In [13]:
def pipeline_search_lawsuit_details(cnj):
    df_lawsuit_details = pd.DataFrame()
    
    lawsuit_details = get_lawsuit_details(f'{cnj.strip()}')
    if lawsuit_details:
        df_lawsuit_details = normalize_lawsuit_data(lawsuit_details)
        df_lawsuit_details['dt_search_lawsuit_details'] = pd.to_datetime(get_datetime_now().replace(tzinfo=None))
    
    return df_lawsuit_details

In [14]:
def pipeline_search_lawsuits_details(list_cnjs):
    df_lawsuits_details = pd.DataFrame()
    
    for cnj in list_cnjs:
        df_lawsuit_details = pipeline_search_lawsuit_details(cnj)
        df_lawsuits_details = pd.concat([df_lawsuits_details, df_lawsuit_details], axis=0)
    
    return df_lawsuits_details

In [21]:
cpfs = ['11291122680',
 '1301501182',
 '1454470909',
 '15030504915',
 '17112010144',
 '21482348896',
 '2280810921',
 '2387430123',
 '2428273906',
 '26200575827',
 '27977980827',
 '31467636053',
 '3207819176',
 '3215460246',
 '33003625802',
 '33989923900',
 '34199155864',
 '34339434000',
 '36452356801',
 '3749404950',
 '4063153100',
 '4294169974',
 '47522828104',
 '4981109903',
 '4996344113',
 '50375750100',
 '51119765072',
 '51460858115',
 '53392841072',
 '57156417134',
 '61967521972',
 '62515624200',
 '632083182',
 '63982790182',
 '65533151915',
 '7097699469',
 '7414637921',
 '7633237805',
 '77939271172',
 '81164700197',
 '82753946868',
 '87037459100',
 '902957066',
 '90693353104',
 '91225124115',
 '91998344053',
 '93740670053',
 '95800930830',
 '96913592',
 '98037838153']

cpfs_not_found = []
cpfs_not_found_details = []
df_all_drivers = pd.DataFrame()

start_time = time.time()
for cpf in cpfs:    
    df_driver_lawsuits = pipeline_search_driver_lawsuits(f'{cpf}')
    
    if not df_driver_lawsuits.empty:
        list_cnjs = df_driver_lawsuits['node_cnjNumber_number'].tolist()
        df_lawsuits_details = pipeline_search_lawsuits_details(list_cnjs)
        if not df_lawsuits_details.empty and 'numero' in df_lawsuits_details.columns:
            final_df = df_driver_lawsuits.merge(df_lawsuits_details, left_on='node_cnjNumber_number', right_on='numero')
            df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
        else:
            cpfs_not_found_details.append(cpf)
    else:
        cpfs_not_found.append(cpf)

end_time = (time.time() - start_time)
print("--- %s seconds ---" % end_time)

df_all_drivers['total_exp_time'] = end_time
df_all_drivers.to_excel('drivers_lawsuits_details.xlsx', index=False)

df_all_drivers.head()

  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


--- 181.5927140712738 seconds ---


Unnamed: 0,matchRelatedPeople_name,matchRelatedPeople_personId,matchRelatedPeople_role_normalized,matchRelatedPeople_role_rawValue,node_cnjNumber_number,node_court_rawValue,node_distributionDate,node_distributionType,node_district,node_forum,...,segredo_justica,arquivado,classes,acessos,uf,criadoEm,partes,dt_search_lawsuit_details,status_op,total_exp_time
0,Pedro Henrique Felipe de Faria,4994653802748477657:28,Réu,PASSIVO,1005536-37.2021.8.11.0040,TJMT,1648695600000,,CUIABÁ CRIMINAL,,...,False,False,"[Receptação, Furto]",[2021-08-02 17:16:52],MT,2021-08-02T17:15:36,"[[681294290, 63071372, DINAMILTON BARROS DA SI...",2022-09-29 17:10:44.719953,,181.592714
1,Pedro Henrique Felipe de Faria,4994653802748477657:28,Réu,RÉU,0168937-32.2021.8.13.0702,TJMG,1629774000000,Sorteio,UBERLÂNDIA/MG,,...,False,False,[],[2021-08-26 06:56:39],MG,2021-08-26T06:56:39,"[[690072348, 54929050, PEDRO HENRIQUE FELIPE D...",2022-09-29 17:10:53.603686,,181.592714
2,Pedro Henrique Felipe de Faria,4994653802748477657:28,,AD,1013493-15.2021.8.11.0000,TJMT,1627354800000,Sorteio,,,...,False,False,"[Prisão Preventiva, Furto Qualificado, Recepta...","[2022-07-22 15:10:11, 2021-08-02 22:00:21]",MT,2021-08-02T22:00:21,"[[681415741, 13629174, JUÍZO DA 2ª VARA CRIMIN...",2022-09-29 17:10:56.922760,,181.592714
0,Sabino Xavier dos Santos Junior,1464604427797647554:29,Réu,Polo Passivo,0005656-90.2022.8.16.0083,TJPR,1662692400000,,Francisco Beltrão,Vara Criminal,...,False,False,"[Citação, Apropriação indébita]",[2022-09-09 21:24:28],PR,2022-09-09T21:24:28,"[[888809049, 19094, Ministério Público Estadua...",2022-09-29 17:11:10.412587,,181.592714
1,Sabino Xavier dos Santos Junior,1464604427797647554:29,Réu,Polo Passivo,0005656-90.2022.8.16.0083,TJPR,1662692400000,,Francisco Beltrão,Vara Criminal,...,False,False,"[Citação, Apropriação indébita]",[2022-09-09 21:24:28],PR,2022-09-09T21:24:28,"[[888809049, 19094, Ministério Público Estadua...",2022-09-29 17:11:36.182503,,181.592714


In [22]:
cpfs_not_found, cpfs_not_found_details

(['1301501182',
  '15030504915',
  '17112010144',
  '21482348896',
  '2387430123',
  '2428273906',
  '26200575827',
  '31467636053',
  '3215460246',
  '33003625802',
  '33989923900',
  '34199155864',
  '36452356801',
  '3749404950',
  '4063153100',
  '4294169974',
  '47522828104',
  '50375750100',
  '51119765072',
  '51460858115',
  '632083182',
  '65533151915',
  '7097699469',
  '7414637921',
  '81164700197',
  '82753946868',
  '87037459100',
  '902957066',
  '90693353104',
  '91998344053',
  '95800930830',
  '96913592',
  '98037838153'],
 ['57156417134'])