In [8]:
import os
import json
import requests
import pandas as pd

import pytz
import time
from datetime import datetime

from dotenv import load_dotenv

In [9]:
load_dotenv()
TOKEN = os.getenv('JUSBRASIL_TOKEN')

In [10]:
def get_datetime_now(timezone="America/Sao_Paulo"):
    return datetime.now(tz=pytz.timezone("America/Sao_Paulo"))

In [11]:
def background_check(cpf, pagination=None, route_path='background_check/search', token=TOKEN):
    headers = {'Content-Type': 'application/json'}
    
    url = f'https://op.digesto.com.br/api/{route_path}'
    
    payload = {'document': f'{cpf.zfill(11)}'}
    if pagination and 'size' in pagination.keys() and 'cursor' in pagination.keys():
        payload.update(pagination)

    response = requests.request('POST', url, headers=headers, json=payload, auth=(f'{token}', ''))
    
    parsed = None
    print(response.status_code)
    if response.status_code == 200:
        parsed = json.loads(response.text)
    
    return parsed

In [12]:
def get_execution_time(parsed):
    execution_time, extensions_execution_time = -1, -1
    
    if parsed and parsed.get('extensions'):
        execution_time, extensions_execution_time = parsed.get('extensions').get('executionTime', -1), parsed.get('extensions').get('extensionsExecutionTime', -1)
        
    return execution_time, extensions_execution_time

In [13]:
def get_cursor(parsed):
    return parsed.get('data').get('root').get('crmLawsuitsByPersonId').get('pageInfo').get('endCursor')

In [14]:
def get_has_next_page(parsed):
    return parsed.get('data').get('root').get('crmLawsuitsByPersonId').get('pageInfo').get('hasNextPage')

In [15]:
def get_next_page_info(parsed, n_results_per_page = 10):
    assert(n_results_per_page <= 500)
    dict_pagination = None
    
    if parsed:
        cursor = get_cursor(parsed)

        if get_has_next_page(parsed):
            dict_pagination = {'pagination': {'size': n_results_per_page, 'cursor': cursor}}
        
    return dict_pagination

In [16]:
def normalize_driver_data(parsed):
    df = None
    if parsed:
        df = pd.json_normalize(parsed.get('data').get('root').get('crmLawsuitsByPersonId').get('edges'), sep='_', max_level=2)
    return df

In [17]:
def pipeline_search_driver_lawsuits(cpf, sleep_time_among_requests = 0.5):
    total_execution_time = 0
    total_extensions_execution_time = 0
    
    has_next_page = True
    last_cursor = None
    cursor = None
    dict_pagination = None
    
    df_all_data_driver = pd.DataFrame()

    page = 1
    while has_next_page:
        parsed = background_check(f'{cpf.zfill(11)}', pagination=dict_pagination)
        
        execution_time, extensions_execution_time = get_execution_time(parsed)
        total_execution_time += execution_time
        extensions_execution_time += extensions_execution_time
        
        df_page_data_driver = normalize_driver_data(parsed)
        
        df_page_data_driver['dt_search_for_lawsuits'] = pd.to_datetime(get_datetime_now().replace(tzinfo=None))
        df_page_data_driver['page'] = page
        df_page_data_driver['execution_time'] = execution_time
        df_page_data_driver['extensions_execution_time'] = extensions_execution_time
        
        df_all_data_driver = pd.concat([df_all_data_driver, df_page_data_driver], axis=0)

        dict_pagination = get_next_page_info(parsed, n_results_per_page=10)

        last_cursor = cursor
        cursor = get_cursor(parsed)
        has_next_page = get_has_next_page(parsed)

        if last_cursor == cursor:
            break
        
        page += 1
        time.sleep(sleep_time_among_requests)

    df_all_data_driver['total_execution_time'] = total_execution_time
    df_all_data_driver['total_extensions_execution_time'] = extensions_execution_time
    
    return df_all_data_driver.drop_duplicates()

In [18]:
def get_lawsuit_details(cnj, tipo_numero=5, route_path='tribproc/', token=TOKEN):
    headers = {'Content-Type': 'application/json'}
    
    url = f'https://op.digesto.com.br/api/{route_path}/{cnj}?tipo_numero={tipo_numero}'

    response = requests.request('GET', url, headers=headers, auth=(f'{token}', ''))
    
    parsed = None
    
    if response.status_code == 200:
        parsed = json.loads(response.text)
    
    return parsed

In [19]:
def normalize_lawsuit_data(parsed):
    df = None
    if parsed:
        df = pd.json_normalize(parsed, sep='_')
    return df

In [20]:
def pipeline_search_lawsuit_details(cnj):
    df_lawsuit_details = pd.DataFrame()
    
    lawsuit_details = get_lawsuit_details(f'{cnj.strip()}')
    if lawsuit_details:
        df_lawsuit_details = normalize_lawsuit_data(lawsuit_details)
        df_lawsuit_details['dt_search_lawsuit_details'] = pd.to_datetime(get_datetime_now().replace(tzinfo=None))
    
    return df_lawsuit_details

In [21]:
def pipeline_search_lawsuits_details(list_cnjs):
    df_lawsuits_details = pd.DataFrame()
    
    for cnj in list_cnjs:
        df_lawsuit_details = pipeline_search_lawsuit_details(cnj)
        df_lawsuits_details = pd.concat([df_lawsuits_details, df_lawsuit_details], axis=0)
    
    return df_lawsuits_details

In [22]:
df_cpfs = pd.read_csv('data/amostra_300.csv')
df_cpfs['cod_cpf'] = df_cpfs['cod_cpf'].astype(str).str.zfill(11)

In [24]:
cpfs = df_cpfs['cod_cpf'].values

cpfs_not_found = []
cpfs_not_found_details = []
df_all_drivers = pd.DataFrame()

start_time = time.time()
for cpf in cpfs: 
    print(f'{cpf}')
    df_driver_lawsuits = pipeline_search_driver_lawsuits(f'{cpf}')
    
    if not df_driver_lawsuits.empty:
        list_cnjs = df_driver_lawsuits['node_cnjNumber_number'].tolist()
        df_lawsuits_details = pipeline_search_lawsuits_details(list_cnjs)
        if not df_lawsuits_details.empty and 'numero' in df_lawsuits_details.columns:
            final_df = df_driver_lawsuits.merge(df_lawsuits_details, left_on='node_cnjNumber_number', right_on='numero')
            df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)
        else:
            cpfs_not_found_details.append(cpf)
    else:
        cpfs_not_found.append(cpf)

end_time = (time.time() - start_time)
print("--- %s seconds ---" % end_time)

df_all_drivers['total_exp_time'] = end_time
df_all_drivers.to_excel('drivers_lawsuits_details_300.xlsx', index=False)

df_all_drivers.head()

18263429234
200
46915001549
200
36903647368
200
99443228149
200
43323650115
200
79652760110
200
03820705694
200
00710417101
200
87732920104
200
27899225191
200
74410741691
200
10365284645
200
44662580153
200
57156417134
200
17112010144
200
14141353191
200
58049720144
200
89210352904
200
41585178187
200
76730263334
200
40536572100
200
27344517191
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


15030504915
200
33989923900
200
01454470909
200
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


30441501168
200
09512878933
200
40648842134
200
01822120926
200
02180410980
200
32497369968
200
77309740963
200
61802140930
200
05776720966
200
06490629909
200
05246898903
200
04240344921
200
02714919901
200
06549687993
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


05112413999
200
45255784915
200
39089509968
200
00042952930
200
95789626904
200
02722757940
200
68026900987
200
04808435977
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


02089131900
200
83751734953
200
65533151915
200
70207097453
200
42160618268
200
38612550220
200
72095237234
200
93740670053
200
31467636053
200
38438968953
200
02884231943
200
66473330968
200
81401221904
200
27977980827
200
95800930830
200
02447278993
200
01724243802
200
09763689864
200
30022034838
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


10921809824
200
01761674803
200
71110283253
200
85774549593
200
36555378832
200
07021017537
200
02973424526
200
01286984548
200
97628905587
200
01971712574
200
07358061527
200
07614676530
200
06590807531
200
00402782550
200
03158441594
200
99307537534
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


00096913592
200
02387430123
200
02767507136
200
87652935172
200
93039239104
200
93843445168
200
02349437507
200
01272104150
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


95778829191
200
59059656172
200
04377751190
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


91225124115
200
03215460246
200
76402304204
200
04867652458
200
11291122680
200
04634742608
200
07229301602
200
04724133171
200
61773972634
200
90693353104
200
47522828104
200
63982790182
200
92287123172
200
98938193187
200
03012228195
200
02640797107
200
88493091120
200
81164700197
200
04996344113
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


98037838153
200
87037459100
200
50375750100
200
77939271172
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


34339434000
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


04063153100
200
51460858115
200
01459426193
200
70173933173
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


00105586161
200
72131136153
200
04103259108
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


01387947150
200
00486412199
200
04162106193
200
01002025192
200
01237887143
200
36177164153
200
54436958191
200
27137171087
200
00118805037
200
46794611368
200
02364343143
200
38403528191
200
00565298135
200
97486280191
200
68997981153
200
00993571190
200
72481056191
200
03103590903
200
73686107287
200
00871794152
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


99135647191
200
79586627187
200
82908290197
200
01224543246
200
03557794121
200
71932348115
200
47468378134
200
45181195920
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


01461296102
200
66697280182
200
56739362149
200
62119990115
200
73614858120
200
91704138191
200
01666521132
200
41533798168
200
01325405175
200
00992677130
200
59455225115
200
04064697170
200
87121875187
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


28739825949
200
95971246100
200
01682965198
200
04536939970
200
200
92137431149
200
48991406149
200
01036209156
200
89433386191
200
04483293109
200
01301501182
200
64538869287
200
03204646289
200
87354829404
200
07097699469
200
02354505450
200
06365178450
200
04925488465
200
03749404950
200
04294169974
200
07414637921
200
61967521972
200
77616790968
200
92693644020
200
66378273987
200
01919723986
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


50244752915
200
41926498291
200
04703056909
200
02994832909
200
88131475972
200
06851590948
200
03242350979
200
85975508215
200
09207714914
200
03439926125
200
04942968950
200
07817930929
200
06274437967
200
04286050955
200
03508629902
200
06630833970
200
06203867977
200
51872102972
200
05823095902
200
07904288907
200
02952511900
200
00973596961
200
67535208991
200
03163910939
200
82226121153
200
02383908967
200
04746211469
200
62515624200
200
02280810921
200
01669078205
200
23821078120
200
88126412291
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


28994515291
200
70485119234
200
69235783291
200
88913724200
200
00356357201
200
72062584253
200
99803216287
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


69261776272
200
86487698253
200
92062920210
200
02110846119
200
75364387291
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


00049543270
200
03006658221
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


62861409215
200
70101710259
200
41939778204
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


90953045153
200
78823641268
200
24195294215
200
00890772290
200
82562997204
200
01918676240
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


35134780200
200
63143151234
200
68366272249
200
91685451268
200
65545052291
200
00852860285
200
02771606267
200
69244022249
200
28997719220
200
03207819176
200
51119765072
200
53392841072
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


00902957066
200
91998344053
200
75910322087
200
01389854302
200
01304025080
200
03988859079
200
68439270097
200
95068627020
200
88406202068
200
02428273906
200
04981109903
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


02362544990
200
05289086935
200
10496950975
200
26096854249
200
26200575827
200
07633237805
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


33003625802
200
36452356801
200
21482348896
200
34199155864
200
82753946868
200
18168616855
200
18333862802
200
30446705861
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


76731987549
200
12362973859
200
05681384810
200
36046463831
200
30903487802
200
39409087899
200
05256707507
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


07613928829
200
19510899879
200
10503623903
200
05874669876
200
79745512834
200
34652670877
200
37645173874
200
00632083182
200
03737940371
200
03710232546
200
00562286195
200


  df_all_drivers = pd.concat([df_all_drivers, final_df], axis=0)


--- 410.6389174461365 seconds ---


ModuleNotFoundError: No module named 'openpyxl'

In [26]:
df_all_drivers['total_exp_time'] = end_time
df_all_drivers.to_excel('drivers_lawsuits_details_300.xlsx', index=False)

df_all_drivers.head()

Unnamed: 0,matchRelatedPeople_name,matchRelatedPeople_personId,matchRelatedPeople_role_normalized,matchRelatedPeople_role_rawValue,node_cnjNumber_number,node_court_rawValue,node_distributionDate,node_distributionType,node_district,node_forum,...,segredo_justica,arquivado,classes,acessos,uf,criadoEm,partes,dt_search_lawsuit_details,status_op,total_exp_time
0,Rubens da Silva Fernandes,x447043544110984535:24,Autor,AUTOR(A) DO FATO,5006597-20.2021.8.13.0035,TJMG,1633316400000,Sorteio,Araguari - Juizado Especial,,...,False,False,[DIREITO PENAL / CRIMES PRATICADOS POR PARTICU...,[2021-10-05 03:59:08],MG,2021-10-05T03:59:08,"[[707289463, 8376968, AUTORIDADE POLICIAL, AUT...",2022-10-10 11:11:25.948199,,410.638917
0,Jose Ferreira VAZ,x6998503631636920967:17,Requerido,Requerido,1000852-05.2021.8.11.0029,TJMT,1623898800000,,Canarana,,...,False,False,[Crimes de Trânsito],[2021-06-19 17:41:03],MT,2021-06-19T17:40:54,"[[666655748, 3936951, A COLETIVIDADE, A COLETI...",2022-10-10 11:11:35.850641,,410.638917
0,Sabino Xavier dos Santos Junior,1464604427797647554:29,Réu,Polo Passivo,0005656-90.2022.8.16.0083,TJPR,1662692400000,,Francisco Beltrão,Vara Criminal,...,False,False,"[Citação, Apropriação indébita]",[2022-09-09 21:24:28],PR,2022-09-09T21:24:28,"[[888809049, 19094, Ministério Público Estadua...",2022-10-10 11:11:41.122333,,410.638917
1,Sabino Xavier dos Santos Junior,1464604427797647554:29,Réu,Polo Passivo,0005656-90.2022.8.16.0083,TJPR,1662692400000,,Francisco Beltrão,Vara Criminal,...,False,False,"[Citação, Apropriação indébita]",[2022-09-09 21:24:28],PR,2022-09-09T21:24:28,"[[888809049, 19094, Ministério Público Estadua...",2022-10-10 11:11:48.185278,,410.638917
2,Sabino Xavier dos Santos Junior,1464604427797647554:29,Réu,Polo Passivo,0005656-90.2022.8.16.0083,TJPR,1662692400000,,Francisco Beltrão,Vara Criminal,...,False,False,"[Citação, Apropriação indébita]",[2022-09-09 21:24:28],PR,2022-09-09T21:24:28,"[[888809049, 19094, Ministério Público Estadua...",2022-10-10 11:11:41.122333,,410.638917


In [None]:
df_escavador = pd.read_csv('data/results_escavador.csv')

In [None]:
df_escavador = df_escavador[['cod_cpf', 'result_escavador_manual']]

In [None]:
df_escavador['cod_cpf'] = df_escavador['cod_cpf'].astype(str).str.zfill(11)
df_escavador['result'] = df_escavador['result_escavador_manual'].apply(lambda x: 1 if x == 'CRIMINAL' else 0)

In [None]:
df_escavador.groupby('result').count()

In [None]:
cpfs_encontrados_jusbrasil = set(cpfs) - set(cpfs_not_found + cpfs_not_found_details)

In [None]:
## Casos encontrados pelo escavador, mas não encontrados pela Jusbrasil

In [None]:
df_escavador[(df_escavador['result']==1)].shape[0], len(cpfs_encontrados_jusbrasil)

In [None]:
df_escavador[(df_escavador['result']==1) & (df_escavador['cod_cpf'].isin(cpfs_encontrados_jusbrasil))]['cod_cpf']

In [None]:
df_escavador[(df_escavador['result']==0) & (df_escavador['cod_cpf'].isin(cpfs_encontrados_jusbrasil))]['cod_cpf']

In [None]:
df_escavador[(df_escavador['result']==1) & (~df_escavador['cod_cpf'].isin(cpfs_encontrados_jusbrasil))]['cod_cpf']

In [None]:
cpfs_nao_encontrados_jusbrasil = df_escavador[(df_escavador['result']==1) & (~df_escavador['cod_cpf'].isin(cpfs_encontrados_jusbrasil))]['cod_cpf'].tolist()

In [None]:
for cpf in cpfs_nao_encontrados_jusbrasil:
    parsed = background_check(cpf=cpf)
    if not parsed.get('data').get('root').get('crmLawsuitsByPersonId').get('edges'):
        print(f'{cpf} - nada encontrado.')