# ETL of legal codes.


In [1]:
from classifier import *
import os
from support import *
from text_filters import *

import requests
import json

In [2]:
codes_folder = "../ReadFiles/Codigos"
codes_path = os.path.abspath(codes_folder)
entries = os.listdir(codes_path)

codes = []

for entry in entries:

    entry_path = os.path.join(codes_folder, entry)
    with open(entry_path, 'r', encoding='utf-8') as f:
        text = f.readlines()
        
        f.close()
    
    for line in text:
        if line == '\n':
            text.remove(line)
        if line == ' \n':
            text.remove(line)
            
    codes.append(text)

removal = codes[1][2]
removal

for code in codes:
    for line in code:
        if line == removal:
            code.remove(line)
            
for code, entry in zip(codes,entries):
    print(entry, '|', code[0])
    print(code[1:3])
    print("\n-------------------------------\n")

col_codigo_civil.txt | CÓDIGO CIVIL

['TITULO \n', 'PRELIMINAR\n']

-------------------------------

col_codigo_contencioso_administrativo.txt | Código Contencioso Administrativo

['PARTE PRIMERA\n', 'LIBRO PRIMERO\n']

-------------------------------

col_codigo_del_menor.txt | Código del Menor

['TÍTULO PRELIMINAR\n', 'Principios generales\n']

-------------------------------

col_codigo_de_comercio.txt | Código de Comercio

['TÍTULO PRELIMINAR\n', 'DISPOSICIONES GENERALES\n']

-------------------------------

col_codigo_de_construcciones_sismo_resistentes.txt | Código de Construcciones Sismo Resistentes

['TÍTULO I\n', 'OBJETO Y ALCANCE\n']

-------------------------------

col_codigo_de_la_infancia_y_la_adolescencia.txt | Código de la Infancia y la Adolescencia

['LIBRO I\n', 'LA PROTECCION INTEGRAL\n']

-------------------------------

col_codigo_de_minas.txt | Código de Minas

['TÍTULO PRIMERO\n', 'DISPOSICIONES GENERALES\n']

-------------------------------

col_codigo_de_proced

## ID of legal sources

The ID of every legal source (codes, statutes, norms) are storage on the csv file "codes_id". The ID were made "by hand" in order to improve the human reading.

In [3]:
import csv

codesid_folder = "../ReadFiles/codes_id.csv"

codesid_path = os.path.abspath(codesid_folder)
with open(codesid_path, 'r', encoding='utf-8-sig') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    codes_id = {}
    for row in csv_reader:
        codes_id[row[0]] = row[1]

    csv_file.close()

# codes_id

## New hierarchy

This hierarchy was made according to observed on the research 

In [4]:
hierarchy = {
    # Agregados
    'LIBRO': 'book',
    'PARTE': 'part',
    
    # Original
    'TITULO' : 'headline',
    'DISPOSICIONES' : 'headline',
    'CAPITULO' : 'chapter',
    
    # Agregado
    'SECCION': 'section',
    
    #Original
    'ARTICULO' : 'article',
    
    # Agregados
    'CONSIDERANDO' : 'article',
    'PREAMBULO' : 'article',
}
            


In [5]:
main_list = []
for entry, code in zip(entries, codes):
    print(entry, code[0])
    
    main_section = ""
    code_info = {'main' : None,
                  'n_sections': 0,
                  'sections': set(),
                 'last_section': None,
                 }

    sections_count = {
        'book': 0,
        'part' : 0,
        'headline': 0,
        'chapter': 0,
        'section': 0,
        'article' : 0
    }

    level = {
        'book': 6,
        'part' : 5,
        'headline': 4,
        'chapter': 3,
        'section': 2,
        'article' : 1
    }

    max_level = 0
    for line in code:
        hint = first_word(line).upper()
        hint = standarize(hint)
        if hint in hierarchy and hint != 'ARTICULO':
            reference = hierarchy[hint]
            code_info['last_section'] = reference

            if level[reference] > max_level:
                code_info['main'] = reference
                max_level = level[reference]

            # This will be done for sure
            code_info['n_sections'] += 1
            code_info['sections'].add(reference)

        if hint in hierarchy:
            reference = hierarchy[hint]
            sections_count[reference] += 1

            if reference == 'article':
                sections_count[reference] += 1
                
    main_list.append(code_info['main'])
    for key, value in code_info.items():
        print(key, ':', value)

    for key, value in sections_count.items():
        print(key, ':', value)

    print("\n-------------------------------\n")


col_codigo_civil.txt CÓDIGO CIVIL

main : book
n_sections : 247
sections : {'chapter', 'book', 'headline'}
last_section : headline
book : 4
part : 0
headline : 109
chapter : 134
section : 0
article : 5686

-------------------------------

col_codigo_contencioso_administrativo.txt Código Contencioso Administrativo

main : book
n_sections : 68
sections : {'book', 'section', 'part', 'headline', 'chapter'}
last_section : headline
book : 5
part : 2
headline : 29
chapter : 30
section : 2
article : 698

-------------------------------

col_codigo_del_menor.txt Código del Menor

main : part
n_sections : 55
sections : {'section', 'chapter', 'part', 'headline'}
last_section : headline
book : 0
part : 3
headline : 24
chapter : 23
section : 5
article : 706

-------------------------------

col_codigo_de_comercio.txt Código de Comercio

main : book
n_sections : 245
sections : {'book', 'section', 'part', 'headline', 'chapter'}
last_section : headline
book : 6
part : 1
headline : 65
chapter : 122
sec

In [6]:
# main_list

In [7]:
# from alive_progress import alive_bar

In [8]:
c = 1
total = len(codes)


for entry, code, main in zip(entries, codes, main_list):
   
    code_id = codes_id[entry]
    code_info = {
        'id': code_id,
        'source_name': code[0],
    }
    print(f'LOADING: {c}/{total}', code[0])
    code.pop(0)
    text = code
    art_list = articles_info(code_info, text, debugging=False)
    
    print('File:', entry,'total lines = ', len(art_list))
    
    dot_text = split_text_in_lines(text, delimiter=".")
    print('File:', entry, 'Total elements by dot:', len(dot_text), '\n----------------------')
    
    ndot_text = text_removals(dot_text)
    dcomma_text = split_text_in_lines(ndot_text, delimiter=";")
    print('File:', entry, 'Total elements by dot-comma:', len(dcomma_text), '\n----------------------')
    
    dcomma_text = split_text_in_lines(ndot_text, delimiter="-")
    print('File:', entry, 'Total elements by dot-comma:', len(dcomma_text), '\n----------------------')
    
    dcomma_text = split_text_in_lines(ndot_text, delimiter="°")
    print('File:', entry, 'Total elements by dot-comma:', len(dcomma_text), '\n----------------------')
    
    embed_list = articles_info(code_info, dcomma_text, debugging=False)
    
    for embed, article in zip(embed_list, art_list):
        article['dot_comma_sep'] = embed['article']['content']
        
    levels = { 'book','part', 'headline', 'chapter', 'section', 'article' }
    json_list = format_articles(art_list, headers_dict=levels, debugging=False)
    
    dict_json = json.dumps(json_list, ensure_ascii=False)
    embedding_f = f'../ReadFiles/Embeddings/{code_id}-embedding.json'
    filepath = os.path.abspath(embedding_f)


    file = open(embedding_f, "w")
    file.write(dict_json)
    file.close()
    
    

    print(' * - * * - * * - * * - * * - * * - * * - * * - * ')
    c = c + 1

LOADING: 1/45 CÓDIGO CIVIL

File: col_codigo_civil.txt total lines =  2843
File: col_codigo_civil.txt Total elements by dot: 19129 
----------------------
Total elements: 14039 
----------------------
File: col_codigo_civil.txt Total elements by dot-comma: 15016 
----------------------
File: col_codigo_civil.txt Total elements by dot-comma: 14165 
----------------------
File: col_codigo_civil.txt Total elements by dot-comma: 14038 
----------------------
 * - * * - * * - * * - * * - * * - * * - * * - * 
LOADING: 2/45 Código Contencioso Administrativo

File: col_codigo_contencioso_administrativo.txt total lines =  349
File: col_codigo_contencioso_administrativo.txt Total elements by dot: 3953 
----------------------
Total elements: 3048 
----------------------
File: col_codigo_contencioso_administrativo.txt Total elements by dot-comma: 2962 
----------------------
File: col_codigo_contencioso_administrativo.txt Total elements by dot-comma: 2911 
----------------------
File: col_codigo_c

# Just to Load Files to ElasticSearch

In [9]:
codes_folder = "../ReadFiles/Embeddings"
codes_path = os.path.abspath(codes_folder)
toLoad_entries = os.listdir(codes_path)
e = 0
total = len(toLoad_entries)
for entry in toLoad_entries:
    log_info = {'id': None,
            'status': None,
            'error': None,
            'message': None,
            }
    e += 1
    entry_path = os.path.join(codes_folder, entry)
    
    with open(entry_path, 'r') as json_file:
        data = json.load(json_file)
        
        print(f'Uploading to Elastic Search {e}/{total}:', entry, '\n')
        for article in data:
            es_article_url = f"http://localhost:9200/teat_all/_doc/{article['id']}"
            request_response = requests.put(es_article_url, json=article)
            log_info = add_to_log(log_info, request_response, article)

            if log_info['status'][-1] != 200:
                for key, value in log_info.items():
                    print(key, ':', value)
        #             bar()

        print('Finished the upload of', entry, '\n')
        print(' * - * - * - * - * - * - * - * - * - * ')
        json_file.close()

Uploading to Elastic Search 1/45: 336transporte-embedding.json 

Finished the upload of 336transporte-embedding.json 

 * - * - * - * - * - * - * - * - * - * 
Uploading to Elastic Search 2/45: abogado-embedding.json 

Finished the upload of abogado-embedding.json 

 * - * - * - * - * - * - * - * - * - * 
Uploading to Elastic Search 3/45: ambiente-embedding.json 

Finished the upload of ambiente-embedding.json 

 * - * - * - * - * - * - * - * - * - * 
Uploading to Elastic Search 4/45: animales-embedding.json 

Finished the upload of animales-embedding.json 

 * - * - * - * - * - * - * - * - * - * 
Uploading to Elastic Search 5/45: antitramites-embedding.json 

Finished the upload of antitramites-embedding.json 

 * - * - * - * - * - * - * - * - * - * 
Uploading to Elastic Search 6/45: carad-embedding.json 

Finished the upload of carad-embedding.json 

 * - * - * - * - * - * - * - * - * - * 
Uploading to Elastic Search 7/45: carcelario-embedding.json 

Finished the upload of carcelario-

## Test of load in elastic search

In [10]:
local_test = "http://localhost:9200/teat_all/_search"
query_test = {
    "query": {
        "simple_query_string": {
            "query": "adolescencia"
        }
    }
}
query_test = requests.get(local_test, json=query_test)



In [11]:
result = json.loads(query_test.text)
print(result['hits']['total']['value'])
print(result['hits']['max_score'])
best_rated = result['hits']['hits'][1]
best_rated

259
24.824512


{'_index': 'teat_all',
 '_type': '_doc',
 '_id': 'inad01000502000220',
 '_score': 23.766186,
 '_source': {'index': 'inad',
  'legal_source': 'Código de la Infancia y la Adolescencia\n',
  'id': 'inad01000502000220',
  'book': {'title': 'LIBRO III\n',
   'name': 'SISTEMA NACIONAL DE BIENESTAR FAMILIAR, POLITICAS PUBLICAS E INSPECCION, VIGILANCIA Y CONTROL\n'},
  'section': {'title': None, 'name': None},
  'part': {'title': None, 'name': None},
  'headline': {'title': 'Disposiciones finales\n',
   'name': 'SISTEMA DE RESPONSABILIDAD PENAL PARA ADOLESCENTES Y OTRAS DISPOSICIONES\n'},
  'chapter': {'title': 'CAPITULO III\n',
   'name': 'Inspección, vigilancia y control\n'},
  'article': {'name': 'Artículo 203.',
   'content': ['Artículo 203. Principios rectores de las políticas públicas. Las políticas públicas de infancia, adolescencia y familia como políticas de Estado se regirán como mínimo por los siguientes principios:\n',
    '1. El interés superior del niño, niña o adolescente.\n',
 