# ETL of legal codes.


In [1]:
from classifier import *
import os
from support import *
from text_filters import *

import requests
import json

In [2]:
codes_folder = "../ReadFiles/Codigos"
codes_path = os.path.abspath(codes_folder)
entries = os.listdir(codes_path)

codes = []

for entry in entries:

    entry_path = os.path.join(codes_folder, entry)
    with open(entry_path, 'r', encoding='utf-8') as f:
        text = f.readlines()
        
        f.close()
    
    for line in text:
        if line == '\n':
            text.remove(line)
        if line == ' \n':
            text.remove(line)
            
    codes.append(text)

removal = codes[1][2]
removal

for code in codes:
    for line in code:
        if line == removal:
            code.remove(line)
            
for code, entry in zip(codes,entries):
    print(entry, '|', code[0])
    print(code[1:3])
    print("\n-------------------------------\n")

col_codigo_civil.txt | CÓDIGO CIVIL

['TITULO \n', 'PRELIMINAR\n']

-------------------------------

col_codigo_contencioso_administrativo.txt | Código Contencioso Administrativo

['PARTE PRIMERA\n', 'LIBRO PRIMERO\n']

-------------------------------

col_codigo_del_menor.txt | Código del Menor

['PARTE\n', '   \n']

-------------------------------

col_codigo_de_comercio.txt | Código de Comercio

['TÍTULO PRELIMINAR\n', 'DISPOSICIONES GENERALES\n']

-------------------------------

col_codigo_de_construcciones_sismo_resistentes.txt | Código de Construcciones Sismo Resistentes

['TÍTULO I\n', 'OBJETO Y ALCANCE\n']

-------------------------------

col_codigo_de_la_infancia_y_la_adolescencia.txt | Código de la Infancia y la Adolescencia

['LIBRO I\n', 'LA PROTECCION INTEGRAL\n']

-------------------------------

col_codigo_de_minas.txt | Código de Minas

['TÍTULO PRIMERO\n', 'DISPOSICIONES GENERALES\n']

-------------------------------

col_codigo_de_procedimiento_administrativo_y_de_l

In [3]:
import csv

codesid_folder = "../ReadFiles/codes_id.csv"

codesid_path = os.path.abspath(codesid_folder)
with open(codesid_path, 'r', encoding='utf-8-sig') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    codes_id = {}
    for row in csv_reader:
        codes_id[row[0]] = row[1]

    csv_file.close()

# codes_id

In [4]:
hierarchy = {
    # Agregados
    'LIBRO': 'book',
    'PARTE': 'part',
    
    # Original
    'TITULO' : 'headline',
    'DISPOSICIONES' : 'headline',
    'CAPITULO' : 'chapter',
    
    # Agregado
    'SECCION': 'section',
    
    #Original
    'ARTICULO' : 'article',
    
    # Agregados
    'CONSIDERANDO' : 'article',
    'PREAMBULO' : 'article',
}
            


In [5]:
main_list = []
for entry, code in zip(entries, codes):
    print(entry, code[0])
    
    main_section = ""
    code_info = {'main' : None,
                  'n_sections': 0,
                  'sections': set(),
                 'last_section': None,
                 }

    sections_count = {
        'book': 0,
        'part' : 0,
        'headline': 0,
        'chapter': 0,
        'section': 0,
        'article' : 0
    }

    level = {
        'book': 6,
        'part' : 5,
        'headline': 4,
        'chapter': 3,
        'section': 2,
        'article' : 1
    }

    max_level = 0
    for line in code:
        hint = first_word(line).upper()
        hint = standarize(hint)
        if hint in hierarchy and hint != 'ARTICULO':
            reference = hierarchy[hint]
            code_info['last_section'] = reference

            if level[reference] > max_level:
                code_info['main'] = reference
                max_level = level[reference]

            # This will be done for sure
            code_info['n_sections'] += 1
            code_info['sections'].add(reference)

        if hint in hierarchy:
            reference = hierarchy[hint]
            sections_count[reference] += 1

            if reference == 'article':
                sections_count[reference] += 1
                
    main_list.append(code_info['main'])
    for key, value in code_info.items():
        print(key, ':', value)

    for key, value in sections_count.items():
        print(key, ':', value)

    print("\n-------------------------------\n")


col_codigo_civil.txt CÓDIGO CIVIL

main : book
n_sections : 247
sections : {'chapter', 'book', 'headline'}
last_section : headline
book : 4
part : 0
headline : 109
chapter : 134
section : 0
article : 5686

-------------------------------

col_codigo_contencioso_administrativo.txt Código Contencioso Administrativo

main : book
n_sections : 68
sections : {'headline', 'part', 'book', 'chapter', 'section'}
last_section : headline
book : 5
part : 2
headline : 29
chapter : 30
section : 2
article : 698

-------------------------------

col_codigo_del_menor.txt Código del Menor

main : part
n_sections : 56
sections : {'chapter', 'headline', 'part', 'section'}
last_section : headline
book : 0
part : 4
headline : 24
chapter : 23
section : 5
article : 706

-------------------------------

col_codigo_de_comercio.txt Código de Comercio

main : book
n_sections : 245
sections : {'headline', 'part', 'book', 'chapter', 'section'}
last_section : headline
book : 6
part : 1
headline : 65
chapter : 122
sec

In [6]:
# main_list

In [7]:
from alive_progress import alive_bar

In [8]:
for entry, code, main in zip(entries, codes, main_list):
    art_list = None
    embed_list = None
    
    code_id = codes_id[entry]
    print('loading', code[0])
    code.pop(0)
    text = code
    art_list = articles_info(code_id, text, debugging=False)
    
    print('File:', entry,'total articles = ', len(art_list))
    
    dot_text = split_text_in_lines(text, delimiter=".")
    print('File:', entry, 'Total elements:', len(dot_text), '\n----------------------')
    
    ndot_text = text_removals(dot_text)
    dcomma_text = split_text_in_lines(ndot_text, delimiter=";")
    print('File:', entry, 'Total elements:', len(dcomma_text), '\n----------------------')
    
    embed_list = articles_info(code_id, dcomma_text, debugging=False)
    
    for embed, article in zip(embed_list, art_list):
        article['embedding'] = embed['article']['content']
        
    levels = { 'book','part', 'headline', 'chapter', 'section', 'article' }
    json_list = format_articles(art_list, headers_dict=levels, debugging=False)
    
    dict_json = json.dumps(json_list, ensure_ascii=False)
    embedding_f = f'../ReadFiles/Embeddings/{code_id}-embedding.json'
    filepath = os.path.abspath(embedding_f)


    file = open(embedding_f, "w")
    file.write(dict_json)
    file.close()
    
    log_info = {'id': None,
            'status': None,
            'error': None,
            'message': None,
            }
#     with alive_bar(len(json_list), bar = 'filling') as bar:  # declare your expected total
        
    for article in json_list:
        es_article_url = f"http://localhost:9200/teat_all/_doc/{article['id']}"
        request_response = requests.put(es_article_url, json=article)
        log_info = add_to_log(log_info, request_response, article)

        if log_info['status'][-1] != 200:
            for key, value in log_info.items():
                print(key, ':', value)
#             bar()
                
    print('Finished the upload of', entry, '\n')

loading CÓDIGO CIVIL

File: col_codigo_civil.txt total articles =  2843
File: col_codigo_civil.txt Total elements: 19129 
----------------------
Total elements: 14039 
----------------------
File: col_codigo_civil.txt Total elements: 15016 
----------------------
Finished the upload of col_codigo_civil.txt 

loading Código Contencioso Administrativo

File: col_codigo_contencioso_administrativo.txt total articles =  349
File: col_codigo_contencioso_administrativo.txt Total elements: 3953 
----------------------
Total elements: 3048 
----------------------
File: col_codigo_contencioso_administrativo.txt Total elements: 2962 
----------------------
Finished the upload of col_codigo_contencioso_administrativo.txt 

loading Código del Menor

File: col_codigo_del_menor.txt total articles =  353
File: col_codigo_del_menor.txt Total elements: 2519 
----------------------
Total elements: 1780 
----------------------
File: col_codigo_del_menor.txt Total elements: 1798 
----------------------


IndexError: list index out of range

In [None]:
local_test = "http://localhost:9200/teat_all/_search"
query_test = {
    "query": {
        "simple_query_string": {
            "query": "Articulo"
        }
    }
}
query_test = requests.get(local_test, json=query_test)

result = json.loads(query_test.text)
print(result['hits']['total']['value'])
print(result['hits']['max_score'])
best_rated = result['hits']['hits'][0]
best_rated