In [59]:
import os

# sets the output directory (created if missing, reused if it already exists)
cba_path = os.path.join(".", "cleaned_cba_samples_small")
# exist_ok=True replaces the separate isdir() check, so there is no window
# between checking for the directory and creating it
os.makedirs(cba_path, exist_ok=True)

# sets the input directory; built with os.path.join like the output path
# instead of concatenating a hard-coded '/' separator
file_path = os.path.join(os.getcwd(), 'cba_samples_small')
# file_path = '/Users/calvineng/Dropbox/Calvin_Eng/cba_text_analysis/cba_txt_2009'

In [60]:
import pandas as pd

# Lookup tables for clause groups, loaded from clause_groups.csv with the
# name_pt column as index:
#   varname_dict - maps each name_pt entry to its varname
#   varnames     - the distinct varname values, converted to str
clause_groups = pd.read_csv('clause_groups.csv', index_col='name_pt')
varnames = [str(name) for name in clause_groups['varname'].unique()]
varname_dict = clause_groups['varname'].to_dict()

# printed so the loaded variable names can be checked by eye
print(varnames)

['cl_ace_ate_med', 'cl_ace_inf_emp', 'cl_ace_sin_loc_tra', 'cl_aco_aci_por_doe_pro', 'cl_ada_fun', 'cl_adi_hor_ext', 'cl_adi_ins', 'cl_adi_not', 'cl_adi_pen_tur', 'cl_adi_per', 'cl_adi_sob', 'cl_adi_tem_ser', 'cl_aju_cus', 'cl_apl_ins_col', 'cl_apo', 'cl_ass_mor', 'cl_ass_sex', 'cl_atr_fun_des_fun', 'cl_aut_tra_nos_dom_fer', 'cl_aux_ali', 'cl_aux_cre', 'cl_aux_doe_inv', 'cl_aux_edu', 'cl_aux_hab', 'cl_aux_mat', 'cl_aux_mor_fun', 'cl_aux_sau', 'cl_aux_tra', 'cl_ava_des', 'cl_avi_pre', 'cl_cam_edu_sob_sau', 'cl_cip_com_ele_atr_gar_aos_cip', 'cl_com', 'cl_com_fab', 'cl_com_jor', 'cl_con_amb_tra', 'cl_con_jor', 'cl_con_sin', 'cl_con_tem_par', 'cl_dec_ter_sal', 'cl_des_dem', 'cl_des_ins_col', 'cl_des_sal', 'cl_des_sem', 'cl_dir_opo_des_con_sin', 'cl_dur_con_fer', 'cl_dur_hor', 'cl_emp', 'cl_equ_pro_ind', 'cl_equ_seg', 'cl_est_abo', 'cl_est_aci_por_doe_pro', 'cl_est_ado', 'cl_est_apo', 'cl_est_apr', 'cl_est_ger', 'cl_est_mae', 'cl_est_pai', 'cl_est_por_doe_nao_pro', 'cl_est_ser_mil', 'cl_exa

In [61]:
from itertools import dropwhile, takewhile
import io
import re

# retrieves the type of document
def extract_document_type(file_path):
    """Classify a document from the title found between its title markers.

    The file is read as UTF-8. Every line after the first line containing
    '<STARTofTITLE>' and before the next line containing '<ENDofTITLE>' is
    stripped and concatenated (without separator) to form the title.

    Returns:
        (acordo, extrato): (1, 1), (0, 1), (1, 0) or (0, 0) depending on
        which known title phrase occurs in the title, or ('', '') when none
        of them does.
    """
    title_parts = []
    with io.open(file_path, 'r', encoding='utf8') as handle:
        inside_title = False
        for raw_line in handle:
            stripped = raw_line.strip()
            if not inside_title:
                # the marker line itself is never part of the title
                inside_title = '<STARTofTITLE>' in stripped
                continue
            if '<ENDofTITLE>' in stripped:
                break
            title_parts.append(stripped)
    title = ''.join(title_parts).strip()

    # checked in order; the first phrase found in the title wins
    known_titles = (
        ('Extrato Acordo Coletivo', (1, 1)),
        ('Extrato Convenção Coletiva', (0, 1)),
        ('Extrato Termo Aditivo de Acordo Coletivo', (1, 0)),
        ('Extrato Termo Aditivo de Convenção Coletiva', (0, 0)),
    )
    for phrase, flags in known_titles:
        if phrase in title:
            return flags
    return '', ''

# retrieves the validity
def extract_validity(file_path):
    """Read the validity flag from between the document's validity markers.

    The file is read as UTF-8. Lines after the first line containing
    '<STARTofVALIDITY>' and before the next line containing
    '<ENDofVALIDITY>' are stripped and concatenated without separator.

    Returns:
        1 if the section contains 'carimbo', 0 if it contains
        'semvalorlegal' (and not 'carimbo'), otherwise ''.
    """
    collected = []
    with io.open(file_path, 'r', encoding='utf8') as handle:
        stripped_lines = map(str.strip, handle)
        # consume everything up to and including the opening marker line
        for current in stripped_lines:
            if '<STARTofVALIDITY>' in current:
                break
        # gather the section body until the closing marker (or end of file)
        for current in stripped_lines:
            if '<ENDofVALIDITY>' in current:
                break
            collected.append(current)

    section = ''.join(collected).strip()
    if 'carimbo' in section:
        return 1
    if 'semvalorlegal' in section:
        return 0
    return ''

# extracts the types of clauses present
def extract_clause_names(file_path):
    """List the variable name of every clause in the document's clause section.

    The file is read as UTF-8. Each non-empty line after the first line
    containing '<STARTofCLAUSES>' and before the next line containing
    '<ENDofCLAUSES>' is treated as one clause; the part before the first '|'
    is its title, which is looked up in the module-level ``varname_dict``.

    Returns:
        list: one entry per clause line, in file order; '' for titles that
        are not keys of ``varname_dict``.

    Raises:
        NameError: if ``varname_dict`` has not been defined yet. The previous
        bare ``except`` hid this and returned '' for every clause.
    """
    clause_varnames = []
    with io.open(file_path, 'r', encoding='utf8') as f:
        lines = (line.strip() for line in f)
        clause_flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        # discard the marker line itself
        next(clause_flag_start, "")
        clause_flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, clause_flag_start)
        for line in clause_flag_end:
            if not line:
                continue
            title = line.split('|')[0]
            # unknown titles map to '' so positions stay aligned with the texts
            clause_varnames.append(varname_dict.get(title, ''))

    return clause_varnames

# extracts the text of clauses
def extract_clause_texts(file_path):
    """Split the document's text section into one string per clause.

    The file is read as UTF-8. Everything after the first line containing
    '<STARTofTEXT>' up to the end of the file is scanned; empty lines are
    skipped. A '|' marks a clause boundary: the text before it closes the
    current clause and the text after it opens the next one. Lines without
    '|' are appended to the current clause. The pieces of a clause are joined
    with single spaces and stripped.

    Every '|' on a line is honoured. Previously only the segment right after
    the first '|' was kept, so anything after a second '|' on the same line
    was silently dropped.

    Returns:
        list[str]: clause texts in file order ([] if the marker is absent or
        nothing follows it).
    """
    texts = []
    current = []
    with io.open(file_path, 'r', encoding='utf8') as f:
        lines = (line.strip() for line in f)
        text_flag_start = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        # discard the marker line itself
        next(text_flag_start, "")
        for line in text_flag_start:
            if not line:
                continue
            head, *following = line.split('|')
            # text before the first '|' still belongs to the clause in progress
            current.append(head)
            for segment in following:
                # each '|' closes the clause in progress and starts a new one
                texts.append(' '.join(current).strip())
                current = [segment]
        if current:
            texts.append(' '.join(current).strip())

    return texts

In [62]:
# cleans each clause in a document
def clean_text(text):
    """Strip enumeration markers and symbols from one clause's text.

    Applied in order: non-breaking spaces become spaces; 'PARÁGRAFO ...'
    headings followed by ':', '–' or '-' are removed (case-insensitive);
    a fixed set of symbols is replaced by spaces; 'CEEE D' is rewritten to
    'CEEE-D'; parenthesised groups, list markers such as 'a)', 'a.1)',
    '§ 1º -', 'Parágrafo 2° -' and '1º trimestre –' are removed; whitespace
    before punctuation is dropped and whitespace runs are collapsed to a
    single space.

    The result is not stripped, so it may start or end with one space.

    Args:
        text (str): raw clause text.

    Returns:
        str: the cleaned text.
    """
    text = text.replace('\xa0',' ')
    phrases = ['PARÁGRAFO ÚNICO', 'PARÁGRAFO PRIMEIRO', 'PARÁGRAFO SEGUNDO', 'PARÁGRAFO TERCEIRO', 
        'PARÁGRAFO QUARTO', 'PARÁGRAFO QUINTO', 'PARÁGRAFO SEXTO', 'PARÁGRAFO SÉTIMO',
        'PARÁGRAFO OITAVO', 'PARÁGRAFO NONO', 'PARÁGRAFO DÉCIMO']
    pattern = '|'.join(fr"{phrase}\s?[:–-]\s?" for phrase in phrases)
    text = re.sub(pattern, ' ', text, flags=re.IGNORECASE)
    # symbols that carry no meaning for the text analysis
    for symbol in ('|', 'R$', '%', "'", '"', '“', '”', '·'):
        text = text.replace(symbol, ' ')
    text = re.sub(r'CEEE D', 'CEEE-D', text, flags=re.IGNORECASE)
    # '. –', '. -', '.–', '.-'  (the optional \s covers the former
    # '\.\s[–-]' alternative, which matched a subset of the same strings)
    text = re.sub(r'\.\s?[–-]', '. ', text)
    text = re.sub(r'\(.*?\)', ' ', text) # (...)
    text = re.sub(r'[A-Z]\) [–-]', ' ', text, flags=re.IGNORECASE) # A) -
    text = re.sub(r'[A-Z]\)[–-]', ' ', text, flags=re.IGNORECASE) # A)-
    text = re.sub(r'[A-Z]\)', ' ', text, flags=re.IGNORECASE) # A)
    text = re.sub(r'[A-Z]\.\d+\)', ' ', text, flags=re.IGNORECASE) # a.1)
    text = re.sub(r'§ \d+º [–-]', ' ', text) # § 1º -
    text = text.replace('§', ' ')
    text = re.sub(r'parágrafo\s*?\d+[°º]\s*?[–-]', ' ', text, flags=re.IGNORECASE) # Parágrafo 2° - 
    text = re.sub(r'\d+[°º]\s*?trimestre\s*?[–-]', ' ', text, flags=re.IGNORECASE) # 1º trimestre –
    text = re.sub(r'\s+([.,:;?!])', r'\1', text) # spaces before punctuation
    # raw string: '\s' in a plain literal is an invalid escape sequence
    text = re.sub(r'\s{2,}', ' ', text) # unnecessary white spaces
    return text

In [63]:
import json

def output_all(file_path_x, files_x):
    """Clean one document and write its clauses as JSON to the output directory.

    The document type and validity are read first. Only documents for which
    extract_document_type returns (1, 1) and extract_validity returns 1 are
    exported; every other document is skipped without output.

    For exported documents the clause variable names and the cleaned clause
    texts are paired positionally (zip stops at the shorter list) and dumped
    as a JSON list of [varname, text] pairs to
    ``<cba_path>/<contract_id>_cleaned.txt``.

    Args:
        file_path_x (str): directory containing the document.
        files_x (str): file name of the document inside that directory.

    Returns:
        None
    """
    # contract identifier: the file name without its extension, whatever the
    # extension's length (the former [:-4] slice assumed exactly 4 characters)
    contract_id = os.path.splitext(files_x)[0]

    # extracts information from document
    document_path = os.path.join(file_path_x, files_x)
    acordo, extrato = extract_document_type(document_path)
    validity = extract_validity(document_path)

    if not (acordo == 1 and extrato == 1 and validity == 1):
        return

    clause_varnames = extract_clause_names(document_path)
    clause_texts = extract_clause_texts(document_path)
    clauses = [
        (varname, clean_text(text))
        for varname, text in zip(clause_varnames, clause_texts)
    ]
    file_destination = os.path.join(cba_path, contract_id + '_cleaned.txt')
    with open(file_destination, 'w', encoding='utf8') as f:
        # ensure_ascii=False keeps accented characters readable in the output
        json.dump(clauses, f, ensure_ascii=False)

In [64]:
from tqdm import tqdm

# Runs output_all on every entry of the input directory. A failure in one
# file is printed and the loop moves on to the next file.
for file in tqdm(os.listdir(file_path)):
    # OS-generated metadata files are not documents
    if file in ('.DS_Store', 'Desktop.ini'):
        continue

    try:
        output_all(file_path, file)
    except Exception as error:
        print(f"Error processing file: {file}")
        print(error)

100%|██████████| 15/15 [00:00<00:00, 507.79it/s]
