In [428]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile

In [429]:
# Output directory for the per-theme clause files. makedirs(exist_ok=True)
# creates it when missing and is a no-op when it already exists, which avoids
# the separate "check, then create" step.
cba_path = os.path.join(".", "clause_data")
os.makedirs(cba_path, exist_ok=True)

# Input directory holding the contract text files, resolved against the
# current working directory.
file_path = os.path.join(os.getcwd(), 'cbas')

In [430]:
# Lookup table read from clause_groups.csv, indexed by its 'Clause Group' column.
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')

# Two dictionaries keyed by clause group: one giving the 'Translation' value,
# one giving the 'Theme' value.
translation_dict, theme_dict = (
    clause_groups[column].to_dict() for column in ('Translation', 'Theme')
)

# Distinct theme values, each converted to str, in order of first appearance.
themes = [str(theme_value) for theme_value in clause_groups['Theme'].unique()]

def extract_clauses(file_path, clause_type, translations=None, theme_lookup=None,
                    encoding=None):
    """Return the translated titles and texts of one contract's clauses of a given theme.

    The file is read line by line (each line stripped of surrounding
    whitespace) and is expected to contain two sections:

    * between a line containing ``<STARTofCLAUSES>`` and a line containing
      ``<ENDofCLAUSES>``: one clause per non-empty line; the part before the
      first ``|`` is the clause-group name;
    * after a line containing ``<STARTofTEXT>``: the clause texts. A line
      containing ``|`` ends the current clause (only the part before the first
      ``|`` is kept); lines without ``|`` are accumulated and joined with
      single spaces.

    The n-th clause of the first section is paired with the n-th text of the
    second section.

    Parameters
    ----------
    file_path : str
        Path of the contract file.
    clause_type : str
        Theme to keep; compared with ``==`` against the theme of each clause.
    translations : dict, optional
        Maps clause-group name -> translated title. Defaults to the
        notebook-level ``translation_dict``.
    theme_lookup : dict, optional
        Maps clause-group name -> theme. Defaults to the notebook-level
        ``theme_dict``.
    encoding : str, optional
        Encoding used to read the file. ``None`` (the default) keeps the
        platform default, as before. NOTE(review): the output files are
        written as UTF-8, so the input encoding should probably be pinned
        explicitly -- confirm the encoding of the source files.

    Returns
    -------
    (list, list)
        ``(titles, texts)`` of the clauses whose theme equals ``clause_type``,
        in file order.

    Raises
    ------
    KeyError
        If a clause-group name is missing from ``translations`` or
        ``theme_lookup``.
    IndexError
        If a matching clause has no corresponding text segment.
    """
    if translations is None:
        translations = translation_dict
    if theme_lookup is None:
        theme_lookup = theme_dict

    with io.open(file_path, 'r', encoding=encoding) as f:
        # single shared iterator: the text section is searched for only in
        # the lines that follow the clause section
        lines = (line.strip() for line in f)

        # --- clause section: which clause groups are present -------------
        clause_lines = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(clause_lines, "")  # skip the start-flag line itself
        clause_lines = takewhile(lambda line: '<ENDofCLAUSES>' not in line, clause_lines)

        titles = []
        clause_themes = []
        for line in clause_lines:
            if not line:
                continue
            group_name = line.split('|')[0]
            titles.append(translations[group_name])
            clause_themes.append(theme_lookup[group_name])

        # --- text section: one text per clause ---------------------------
        text_lines = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        next(text_lines, "")  # skip the start-flag line itself

        texts = []
        pending = []
        for line in text_lines:
            if '|' in line:
                # a pipe closes the current clause; anything after it is dropped
                pending.append(line.split('|')[0])
                texts.append(' '.join(pending))
                pending = []
            else:
                pending.append(line)
        if pending:
            # trailing lines that were never closed by a pipe
            texts.append(' '.join(pending))

    # positions of the clauses whose theme matches the requested one
    matching = [position for position, theme in enumerate(clause_themes)
                if theme == clause_type]

    titles_of_type = [titles[position] for position in matching]
    texts_of_type = [texts[position] for position in matching]
    return titles_of_type, texts_of_type

In [431]:
# spot-check: titles and texts of the 'Contract Agreement' clauses of one contract file
extract_clauses('cbas/2014_01_01__2014_081501.txt', 'Contract Agreement')

(['Non-compliance with Agreement', 'Renewal / Termination of the Agreement'],
 ['Em caso de descumprimento do presente acordo, a empresa pagará multa de um piso da categoria, que será revertido em favor do empregado prejudicado. ',
  'O presente acordo deverá ter uma via depositada no órgão regional do\xa0 Ministério do Trabalho, tendo validade pelo prazo de dois anos, a contar de 01/01/2014, podendo ser revogado ou prorrogado por outro acordo, conforme a conveniência das partes acordantes. E por estarem justas e acordadas as partes\xa0 firmam o presente acordo em 03 (três) vias de igual forma e teor para que produza os efeitos legais. '])

In [432]:
def output_all(file_path_x, files_x, clause_type, out_path=None,
               first_year=2008, last_year=2017):
    """Append one contract's clauses of a given theme to a pipe-delimited file.

    Each matching clause is written as one line ``contract_id|title|text``.
    Files whose name does not start with a year in
    ``first_year``..``last_year`` (inclusive) are skipped without any output.

    Parameters
    ----------
    file_path_x : str
        Directory containing the contract file.
    files_x : str
        Name of the contract file. Its first four characters are compared
        against the allowed years; the slice ``[-15:-4]`` is used as the
        contract identifier.
    clause_type : str
        Theme passed on to ``extract_clauses``.
    out_path : str, optional
        File to append to. Defaults to the notebook-level ``path_txt``, which
        must then have been set before the call.
    first_year, last_year : int, optional
        Inclusive range of accepted start years (defaults 2008 and 2017).

    Returns
    -------
    None
    """
    allowed_years = {str(year) for year in range(first_year, last_year + 1)}
    if files_x[0:4] not in allowed_years:
        return

    if out_path is None:
        # fall back to the output path set by the notebook's driver loop
        out_path = path_txt

    # NOTE: the identifier is not validated; names from which the slice does
    # not yield 11 characters are still written as-is.
    contract_id = files_x[-15:-4]
    titles, texts = extract_clauses(os.path.join(file_path_x, files_x), clause_type)

    # one output line per (title, text) pair of this contract
    with io.open(out_path, 'a', encoding='utf8') as f:
        for title, text in zip(titles, texts):
            f.write('|'.join((contract_id, str(title), str(text))) + "\n")

In [433]:
# The directory is listed once and sorted: os.listdir returns entries in
# arbitrary order, so sorting keeps the row order of the output files
# reproducible between runs.
contract_files = sorted(os.listdir(file_path))

for theme in themes:
    # e.g. 'Contract Agreement' -> 'contract_agreement'
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')

    # (re)creates the output file for this theme containing only the header row
    path_txt = os.path.join(cba_path, f"{file_name}_text.csv")
    with io.open(path_txt, 'w', encoding='utf8') as f:
        f.write('contract_id|title|text' + '\n')

    # output_all appends to the module-level path_txt set just above
    for contract_file in contract_files:
        output_all(file_path, contract_file, theme)

In [434]:
def _clean_clause_text(text):
    """Return a lower-cased copy of ``text`` with punctuation removed and spaces collapsed.

    Punctuation that separates words ('. ', ', ', ': ', '; ', ' - ', ' – ',
    ' / ') and non-breaking spaces become a single space so neighbouring
    words are not glued together; remaining periods, parentheses, double
    quotes and line-break characters are dropped.
    """
    # applied in this order: '. ' must be handled before the bare '.'
    replacements = (
        ('\r', ''),
        ('\n', ''),
        ('\xa0', ' '),
        ('. ', ' '),
        ('.', ''),
        (', ', ' '),
        (': ', ' '),
        (' - ', ' '),
        ('; ', ' '),
        (')', ''),
        ('(', ''),
        ('"', ''),
        (' – ', ' '),
        (' / ', ' '),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # collapse runs of spaces of any length (a single pass of
    # replace('  ', ' ') leaves doubles behind for runs of three or more)
    while '  ' in text:
        text = text.replace('  ', ' ')

    return text.strip().lower()


for theme in themes:
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
    csv_path = os.path.join(cba_path, f'{file_name}_text.csv')

    df = pd.read_csv(csv_path, sep='|')

    # The file is written back with the index as an unnamed first column.
    # Dropping such columns on read keeps re-runs of this cell from
    # accumulating 'Unnamed: 0', 'Unnamed: 0.1', ... columns.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed:')].copy()

    # empty text fields are read back as NaN; treat them as empty strings
    df['clean_text'] = df['text'].fillna('').astype(str).map(_clean_clause_text)

    df.to_csv(csv_path, sep='|')