In [69]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile

In [70]:
# sets the output directory
cba_path = os.path.join(".", "output")
if not os.path.isdir(cba_path):
    os.mkdir(cba_path)

# sets the input directory
file_path = os.getcwd() + '/cbas'

In [71]:
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')
translation_dict = clause_groups['Translation'].to_dict()
themes = list(map(str, clause_groups['Theme'].unique()))
theme_dict = clause_groups['Theme'].to_dict()

def extract_clauses(file_path):
    with io.open(file_path, 'r') as f:
        # removes white space from the ends of lines
        lines = (line.strip() for line in f)  
    
        # extracts the types of clauses present
        clause_flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(clause_flag_start,"")
        clause_flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, clause_flag_start)
        themes = []
        for line in clause_flag_end:
            if not line: 
                continue  
            title = line.split('|')[0]
            theme = theme_dict[title]
            themes.append(theme)

        # extracts the text of clauses
        text_flag_start = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        next(text_flag_start, "")
        texts = []
        text = []
        for line in text_flag_start:
            if '|' in line: 
                texts.append((' ').join(text))
                text = []
            else:
                text.append(line)
        if text:
            texts.append((' ').join(text))

        return themes, texts

In [72]:
extract_clauses('cbas/2014_01_01__2014_081501.txt')

(['Contract Agreement', 'Contract Agreement', 'Other', 'Other'],
 ['Em caso de descumprimento do presente acordo, a empresa pagará multa de um piso da categoria, que será revertido em favor do empregado prejudicado.',
  'O presente acordo deverá ter uma via depositada no órgão regional do\xa0 Ministério do Trabalho, tendo validade pelo prazo de dois anos, a contar de 01/01/2014, podendo ser revogado ou prorrogado por outro acordo, conforme a conveniência das partes acordantes. E por estarem justas e acordadas as partes\xa0 firmam o presente acordo em 03 (três) vias de igual forma e teor para que produza os efeitos legais.',
  'Fica acordado que a partir de 01/01/2014 os empregados poderão trabalhar no dia de domingo que estiver de folga, mediante o pagamento de SERVIÇOS EXTRAS no valor mínimo de R$ 70,00 (setenta reais) pela jornada máxima de sete horas. ',
  'A empresa concederá uma folga semanal a\xa0 todos os seus empregados, nos dias da semana compreendida de segunda-feira a sábado

In [73]:
def output_all(file_path_x, files_x):
    # only consider files with start dates 2008-2017
    #if files_x[0:4]=='2008':
    if files_x[0:4]=='2008' or files_x[0:4]=='2009' or files_x[0:4]=='2010' or files_x[0:4]=='2011' or files_x[0:4]=='2012' \
    or files_x[0:4]=='2013' or files_x[0:4]=='2014' or files_x[0:4]=='2015' or files_x[0:4]=='2016' or files_x[0:4]=='2017':
        # contract identifier
        contract_id = [files_x[-15:-4]]
        if len(files_x[-15:-4])!=11:
            pass
        text = extract_clauses(os.path.join(file_path_x, files_x))
        output = contract_id + [text]
        pair_line = ('|').join(str(x) for x in output)
        # save info for contract as a single new line
        with io.open(path_txt,'a',encoding='utf8') as f:
            f.write(pair_line + "\n")

In [74]:
# rewrites output file
path_txt = os.path.join(cba_path, "clause_text.txt")
with io.open(path_txt,'w',encoding='utf8') as f:
    header = 'contract_id|text'
    f.write(header + '\n')
    
# loops over each contract
for idx, files in enumerate(os.listdir(file_path)):
    print("Looping through file ", files)
    output_all(file_path, files)

Looping through file  2016_09_01__2016_082054.txt
Looping through file  2011_11_01__2012_002993.txt
Looping through file  2014_01_01__2014_081501.txt
Looping through file  2017_12_01__2017_084835.txt
Looping through file  2017_12_01__2017_084809.txt
Looping through file  2013_11_15__2013_055346.txt
Looping through file  2009_01_01__2009_016497.txt
Looping through file  2015_06_16__2015_060659.txt
Looping through file  2018_05_01__2018_044118.txt
Looping through file  2012_05_01__2012_042451.txt
Looping through file  2011_11_01__2012_002943.txt
Looping through file  2016_09_01__2016_082084.txt
Looping through file  2013_11_14__2014_009174.txt
Looping through file  2009_01_01__2009_016731.txt
Looping through file  2015_05_01__2015_043073.txt
Looping through file  2011_11_01__2012_003082.txt
Looping through file  2015_12_16__2015_084042.txt
Looping through file  2017_12_01__2017_084934.txt
Looping through file  2013_06_01__2013_073146.txt
Looping through file  2017_03_01__2017_039221.txt
