In [187]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

In [188]:
# sets the output directory; exist_ok makes re-running this cell a no-op
# instead of relying on a separate "does it exist?" check
cba_path = os.path.join(".", "clause_data")
os.makedirs(cba_path, exist_ok=True)

# sets the input directory (the folder of contract files), resolved against
# the current working directory with the platform's own path separator
file_path = os.path.join(os.getcwd(), 'cbas')

In [189]:
# Lookup tables built from clause_groups.csv, keyed by its 'Clause Group' column.
clause_groups = pd.read_csv(
    'clause_groups.csv',
    index_col='Clause Group',
)
theme_dict = clause_groups['Theme'].to_dict()              # clause group -> 'Theme' value
translation_dict = clause_groups['Translation'].to_dict()  # clause group -> 'Translation' value
themes = [str(value) for value in clause_groups['Theme'].unique()]  # distinct themes, as strings

def extract_clauses(file_path, clause_type, encoding='utf8'):
    """Return the titles and texts of one contract's clauses that belong to a theme.

    The file is read in a single forward pass and is expected to contain, in
    this order:

    * a clause list between a line containing ``<STARTofCLAUSES>`` and a line
      containing ``<ENDofCLAUSES>``; on every non-empty line the part before
      the first ``|`` must be a key of ``translation_dict`` and ``theme_dict``;
    * the clause texts after a line containing ``<STARTofTEXT>``; a line that
      contains ``|`` closes the current clause (only the part before the first
      ``|`` is kept) and the lines of one clause are joined with single spaces.

    Clause-list entries and texts are paired by position.

    Parameters
    ----------
    file_path : str
        Path of the contract file.
    clause_type : str
        Theme to keep, compared with ``==`` against the ``theme_dict`` values.
    encoding : str, default 'utf8'
        Encoding used to decode the file. It is explicit so that the accented
        text is decoded the same way on every platform instead of following
        the locale default; pass another value for files saved differently.

    Returns
    -------
    tuple of (list, list)
        ``(titles, texts)`` for the clauses whose theme equals ``clause_type``;
        titles are the ``translation_dict`` values.

    Raises
    ------
    KeyError
        If a clause-list entry is not a key of the lookup dictionaries.
    IndexError
        If a selected clause-list entry has no text at the same position.
    """
    with io.open(file_path, 'r', encoding=encoding) as f:
        # removes white space from the ends of lines; this is one lazy iterator,
        # so each stage below resumes exactly where the previous one stopped
        lines = (line.strip() for line in f)

        # extracts the types of clauses present
        clause_flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(clause_flag_start, "")  # discards the start marker itself
        clause_flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, clause_flag_start)
        clause_themes = []  # not called `themes`, to avoid shadowing the notebook-level list
        titles = []
        for line in clause_flag_end:
            if not line:
                continue
            title = line.split('|')[0]
            titles.append(translation_dict[title])
            clause_themes.append(theme_dict[title])

        # extracts the text of clauses
        text_flag_start = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        next(text_flag_start, "")  # discards the start marker itself
        texts = []
        text = []
        for line in text_flag_start:
            if '|' in line:
                # the delimiter line ends the clause; whatever follows '|' is dropped
                text.append(line.split('|')[0])
                texts.append(' '.join(text))
                text = []
            else:
                text.append(line)
        if text:
            # trailing clause that was never closed by a delimiter line
            texts.append(' '.join(text))

        # retains clauses of proper type
        indices_of_type = [i for i, theme in enumerate(clause_themes) if theme == clause_type]
        titles_of_type = [titles[i] for i in indices_of_type]
        texts_of_type = [texts[i] for i in indices_of_type]

        return titles_of_type, texts_of_type

In [190]:
# spot check: titles and texts of the 'Contract Agreement' clauses found in one sample contract
extract_clauses('cbas/2014_01_01__2014_081501.txt', 'Contract Agreement')

(['Non-compliance with Agreement', 'Renewal / Termination of the Agreement'],
 ['Em caso de descumprimento do presente acordo, a empresa pagará multa de um piso da categoria, que será revertido em favor do empregado prejudicado. ',
  'O presente acordo deverá ter uma via depositada no órgão regional do\xa0 Ministério do Trabalho, tendo validade pelo prazo de dois anos, a contar de 01/01/2014, podendo ser revogado ou prorrogado por outro acordo, conforme a conveniência das partes acordantes. E por estarem justas e acordadas as partes\xa0 firmam o presente acordo em 03 (três) vias de igual forma e teor para que produza os efeitos legais. '])

In [191]:
def output_all(file_path_x, files_x, clause_type, out_path=None):
    """Append one contract's clauses of a given theme to a pipe-delimited file.

    Files whose name does not start with a four-digit year between 2008 and
    2017 (inclusive) are skipped without touching the output file.

    Parameters
    ----------
    file_path_x : str
        Directory that contains the contract file.
    files_x : str
        File name of the contract. The contract identifier is taken from
        ``files_x[-15:-4]``, i.e. the 11 characters in front of a 4-character
        extension; shorter names yield a shorter identifier and are not
        validated.
    clause_type : str
        Theme passed on to ``extract_clauses``.
    out_path : str, optional
        File to append to. When omitted, the notebook-level ``path_txt`` is
        used, which is how existing callers select the target file.

    Returns
    -------
    None
        One ``contract_id|title|text`` line is appended per extracted clause.
    """
    # only considers files with start dates 2008-2017
    year = files_x[0:4]
    if not (year.isdigit() and 2008 <= int(year) <= 2017):
        return

    if out_path is None:
        # legacy behaviour: the target file is the global set by the calling cell
        out_path = path_txt

    # contract identifier
    contract_id = files_x[-15:-4]
    titles, texts = extract_clauses(os.path.join(file_path_x, files_x), clause_type)

    # saves each clause of the contract as its own line
    with io.open(out_path, 'a', encoding='utf8') as f:
        for title, text in zip(titles, texts):
            f.write('|'.join([contract_id, str(title), str(text)]) + '\n')

In [192]:
for theme in themes:
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')

    # rewrites output file: truncates whatever a previous run left and writes the header row
    path_txt = os.path.join(cba_path, f"{file_name}_text.csv")
    with io.open(path_txt, 'w', encoding='utf8') as f:
        header = 'contract_id|title|text'
        f.write(header + '\n')

    # loops over each contract; output_all appends to the file named by the
    # global `path_txt` set just above. sorted() is used because os.listdir
    # returns entries in arbitrary order, which would make the row order of
    # the output differ between runs and machines
    for files in sorted(os.listdir(file_path)):
        output_all(file_path, files, theme)

In [193]:
# nltk.download('stopwords')
# nltk.download('punkt')

# stop_words = set(stopwords.words('portuguese'))
# stemmer = SnowballStemmer('portuguese')
# translator = str.maketrans('', '', string.punctuation)

# def clean_text(text):
#     tokens = word_tokenize(text, language='portuguese')
#     tokens = [word for word in tokens if word.lower() not in stop_words]
#     tokens = [stemmer.stem(word) for word in tokens]
#     tokens = [word.translate(translator) for word in tokens]
#     tokens = [word for word in tokens if word.isalpha()]
#     cleaned_text = ' '.join(tokens).lower()
#     return cleaned_text

# for theme in themes:
#     file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
#     df = pd.read_csv(f'clause_data/{file_name}_text.csv', sep='|')
#     df['clean_text'] = df['text'].apply(clean_text)
#     df.to_csv(f'clause_data/{file_name}_text.csv', sep='|', index=False)

# loads the 'pt_core_news_sm' spaCy pipeline once; preprocess_text below reuses it for every clause
nlp = spacy.load('pt_core_news_sm')
# stop-word set from the pipeline's language defaults; the visible cells filter
# with token.is_stop instead and do not reference this name
stop_words = nlp.Defaults.stop_words

def preprocess_text(text):
    """Normalise one clause text into a space-separated string of lemmas.

    The text is run through the notebook-level ``nlp`` pipeline; tokens flagged
    as stop words or that are not purely alphabetic are dropped, and the
    lemma of every remaining token is lower-cased.

    Parameters
    ----------
    text : str
        Raw clause text. Any non-string value (for example the NaN that
        pandas produces for an empty CSV field) is treated as an empty text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; ``''`` when nothing is kept.
    """
    # an empty field read back from CSV arrives as NaN (a float), which is not
    # valid input for the pipeline, so it is mapped to an empty document here
    if not isinstance(text, str):
        return ''
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

for theme in themes:
    # reads file as csv, keeping only the extracted columns: the file is
    # overwritten below with TF-IDF columns added, so reading everything back
    # on a re-run of this cell would stack a second copy of those columns
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
    path_csv = os.path.join(cba_path, f'{file_name}_text.csv')
    df = pd.read_csv(path_csv, sep='|', usecols=['contract_id', 'title', 'text'])

    # a theme without any extracted clause has nothing to vectorise
    # (TfidfVectorizer raises on an empty corpus), so its file is left as is
    if df.empty:
        continue

    # cleans the text and calculates TFIDF; empty texts are read back as NaN,
    # so they are replaced by '' before being handed to the pipeline
    df['clean_text'] = df['text'].fillna('').apply(preprocess_text)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['clean_text'])
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,  # keeps the row alignment of the concat explicit
    )
    df = pd.concat([df, tfidf_df], axis=1)

    # saves to csv
    df.to_csv(path_csv, sep='|', index=False)


In [194]:
for theme in themes:
    # reads file as csv
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
    df = pd.read_csv(os.path.join(cba_path, f'{file_name}_text.csv'), sep='|')

    # selects 20 tokens with highest mean TFIDF; the identifier and text
    # columns are excluded explicitly so only token columns are averaged
    tfidf_cols = [
        col for col in df.columns
        if col not in ['contract_id', 'title', 'text', 'clean_text']
    ]
    tfidf_means = df[tfidf_cols].select_dtypes(include=['float64']).mean()
    # NOTE: despite its name this holds 20 entries; the name is kept in case
    # cells outside this view refer to it
    top_ten = tfidf_means.nlargest(20)

    # prints the tokens
    print(f'Theme: {theme}')
    print(top_ten)

Theme: Wages
salário         0.068194
hora            0.060706
ser             0.057867
empregado       0.057133
dia             0.050825
pagamento       0.046515
empresa         0.045579
salarial        0.044186
parágrafo       0.039754
trabalho        0.038318
real            0.029523
pagar           0.028332
cláusula        0.026734
adiantamento    0.026616
ficar           0.025908
compensação     0.024712
reajuste        0.023766
acordo          0.023715
piso            0.022981
desconto        0.021866
dtype: float64
Theme: Health
médico          0.098278
empregado       0.082917
empresa         0.077869
ser             0.066750
exame           0.058755
odontológico    0.058425
atestado        0.057703
convênio        0.056864
profissional    0.050989
parágrafo       0.045900
caso            0.043953
serviço         0.041275
trabalho        0.040279
empregador      0.038901
sindicato       0.037419
saúde           0.035763
dia             0.034953
plano           0.034509
reconhec