In [67]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [68]:
# Output directory for the per-theme clause files.
cba_path = os.path.join(".", "clause_data")
# exist_ok=True replaces the isdir()/mkdir() pair: no check-then-create race,
# and re-running the cell is a no-op.
os.makedirs(cba_path, exist_ok=True)

# Input directory holding the contract text files; built with os.path.join
# rather than by concatenating a '/'-separated suffix.
file_path = os.path.join(os.getcwd(), 'cbas')

In [69]:
# Lookup table of clause groups, indexed by its 'Clause Group' column.
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')

# Clause group -> value of its 'Theme' column.
theme_dict = clause_groups['Theme'].to_dict()
# Clause group -> value of its 'Translation' column.
translation_dict = clause_groups['Translation'].to_dict()
# Distinct values of the 'Theme' column, each coerced to str.
themes = [str(theme) for theme in clause_groups['Theme'].unique()]

def extract_clauses(file_path, clause_type):
    """Return the titles and texts of one contract's clauses that belong to a theme.

    The file is read line by line with surrounding whitespace stripped.
    Lines between the ``<STARTofCLAUSES>`` and ``<ENDofCLAUSES>`` markers list
    the clauses, one per non-empty line, with the clause group before the
    first ``|``. Lines after the ``<STARTofTEXT>`` marker hold the clause
    texts: a line containing ``|`` ends the current text (only the part before
    the first ``|`` is kept) and consecutive lines are joined with a space.
    Clause listing and texts are matched by position.

    Parameters
    ----------
    file_path : str
        Path of the contract text file. It is decoded as UTF-8, the same
        encoding the output files are written with, instead of the
        platform-dependent default.
    clause_type : str
        Theme to keep; compared for equality with the values of the
        module-level ``theme_dict``.

    Returns
    -------
    tuple of (list, list)
        ``(titles, texts)`` for the clauses whose theme equals ``clause_type``.
        Titles are the ``translation_dict`` values of the clause groups.

    Raises
    ------
    KeyError
        If a listed clause group is missing from ``translation_dict`` or
        ``theme_dict``.
    IndexError
        If a clause of the requested theme has no text at the same position.
    """
    with io.open(file_path, 'r', encoding='utf8') as f:
        # Lazy generator: it must be fully consumed while the file is open,
        # which is why the whole function body lives inside the `with`.
        lines = (line.strip() for line in f)

        # --- clause listing -------------------------------------------------
        listing = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(listing, "")  # discard the start-marker line itself
        listing = takewhile(lambda line: '<ENDofCLAUSES>' not in line, listing)

        titles = []
        # Named clause_themes so it does not shadow the module-level `themes`.
        clause_themes = []
        for line in listing:
            if not line:
                continue
            group = line.split('|')[0]
            titles.append(translation_dict[group])
            clause_themes.append(theme_dict[group])

        # --- clause texts ---------------------------------------------------
        # `lines` is shared with the listing above, so this resumes right
        # after the end marker consumed by takewhile.
        body = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        next(body, "")  # discard the start-marker line itself

        texts = []
        pending = []
        for line in body:
            if '|' in line:
                pending.append(line.split('|')[0])
                texts.append(' '.join(pending))
                pending = []
            else:
                pending.append(line)
        if pending:
            # Trailing text that was never closed by a '|' line.
            texts.append(' '.join(pending))

        # Positions of the clauses that carry the requested theme.
        keep = [i for i, theme in enumerate(clause_themes) if theme == clause_type]

        titles_of_type = [titles[i] for i in keep]
        texts_of_type = [texts[i] for i in keep]
        return titles_of_type, texts_of_type

In [70]:
# Spot check: extract the 'Contract Agreement' clauses of a single contract file.
extract_clauses('cbas/2014_01_01__2014_081501.txt', 'Contract Agreement')

(['Non-compliance with Agreement', 'Renewal / Termination of the Agreement'],
 ['Em caso de descumprimento do presente acordo, a empresa pagará multa de um piso da categoria, que será revertido em favor do empregado prejudicado. ',
  'O presente acordo deverá ter uma via depositada no órgão regional do\xa0 Ministério do Trabalho, tendo validade pelo prazo de dois anos, a contar de 01/01/2014, podendo ser revogado ou prorrogado por outro acordo, conforme a conveniência das partes acordantes. E por estarem justas e acordadas as partes\xa0 firmam o presente acordo em 03 (três) vias de igual forma e teor para que produza os efeitos legais. '])

In [71]:
# Start years (first four characters of the file name) that are processed.
_START_YEARS = frozenset(str(year) for year in range(2008, 2018))


def output_all(file_path_x, files_x, clause_type, output_path=None):
    """Append one contract's clauses of a given theme to a pipe-delimited file.

    Files whose name does not start with a year from 2008 to 2017 are skipped.
    For the others, one line ``contract_id|title|text`` is appended per clause
    returned by ``extract_clauses``.

    Parameters
    ----------
    file_path_x : str
        Directory containing the contract file.
    files_x : str
        File name of the contract. The contract identifier is taken from
        ``files_x[-15:-4]``; its length is not validated.
    clause_type : str
        Theme passed on to ``extract_clauses``.
    output_path : str, optional
        File to append to. Defaults to the module-level ``path_txt``, which is
        what existing three-argument calls rely on.

    Returns
    -------
    None
    """
    if files_x[0:4] not in _START_YEARS:
        return

    # Resolve the destination first so a missing `path_txt` fails before any
    # parsing work is done.
    if output_path is None:
        output_path = path_txt

    contract_id = files_x[-15:-4]
    titles, texts = extract_clauses(os.path.join(file_path_x, files_x), clause_type)

    # One output line per (title, text) pair of this contract.
    with io.open(output_path, 'a', encoding='utf8') as f:
        for title, text in zip(titles, texts):
            f.write('|'.join((contract_id, str(title), str(text))) + "\n")

In [72]:
for theme in themes:
    # File-name slug: lower case, with ' / ' and spaces turned into underscores.
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')

    # Start the theme's output file afresh with only the header row.
    # output_all() appends to the module-level path_txt set here.
    path_txt = os.path.join(cba_path, f"{file_name}_text.csv")
    with io.open(path_txt, 'w', encoding='utf8') as f:
        header = 'contract_id|title|text'
        f.write(header + '\n')

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the output file stable from run to run.
    for files in sorted(os.listdir(file_path)):
        output_all(file_path, files, theme)

In [73]:
# Fetch the NLTK resources needed for stop-word filtering and tokenization.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Portuguese stemmer and stop-word set used by the text cleaning step.
stemmer = SnowballStemmer('portuguese')
stop_words = set(stopwords.words('portuguese'))
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans(dict.fromkeys(string.punctuation))

def clean_text(text):
    """Normalise a Portuguese clause text into a space-separated string of stems.

    Steps, in order: tokenize with ``word_tokenize``; drop tokens whose
    lower-cased form is in ``stop_words``; stem with ``stemmer``; delete
    punctuation via ``translator``; keep only purely alphabetic results; join
    with single spaces and lower-case.

    Parameters
    ----------
    text : str
        Raw clause text. Any non-string value (for example the NaN that
        pandas produces for an empty CSV field) is treated as having no
        content.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Guard first: word_tokenize cannot handle NaN/None, and one empty field
    # would otherwise abort the whole DataFrame.apply().
    if not isinstance(text, str):
        return ''

    cleaned = []
    for token in word_tokenize(text, language='portuguese'):
        if token.lower() in stop_words:
            continue
        # Stem before stripping punctuation, as the original pipeline did.
        stem = stemmer.stem(token).translate(translator)
        if stem.isalpha():
            cleaned.append(stem)
    return ' '.join(cleaned).lower()

for theme in themes:
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
    # Built from cba_path, the directory the extraction step writes to,
    # instead of a second hard-coded copy of the directory name.
    text_path = os.path.join(cba_path, f'{file_name}_text.csv')

    # Adds a 'clean_text' column and rewrites the same file in place.
    df = pd.read_csv(text_path, sep='|')
    df['clean_text'] = df['text'].apply(clean_text)
    df.to_csv(text_path, sep='|', index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/calvineng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/calvineng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [74]:
for theme in themes:
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
    df = pd.read_csv(os.path.join(cba_path, f'{file_name}_text.csv'), sep='|')

    # An empty clean_text is written as an empty CSV field and read back as
    # NaN, which TfidfVectorizer does not accept as a document; treat it as ''.
    documents = df['clean_text'].fillna('')

    # fit_transform learns the vocabulary and builds the matrix in one pass
    # (equivalent to fit followed by transform on the same data).
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    # Both frames carry the default positional index, so rows line up one to one.
    df = pd.concat([df, tfidf_df], axis=1)

    df.to_csv(os.path.join(cba_path, f'{file_name}_tfidf.csv'), sep='|', index=False)


In [75]:
for theme in themes:
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
    df = pd.read_csv(os.path.join(cba_path, f'{file_name}_tfidf.csv'), sep='|')

    # Mean of every float64 column, then the ten largest means. Columns are
    # selected by dtype; the name-based column list the cell used to build
    # was never used and has been dropped.
    tfidf_means = df.select_dtypes(include=['float64']).mean()
    top_ten = tfidf_means.sort_values(ascending=False).head(10)

    print(f'Theme: {theme}')
    print(top_ten)

Theme: Wages
empreg      0.070080
salári      0.067006
hor         0.063453
trabalh     0.058312
empres      0.045171
pagament    0.044556
dia         0.037862
parágraf    0.037588
compens     0.034522
cent        0.033940
dtype: float64
Theme: Health
empreg       0.103936
médic        0.093047
empres       0.080547
atest        0.071982
exam         0.056712
odontológ    0.055023
convêni      0.054041
trabalh      0.052526
parágraf     0.045029
cas          0.042661
dtype: float64
Theme: Union
empreg          0.070081
trabalh         0.069197
empres          0.066717
sindicat        0.060030
descont         0.051974
acord           0.042597
cent            0.038997
profissional    0.037310
dev             0.037268
recolh          0.036004
dtype: float64
Theme: Safety / Injury / Disability
empreg     0.090513
empres     0.073488
trabalh    0.059857
dev        0.048508
dias       0.043200
acident    0.042951
cip        0.041360
fornec     0.034808
risc       0.033132
equip      0.032929