In [2]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile

# TODO: more stop words, valid/not valid, acordo coletivo and not amendments (extrato)

In [3]:
# output directory: ./clause_data, created on first run
cba_path = os.path.join(".", "clause_data")
os.makedirs(cba_path, exist_ok=True)

# input directory holding the contract .txt files
# NOTE: this is a machine-specific absolute path; the commented line below is
# the relative alternative
# file_path = os.getcwd() + '/cbas'
file_path = '/Users/calvineng/Dropbox/Calvin_Eng/cba_text_analysis/cba_txt_2009'

In [4]:
# lookup tables keyed by the 'Clause Group' column of clause_groups.csv:
#   translation_dict: clause group -> value of its 'Translation' column
#   theme_dict:       clause group -> value of its 'Theme' column
#   themes:           the distinct 'Theme' values, each converted to str
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')
theme_dict = clause_groups['Theme'].to_dict()
translation_dict = clause_groups['Translation'].to_dict()
themes = [str(theme_value) for theme_value in clause_groups['Theme'].unique()]

# retrieves the type of document
def extract_document_type(file_path):
    """Classify a contract file by the title found between its title markers.

    The title is built from the lines that follow the first line containing
    ``<STARTofTITLE>`` (that whole line is discarded) up to, but not
    including, the first later line containing ``<ENDofTITLE>``. Those lines
    are stripped and concatenated without a separator.

    Args:
        file_path: Path of the contract text file to read.

    Returns:
        A tuple ``(acordo, extrato)`` chosen by the first phrase found in the
        title:

        * ``'Extrato Acordo Coletivo'``                     -> ``(1, 1)``
        * ``'Extrato Convenção Coletiva'``                  -> ``(0, 1)``
        * ``'Extrato Termo Aditivo de Acordo Coletivo'``    -> ``(1, 0)``
        * ``'Extrato Termo Aditivo de Convenção Coletiva'`` -> ``(0, 0)``
        * none of the above (or no title section)           -> ``('', '')``
    """
    # phrases are checked in this order; the first match wins
    title_flags = (
        ('Extrato Acordo Coletivo', (1, 1)),
        ('Extrato Convenção Coletiva', (0, 1)),
        ('Extrato Termo Aditivo de Acordo Coletivo', (1, 0)),
        ('Extrato Termo Aditivo de Convenção Coletiva', (0, 0)),
    )

    # decode explicitly as UTF-8: the phrases above contain non-ASCII
    # characters, so relying on the platform default encoding would make the
    # match (or the read itself) fail on machines whose default is not UTF-8
    with io.open(file_path, 'r', encoding='utf8') as f:
        lines = (line.strip() for line in f)
        after_start = dropwhile(lambda line: '<STARTofTITLE>' not in line, lines)
        # drop the marker line itself; the default avoids StopIteration when
        # the marker is missing
        next(after_start, "")
        title_lines = takewhile(lambda line: '<ENDofTITLE>' not in line, after_start)
        title = ''.join(title_lines).strip()

    for phrase, flags in title_flags:
        if phrase in title:
            return flags
    return '', ''

# retrieves the validity
def extract_validity(file_path):
    """Read the validity section of a contract file and turn it into a flag.

    The section is built from the lines that follow the first line containing
    ``<STARTofVALIDITY>`` (that whole line is discarded) up to, but not
    including, the first later line containing ``<ENDofVALIDITY>``. Those
    lines are stripped and concatenated without a separator.

    Args:
        file_path: Path of the contract text file to read.

    Returns:
        ``1`` if the section contains ``'carimbo'``; otherwise ``0`` if it
        contains ``'semvalorlegal'``; otherwise the empty string ``''``
        (including when the section is missing).
    """
    # decode explicitly as UTF-8 instead of the platform default so the file
    # reads the same way on every machine
    with io.open(file_path, 'r', encoding='utf8') as f:
        lines = (line.strip() for line in f)
        after_start = dropwhile(lambda line: '<STARTofVALIDITY>' not in line, lines)
        # drop the marker line itself; the default avoids StopIteration when
        # the marker is missing
        next(after_start, "")
        section_lines = takewhile(lambda line: '<ENDofVALIDITY>' not in line, after_start)
        section = ''.join(section_lines).strip()

    # 'carimbo' takes precedence when both markers are present
    if 'carimbo' in section:
        return 1
    if 'semvalorlegal' in section:
        return 0
    return ''

# extracts the types of clauses present
def extract_clause_names(file_path):
    """List the translated names of the recognised clauses in a contract file.

    Scans the lines that follow the first line containing
    ``<STARTofCLAUSES>`` (that whole line is discarded) up to, but not
    including, the first later line containing ``<ENDofCLAUSES>``. For each
    non-empty line, the part before the first ``'|'`` is taken as the clause
    title.

    Args:
        file_path: Path of the contract text file to read.

    Returns:
        A list with ``translation_dict[title]`` for every title that is a key
        of the module-level ``theme_dict``, in file order. Titles that are not
        in ``theme_dict`` are skipped.
    """
    names = []
    # decode explicitly as UTF-8 instead of the platform default: titles are
    # compared against dictionary keys, so a mis-decoded title would silently
    # be treated as unknown and dropped
    with io.open(file_path, 'r', encoding='utf8') as f:
        lines = (line.strip() for line in f)
        after_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        # drop the marker line itself; the default avoids StopIteration when
        # the marker is missing
        next(after_start, "")
        for line in takewhile(lambda line: '<ENDofCLAUSES>' not in line, after_start):
            if not line:
                continue
            title = line.split('|')[0]
            if title in theme_dict:
                names.append(translation_dict[title])

    return names

# extracts the text of clauses
def extract_clause_texts(file_path):
    """Split the text section of a contract file into normalised chunks.

    Processing starts after the first line containing ``<STARTofTEXT>`` (that
    whole line is discarded) and runs to the end of the file. Stripped lines
    are accumulated into the current chunk; a line containing ``'|'`` closes
    the current chunk with the part before the first ``'|'`` and opens the
    next chunk with the part between the first and second ``'|'``.

    Each chunk is the concatenation of its pieces (no separator) with
    ``'\\xa0'`` removed, surrounding whitespace stripped, and lower-cased.

    Args:
        file_path: Path of the contract text file to read.

    Returns:
        The list of chunks in file order. It is empty when nothing follows
        the start marker or the marker is absent.
    """
    def normalise(pieces):
        # join without separator, drop non-breaking spaces, trim, lower-case
        return ''.join(pieces).replace('\xa0', '').strip().lower()

    texts = []
    current = []
    # decode explicitly as UTF-8 instead of the platform default so accented
    # text is read the same way on every machine
    with io.open(file_path, 'r', encoding='utf8') as f:
        lines = (line.strip() for line in f)
        after_start = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        # drop the marker line itself; the default avoids StopIteration when
        # the marker is missing
        next(after_start, "")
        for line in after_start:
            if '|' in line:
                pieces = line.split('|')
                current.append(pieces[0])
                texts.append(normalise(current))
                # anything after a second '|' on the same line is not kept
                current = [pieces[1]]
            else:
                current.append(line)

    # flush whatever was accumulated after the last '|'
    if current:
        texts.append(normalise(current))

    return texts

In [5]:
def output_all(file_path_x, files_x):
    """Append one output row per clause of a single contract file.

    Files are processed only when the first four characters of the file name
    are digits forming a year from 2008 to 2017 inclusive; any other name is
    ignored. Rows are appended to the file named by the module-level
    ``path_txt`` as ``contract_id|acordo|extrato|validity|title|text``.

    Args:
        file_path_x: Directory containing the contract file.
        files_x: File name of the contract; characters ``[-15:-4]`` of the
            name are used as the contract identifier.
    """
    start_year = files_x[0:4]
    if not (start_year.isdigit() and 2008 <= int(start_year) <= 2017):
        return

    # contract identifier taken from the tail of the file name
    contract_key = files_x[-15:-4]

    # extracts information from document
    source_path = os.path.join(file_path_x, files_x)
    acordo, extrato = extract_document_type(source_path)
    validity = extract_validity(source_path)
    names = extract_clause_names(source_path)
    texts = extract_clause_texts(source_path)

    # one line per (name, text) pair; zip stops at the shorter of the two lists
    with io.open(path_txt, 'a', encoding='utf8') as out:
        for name, text in zip(names, texts):
            fields = [contract_key, acordo, extrato, validity, name, text]
            out.write('|'.join(str(field) for field in fields) + '\n')

In [6]:
# starts a fresh output file that contains only the header row;
# output_all() appends to this same path
path_txt = os.path.join(cba_path, "clause_text.csv")
header = 'contract_id|acordo|extrato|validity|title|text'
with io.open(path_txt, mode='w', encoding='utf8') as f:
    f.write(header + '\n')

# processes every entry of the input directory, printing the current file
# name once every 1000 entries as a progress indicator
directory_entries = os.listdir(file_path)
for idx, files in enumerate(directory_entries):
    report_progress = (idx % 1000 == 0)
    if report_progress:
        print("Looping through file ", files)
    output_all(file_path, files)

Looping through file  2009_11_01__2010_033261.txt
Looping through file  2009_06_01__2010_009417.txt
Looping through file  2009_10_01__2009_057913.txt
Looping through file  2009_09_01__2010_055133.txt
Looping through file  2009_11_01__2009_061078.txt
Looping through file  2009_03_20__2009_040361.txt
Looping through file  2009_04_01__2009_030683.txt
Looping through file  2009_05_01__2009_040365.txt
Looping through file  2009_04_01__2009_014862.txt
Looping through file  2009_04_01__2009_010972.txt
Looping through file  2009_01_01__2009_052824.txt
Looping through file  2009_05_01__2009_023162.txt
Looping through file  2009_08_01__2009_064476.txt
Looping through file  2009_12_18__2010_051882.txt
Looping through file  2009_02_06__2009_003866.txt
Looping through file  2009_05_01__2009_018184.txt
Looping through file  2009_09_22__2009_046008.txt
Looping through file  2009_06_01__2009_031139.txt
Looping through file  2009_04_01__2009_043786.txt
Looping through file  2009_03_24__2009_031070.txt


In [8]:
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# fetches the NLTK resources used below
nltk.download('stopwords')
nltk.download('punkt')

# Portuguese stop-word set and stemmer used by the text cleaning step
stop_words = set(stopwords.words('portuguese'))
stemmer = SnowballStemmer('portuguese')
# translation table that deletes every character in string.punctuation
translator = str.maketrans('', '', string.punctuation)

# adds custom stop words
custom_stop_words = ['parágrafo', 'nº', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
# stop_words is a set, which has no extend() method (calling it raises
# AttributeError); update() adds every element of the list
stop_words.update(custom_stop_words)

def clean_text(text):
    """Reduce a clause text to a space-separated string of stemmed tokens.

    Non-string input yields ``''``. Otherwise the text is tokenised with
    ``word_tokenize`` (Portuguese); tokens found in ``stop_words`` are
    dropped; each remaining token is stemmed, stripped of punctuation via
    ``translator``, and kept only if the result is purely alphabetic. The
    kept tokens are joined with single spaces and lower-cased.
    """
    if not isinstance(text, str):
        return ''

    kept = []
    for token in word_tokenize(text, language='portuguese'):
        # stop-word check happens on the raw token, before stemming
        if token in stop_words:
            continue
        stem = stemmer.stem(token).translate(translator)
        # discards empty results and anything containing digits or symbols
        if stem.isalpha():
            kept.append(stem)

    return ' '.join(kept).lower()

# import spacy

# # lemmatization package
# nlp = spacy.load('pt_core_news_sm')
# stop_words = nlp.Defaults.stop_words

# # adds custom stop words
# custom_stop_words = ['parágrafo', 'nº', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
# stop_words.extend(custom_stop_words)

# def preprocess_text(text):
#     try: 
#         doc = nlp(text)
#     except:
#         return ''
#     tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
#     return ' '.join(tokens)

# reads file as csv
# the extraction step writes raw '|'-joined fields with no quoting or
# escaping, so quote characters inside a clause text must be read as ordinary
# characters; with the default quoting a '"' can merge fields or whole rows
import csv
df = pd.read_csv('clause_data/clause_text.csv', sep='|', nrows=10000, quoting=csv.QUOTE_NONE)

# cleans the text, printing the row number every 1000 rows as progress;
# results are collected in a list and assigned once instead of writing the
# new column one cell at a time
clean_texts = []
for i, raw_text in enumerate(df['text']):
    if i % 1000 == 0:
        print(i)
    clean_texts.append(clean_text(raw_text))
df['clean_text'] = clean_texts

from sklearn.feature_extraction.text import TfidfVectorizer

# calculates TF-IDF: one row per clause, one column per token
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df = pd.concat([df, tfidf_df], axis=1)

# selects 20 tokens with highest mean TF-IDF
# (tfidf_cols is not used in this cell; kept so the name stays defined)
tfidf_cols = [col for col in df.columns if col not in ['text', 'clean_text']]
# the means are taken over the TF-IDF columns only; selecting every float64
# column of df would also rank any metadata column that pandas parsed as
# float (e.g. a 0/1 flag column containing blanks) as if it were a token
tfidf_means = tfidf_df.mean()
top_twenty = tfidf_means.nlargest(20)

# prints the top 20 words
print(f'Top 20 words for "":')
print(list(top_twenty.index))
print()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/calvineng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/calvineng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
Top 20 words for "":
['empreg', 'trabalh', 'empres', 'hor', 'salári', 'acord', 'dias', 'dev', 'fic', 'cent', 'part', 'dia', 'pagament', 'sindicat', 'valor', 'cas', 'colet', 'present', 'servic', 'pod']



In [None]:
# for every theme, loads its per-theme csv and prints the 20 float64 columns
# with the highest mean
for theme in themes:
    # file name: theme in lower case with ' / ' and spaces turned into '_'
    file_name = '_'.join(theme.lower().replace(' / ', '_').split(' '))
    df = pd.read_csv(f'clause_data/{file_name}_text.csv', sep='|')

    # selects 20 tokens with highest TFIDF
    # (tfidf_cols is computed but not used below)
    tfidf_cols = [col for col in df.columns if col not in ['text', 'clean_text']]
    float_columns = df.select_dtypes(include=['float64'])
    tfidf_means = float_columns.mean()
    top_twenty = tfidf_means.nlargest(20)

    # prints the top 20 words
    print(f'Top 20 words for "{theme}":')
    print(list(top_twenty.index))
    print()

Top 20 words for "Wages":
['salário', 'hora', 'ser', 'empregado', 'dia', 'pagamento', 'empresa', 'salarial', 'trabalho', 'real', 'pagar', 'cláusula', 'adiantamento', 'ficar', 'compensação', 'acordo', 'reajuste', 'piso', 'desconto', 'efetuar']

Top 20 words for "Health":
['médico', 'empregado', 'empresa', 'ser', 'exame', 'odontológico', 'atestado', 'convênio', 'profissional', 'caso', 'serviço', 'trabalho', 'empregador', 'sindicato', 'saúde', 'dia', 'plano', 'reconhecer', 'estar', 'fornecer']

Top 20 words for "Union":
['empresa', 'sindicato', 'empregado', 'ser', 'dia', 'trabalho', 'profissional', 'sindical', 'acordo', 'contribuição', 'desconto', 'categoria', 'entidade', 'descontar', 'trabalhador', 'coletivo', 'salário', 'empregador', 'pagamento', 'prazo']

Top 20 words for "Safety / Injury / Disability":
['empresa', 'empregado', 'dia', 'trabalho', 'ser', 'cipa', 'acidente', 'equipamento', 'proteção', 'risco', 'uso', 'prazo', 'fornecer', 'dever', 'sindicato', 'individual', 'ficar', 'caso