## Converting TXT files to CSV files

Set the directories for the input and output

In [2]:
import os

# sets the output directory
cba_path = os.path.join(".", "clause_data")
if not os.path.isdir(cba_path):
    os.mkdir(cba_path)

# sets the input directory
# file_path = os.getcwd() + '/cbas'
file_path = '/Users/calvineng/Dropbox/Calvin_Eng/cba_text_analysis/cba_txt_2009'

Create dictionaries to store the translations and subgroups of clauses

In [3]:
import pandas as pd

# theme and translation dictionaries for clause_groups
clause_groups = pd.read_csv('clause_groups/clause_groups_NEW.csv', index_col='cl_subgrp_pt')
translation_dict = clause_groups['cl_subgrp_en'].to_dict()
translations = list(map(str, clause_groups['cl_subgrp_en'].unique()))
subgroup_dict = clause_groups['cl_grp_en'].to_dict()
subgroups = list(map(str, clause_groups['cl_grp_en'].unique()))
print(translations)
print(subgroups)

['13th month bonus', 'Acceptance of medical certificates', 'Access to company information', 'Union access to workplace', 'Accompaniment of work-related injured worker', 'Adaptation of work functions', 'Night pay', 'Overtime pay', 'Hazard pay (health risk)', 'Shift pay', 'Hazard pay (danger risk)', 'On-call pay', 'Seniority pay', 'Subsistence allowance', 'Application of the CBA', 'Retirement', 'Moral harassment', 'Sexual harassment', 'Assignment to (deviation from) work functions', 'Work authorization on Sundays and holidays', 'Food assistance', 'Childcare assistance', 'Illness/disability assistance', 'Education assistance', 'Housing assistance', 'Maternity assistance', 'Death/funeral assistance', 'Health assistance', 'Transportation assistance', 'Performance evaluation', 'Advance notice', 'CIPA: accident prevention committee', 'Health education campaigns', 'Factory commission', 'Fees', 'Workday compensation', 'Working environment conditions', 'Part-time contracts', 'Union fees', 'Workd

Functions that extract various details from collective bargaining agreements

In [4]:
import io
from itertools import dropwhile, takewhile

# retrieves the type of document
def extract_document_type(file_path):
    with io.open(file_path, 'r') as f:
        lines = (line.strip() for line in f)   
        title_start_flage = dropwhile(lambda line: '<STARTofTITLE>' not in line, lines)
        next(title_start_flage,"")
        title_end_flag = takewhile(lambda line: '<ENDofTITLE>' not in line, title_start_flage)
        title = ''.join(title_end_flag).strip()
        if 'Extrato Acordo Coletivo' in title:
            acordo, extrato = 1, 1
        elif 'Extrato Convenção Coletiva' in title:
            acordo, extrato = 0, 1
        elif 'Extrato Termo Aditivo de Acordo Coletivo' in title:
            acordo, extrato = 1, 0
        elif 'Extrato Termo Aditivo de Convenção Coletiva' in title:
            acordo, extrato = 0, 0
        else:
            acordo, extrato = '', ''

    return acordo, extrato

# retrieves the validity
def extract_validity(file_path):
    with io.open(file_path, 'r') as f:
        lines = (line.strip() for line in f) 
        validity_start_flag = dropwhile(lambda line: '<STARTofVALIDITY>' not in line, lines)
        next(validity_start_flag,"")
        validity_end_flag = takewhile(lambda line: '<ENDofVALIDITY>' not in line, validity_start_flag)
        validity = ''.join(validity_end_flag).strip()
        if 'carimbo' in validity:
            validity = 1
        elif 'semvalorlegal' in validity:
            validity = 0
        else:
            validity = ''

    return validity

# extracts the types of clauses present
def extract_clause_names(file_path):
    with io.open(file_path, 'r') as f:
        names = []
        subgroups = []
        lines = (line.strip() for line in f)      
        clause_flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(clause_flag_start,"")
        clause_flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, clause_flag_start)
        for line in clause_flag_end:
            if not line: 
                continue
            try: 
                title = line.split('|')[0]
                translation = translation_dict[title]
                subgroup = subgroup_dict[title]
            except:
                translation = ''
                subgroup =''
            names.append(translation)
            subgroups.append(subgroup)

    return names, subgroups

# extracts the text of clauses
def extract_clause_texts(file_path):
    with io.open(file_path, 'r') as f:
        text = []
        texts = []
        lines = (line.strip() for line in f)  
        text_flag_start = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        next(text_flag_start, "")
        for line in text_flag_start:
            if '|' in line: 
                text.append(line.split('|')[0])
                texts.append(('').join(text).replace('\xa0','').strip())
                text = [line.split('|')[1]]
            else:
                text.append(line)
        if text:
            texts.append(('').join(text).replace('\xa0','').strip())

    return texts

Function that outputs information from collective bargaining agreements in the form of a CSV with File I/O

In [5]:
def output_all(file_path_x, files_x):
    # only considers files with start dates 2008-2017
    if files_x[0:4].isdigit() and 2008 <= int(files_x[0:4]) <= 2017:
        # contract identifier
        contract_id = [files_x[-15:-4]]
        if len(files_x[-15:-4]) != 11:
            pass

        # extracts information from document
        file_path = os.path.join(file_path_x, files_x)
        acordo, extrato = extract_document_type(file_path)
        validity = extract_validity(file_path)
        names, subgroups = extract_clause_names(file_path)
        texts = extract_clause_texts(file_path)

        # saves info for contract as a single new line
        pairs = [(contract_id + [acordo, extrato, validity, name, subgroup, text]) for name, subgroup, text in zip(names, subgroups, texts)]
        with io.open(path_txt, 'a', encoding='utf8') as f:
            for pair in pairs:
                pair_line = '|'.join(str(x) for x in pair)
                f.write(pair_line + '\n')

Loop through collective bargaining agreements to create the output for the CSV file

In [43]:
# rewrites output file
path_txt = os.path.join(cba_path, "contract_clauses.csv")
with io.open(path_txt,'w',encoding='utf8') as f:
    header = 'contract_id|acordo|extrato|validity|name|subgroup|text'
    f.write(header + '\n')

# loops over each contract
for idx, files in enumerate(os.listdir(file_path)):
    if idx % 1000 == 0:
        print("Looping through file ", files)
    output_all(file_path, files)

Looping through file  2009_11_01__2010_033261.txt
Looping through file  2009_06_01__2010_009417.txt
Looping through file  2009_10_01__2009_057913.txt
Looping through file  2009_09_01__2010_055133.txt
Looping through file  2009_11_01__2009_061078.txt
Looping through file  2009_03_20__2009_040361.txt
Looping through file  2009_04_01__2009_030683.txt
Looping through file  2009_05_01__2009_040365.txt
Looping through file  2009_04_01__2009_014862.txt
Looping through file  2009_04_01__2009_010972.txt
Looping through file  2009_01_01__2009_052824.txt
Looping through file  2009_05_01__2009_023162.txt
Looping through file  2009_08_01__2009_064476.txt
Looping through file  2009_12_18__2010_051882.txt
Looping through file  2009_02_06__2009_003866.txt
Looping through file  2009_05_01__2009_018184.txt
Looping through file  2009_09_22__2009_046008.txt
Looping through file  2009_06_01__2009_031139.txt
Looping through file  2009_04_01__2009_043786.txt
Looping through file  2009_03_24__2009_031070.txt


Filter the data for valid documents

In [6]:
import pandas as pd

# reads file as csv
df = pd.read_csv('clause_data/contract_clauses.csv', sep="|")

# keeps only valid ACTs
df = df.loc[(df['acordo'] == 1)&(df['extrato'] == 1)&(df['validity'] == 1)]
df = df.drop(['acordo', 'extrato', 'validity', 'subgroup'], axis=1)
df = df.dropna()

# reindexes the dataframe with the default integer index
df = df.reset_index(drop=True)

print(len(df.index))
df.head()

601449


Unnamed: 0,contract_id,name,text
0,2009_055971,Minimum wage,Ficará garantido ao empregado motorista o valo...
1,2009_055971,Salary payment - means and timeframes,"Para as funções de motorista de carreta, bi-tr..."
2,2009_055971,Salary deductions,"Qualquer multa por excesso de velocidade, por ..."
3,2009_055971,Food assistance,Os empregados motoristas externos receberão me...
4,2009_055971,Food assistance,Levando-se em conta a crise econômica e a redu...


SnowballStemmer for Portuguese and ntlk package for stopwords

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')
nltk.download('punkt')

# stop words
stop_words = set(stopwords.words('portuguese'))

# adds custom stop words
custom_stop_words = ['parágrafo', 'nº', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
                     'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi',
                     'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'number',
                     'clt', 'artigo']
stop_words.update(custom_stop_words)

# stemmer
stemmer = SnowballStemmer('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/calvineng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/calvineng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Transformer to preprocess documents with stemming

In [8]:
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CBApreprocess(BaseEstimator, TransformerMixin):
    def __init__(self, preprocess=True, remove_punctuation=True,
                 replace_numbers=True, remove_stopwords=True, stemming=True):
        self.preprocess = preprocess
        self.remove_punctuation = remove_punctuation
        self.replace_numbers = replace_numbers
        self.remove_stopwords = remove_stopwords
        self.stemming = stemming
        
    def preprocess_text(self, text):
        if self.remove_punctuation:
            text = re.sub(r'[^\w\s]|º', '', text)
        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
        if self.remove_stopwords:
            words = text.split()
            words = [word for word in words if word.lower() not in stop_words]
            text = ' '.join(words)
        if self.stemming and stemmer is not None:
            words = text.split()
            words = [stemmer.stem(word) for word in words]
            text = ' '.join(words)
        return text.lower()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = [
            self.preprocess_text(cba)
            for cba in X
        ]
        return np.array(X_transformed)

Test the preprocessing with stemming on the first five documents in the dataframe

In [9]:
X_few = df['text'].iloc[:5]
cba_preprocessor = CBApreprocess()
X_few_processed = cba_preprocessor.fit_transform(X_few)
print(X_few_processed)
print(X_few.values)

['fic garant empreg motor valor pis salarial categor cas venh trabalh integral mês eou fiqu disposiçã consig ating pis salarial valor comissõ cas trabalh parcial pagament pis proporcional'
 'funçõ motor carret bitr tritr rod trem treminhã simil fic acert remuner comissõ pur valor equivalent comissõ propri dit eos reflex repous seman remuner feri som total cinc virgul setent cinc cent fatur liqu caminhã transport realiz dentr estad min geraisparágraf primeir part consid fatur líqu valor fret brut deduçõ valor impost agenc carg pedági send veícul carregadoparágraf segund fic acert cinc virgul setent cinc cent percentual comissõ acert sext referes pagament reflex comissõ sobr repous seman remuner eventu feri send recib pagamento valor refer percentu desmembr títul comissõ comissõ sobr rsrs sobr feri aparec respect discrimin separadasparágraf terceir fic acert transport realiz estad min ger som percetnu comissõ reflex comissõ sobr repous seman remuner feri result seis virgul setent cinc ce

Transformer to calculate the TFIDF for documents

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

class CBAToTFIDFTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        return self
    def transform(self, X, y=None):
        X_tfidf = self.vectorizer.transform(X)
        return X_tfidf.toarray()

Test the TFIDF on the first five documents in the dataframe

In [15]:
X_few_wordcounts = CBAToTFIDFTransformer().fit_transform(X_few_processed)
X_few_wordcounts

array([[0.18216425, 0.3643285 , 0.18216425, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.09801016, 0.04900508,
        0.04900508]], dtype=float32)

Lemmatizer for Portuguese from SpaCy and stopwords 

In [16]:
import spacy

# lemmatizer
nlp = spacy.load("pt_core_news_sm")

# stop words
stop_words = nlp.Defaults.stop_words

# adds custom stop words
custom_stop_words = ['parágrafo', 'nº', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
                     'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi',
                     'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'number',
                     'clt', 'artigo']
stop_words.update(custom_stop_words)

Transformer to preprocess documents with lemmatization

In [17]:
class CBAPreprocessLemmatize(BaseEstimator, TransformerMixin):
    def __init__(self, preprocess=True, remove_punctuation=True,
                 replace_numbers=True, remove_stopwords=True, lemmatize=True):
        self.preprocess = preprocess
        self.remove_punctuation = remove_punctuation
        self.replace_numbers = replace_numbers
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        
    def preprocess_text(self, text):
        if self.remove_punctuation:
            text = re.sub(r'[^\w\s]|º', '', text)
        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
        if self.remove_stopwords:
            words = text.split()
            words = [word for word in words if word.lower() not in stop_words]
            text = ' '.join(words)
        if self.lemmatize:
            doc = nlp(text)
            words = [token.lemma_ for token in doc]
            text = ' '.join(words)
        return text.lower()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = [
            self.preprocess_text(cba)
            for cba in X
        ]
        return np.array(X_transformed)

Test the preprocessing with lemmatization on the first five documents in the dataframe

In [18]:
X_few = df['text'].iloc[:5]
cba_preprocessor = CBAPreprocessLemmatize()
X_few_processed = cba_preprocessor.fit_transform(X_few)
print(X_few_processed)
print(X_few.values)

['ficar garantir empregar motorista piso salarial categor caso vir trabalhar integralmente ear fique disposição consiga atingir piso salarial valor comissão caso trabalho parcial pagamento piso ser proporcional'
 'função motorista carreta bitr tritr rodo tr treminhão similar ficar acertar remuneração ser comissão pur equivalente comissão propriamente dizer eos reflexo repouso semanal remunerado feriado somar total virgula setenta faturamento liquido caminhão transporte realizar minas geraisparágrafo parte considerar faturamento líquido valor frete bruto dedução valor imposto agenciamento carga pedágio ser veículo estar carregadoparágrafo fica acertar virgula setenta percentual comissão acertar refereser pagamento reflexo comissão repouso semanal remunerado eventual feriado ser recibo pagamentoos valor referir percentual ser desmembrar título comissão comissão rsrs feriar aparecer respectivo discriminação separadasparágrafo fica acertar transporte realizar minas gerais somar percetnual 

Top 20 words for each clause type using stemming

In [20]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("cba_preprocess", CBApreprocess()),
    ("cba_to_tfidf", CBAToTFIDFTransformer())
])

# number of words to output and groupings
groups = df.groupby('name')
n_top_words = 20

for name, group in groups:
    data_texts = group['text'].tolist()
    data_tfidf = preprocess_pipeline.fit_transform(data_texts)
    mean_tfidf = data_tfidf.mean(axis=0)
    top_indices = np.argsort(mean_tfidf)[-n_top_words:]
    vocab = preprocess_pipeline.named_steps['cba_to_tfidf'].vectorizer.get_feature_names_out()
    top_words = [vocab[idx] for idx in top_indices]
    print(f"Top {n_top_words} words for '{name}': {top_words}")

AttributeError: 'Pipeline' object has no attribute 'dictionary'

Top 20 words for each clause type using lemmatization

In [1]:
from sklearn.pipeline import Pipeline

# creates Pipeline to preprocess and calculate TF-IDF
preprocess_pipeline = Pipeline([
    ("cba_preprocess", CBAPreprocessLemmatize()),
    ("cba_to_tfidf", CBAToTFIDFTransformer())
])

# number of words to output and groupings
groups = df.groupby('name')
n_top_words = 20

for name, group in groups:
    data_texts = group['text'].tolist()
    data_tfidf = preprocess_pipeline.fit_transform(data_texts)
    mean_tfidf = data_tfidf.mean(axis=0)
    top_indices = np.argsort(mean_tfidf)[-n_top_words:]
    vocab = preprocess_pipeline.named_steps['cba_to_tfidf'].vectorizer.get_feature_names_out()
    top_words = [vocab[idx] for idx in top_indices]
    print(f"Top {n_top_words} words for '{name}': {top_words}")

NameError: name 'CBAPreprocessLemmatize' is not defined

Top 20 words for each clause subgroup using stemming

In [13]:
from sklearn.pipeline import Pipeline

# creates Pipeline to preprocess and calculate TF-IDF
preprocess_pipeline = Pipeline([
    ("cba_preprocess", CBApreprocess()),
    ("cba_to_tfidf", CBAToTFIDFTransformer())
])

# number of words to output and groupings
groups = df.groupby('subgroup')
n_top_words = 20

for name, group in groups:
    data_texts = group['text'].tolist()
    data_tfidf = preprocess_pipeline.fit_transform(data_texts)
    mean_tfidf = data_tfidf.mean(axis=0)
    top_indices = np.argsort(mean_tfidf)[-n_top_words:]
    vocab = preprocess_pipeline.named_steps['cba_to_tfidf'].vectorizer.get_feature_names_out()
    top_words = [vocab[idx] for idx in top_indices]
    print(f"Top {n_top_words} words for '{name}': {top_words}")

Top 20 words for 'Assistances': ['acord', 'fic', 'pagament', 'dependent', 'benefíci', 'conced', 'rea', 'val', 'auxíli', 'aliment', 'pag', 'cas', 'transport', 'ser', 'fornec', 'salári', 'trabalh', 'empres', 'empreg', 'numb']
Top 20 words for 'Bonuses': ['ano', 'receb', 'caix', 'promoçã', 'funçã', 'adiant', 'gratific', 'cinquent', 'pagament', 'parcel', 'dia', 'dias', 'pag', 'trabalh', 'ser', 'fér', 'empres', 'salári', 'empreg', 'numb']
Top 20 words for 'Contract types': ['estagiári', 'acord', 'lei', 'aprendizag', 'períod', 'mesm', 'funçã', 'superior', 'estági', 'praz', 'ser', 'salári', 'dias', 'aprendiz', 'trabalh', 'empres', 'experient', 'empreg', 'contrat', 'numb']
Top 20 words for 'Employment protections': ['just', 'provisór', 'afast', 'anos', 'gestant', 'salári', 'períod', 'aposentador', 'milit', 'praz', 'assegur', 'fic', 'empres', 'trabalh', 'estabil', 'garant', 'servic', 'dias', 'numb', 'empreg']
Top 20 words for 'General provisions': ['dirim', 'descumpr', 'justic', 'salári', 'esta

: 

: 

Top 20 words for each clause subgroup using lemmatization

In [13]:
from sklearn.pipeline import Pipeline

# creates Pipeline to preprocess and calculate TF-IDF
preprocess_pipeline = Pipeline([
    ("cba_preprocess", CBAPreprocessLemmatize()),
    ("cba_to_tfidf", CBAToTFIDFTransformer())
])

# number of words to output and groupings
groups = df.groupby('subgroup')
n_top_words = 20

for name, group in groups:
    data_texts = group['text'].tolist()
    data_tfidf = preprocess_pipeline.fit_transform(data_texts)
    mean_tfidf = data_tfidf.mean(axis=0)
    top_indices = np.argsort(mean_tfidf)[-n_top_words:]
    vocab = preprocess_pipeline.named_steps['cba_to_tfidf'].vectorizer.get_feature_names_out()
    top_words = [vocab[idx] for idx in top_indices]
    print(f"Top {n_top_words} words for '{name}': {top_words}")

Top 20 words for 'Assistances': ['filho', 'conceder', 'dependente', 'pagamento', 'benefício', 'real', 'alimentação', 'fornecer', 'auxílio', 'pagar', 'trabalho', 'transporte', 'caso', 'empregado', 'dia', 'salário', 'ser', 'empregar', 'empresa', 'number']
Top 20 words for 'Bonuses': ['ocasião', 'ficar', 'adiantamento', 'cinqüentar', 'função', 'gratificação', 'promoção', 'trabalho', 'ano', 'parcela', 'pagamento', 'empregado', 'pagar', 'férias', 'dia', 'ser', 'empregar', 'empresa', 'salário', 'number']
Top 20 words for 'Contract types': ['estagiário', 'mesmo', 'período', 'aprendizagem', 'lei', 'trabalho', 'contratação', 'ter', 'aprendiz', 'estágio', 'função', 'prazo', 'salário', 'dia', 'empregar', 'ser', 'empresa', 'experiência', 'contrato', 'number']
Top 20 words for 'Employment protections': ['provisório', 'aposentadoria', 'gestante', 'garantir', 'assegurar', 'período', 'militar', 'salário', 'prazo', 'trabalho', 'ser', 'ano', 'ficar', 'dia', 'empresa', 'emprego', 'estabilidade', 'serviço

: 

: 