In [1]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
import pandas as pd

# reads file as csv
df = pd.read_csv('clause_data/contract_clauses.csv', sep="|")

# keeps only valid ACTs
df = df.loc[(df['acordo'] == 1)&(df['extrato'] == 1)]
df = df.drop(['acordo', 'extrato'], axis=1)
df = df.dropna()

# reindexes the dataframe with the default integer index
df = df.reset_index(drop=True)

print(len(df.index))
df.head()

878283


Unnamed: 0,contract_id,validity,name,text
0,2010_033261,0,Wage floors,I – Para jornada de trabalho de 36 (trinta e s...
1,2009_055971,1,Wage floors,Ficará garantido ao empregado motorista o valo...
2,2009_055971,1,Wage payment,"Para as funções de motorista de carreta, bi-tr..."
3,2009_055971,1,Wage deductions,"Qualquer multa por excesso de velocidade, por ..."
4,2009_055971,1,Food assistance,Os empregados motoristas externos receberão me...


In [3]:
# theme and translation dictionaries for clause_groups
clause_groups = pd.read_csv('clause_groups/clause_groups_NEW.csv', index_col='name_pt')
translation_dict = clause_groups['shortened_name_en'].to_dict()
translations = list(map(str, clause_groups['shortened_name_en'].unique()))
print(translations)

['Medical certificates', 'Union access to information', 'Union access to workplace', 'Work-related injuries', 'Work functions adaptation', 'Overtime pay', 'Hazard pay (health)', 'Night pay', 'Shift pay', 'Hazard pay (danger)', 'On-call pay', 'Seniority pay', 'Subsistence allowance', 'CBA enforcement', 'Retirement', 'Moral harassment', 'Sexual harassment', 'Work function assignment', 'Sundays and holidays', 'Food assistance', 'Childcare assistance', 'Illness assistance', 'Education assistance', 'Housing assistance', 'Maternity assistance', 'Funeral assistance', 'Health assistance', 'Transportation assistance', 'Performance evaluation', 'Advance notice', 'Health education campaigns', 'Accident prevention committee', 'Fees', 'Factory commission', 'Workday compensation', 'Working environment conditions', 'Workday controls', 'Union dues', 'Part-time', '13th month bonus', 'Separations', 'CBA non-compliance', 'Wage deductions', 'Weekly rest', 'Opposition to union dues', 'Vacation days and dur

In [4]:
name_counts = df.groupby('name')['contract_id'].nunique().sort_values(ascending=False)
top_five_names = name_counts.head(5).index.tolist()
print(name_counts.head())
print(top_five_names)

name
Other bargaining provisions    19714
Wage increases                 18208
Wage floors                    17097
Workday compensation           17094
Union dues                     16781
Name: contract_id, dtype: int64
['Other bargaining provisions', 'Wage increases', 'Wage floors', 'Workday compensation', 'Union dues']


In [5]:
import spacy

# lemmatizer
nlp = spacy.load("pt_core_news_sm")

# stop words
stop_words = nlp.Defaults.stop_words

# adds custom stop words
custom_stop_words = ['parágrafo', 'nº', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
                     'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi',
                     'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'number',
                     'clt', 'artigo']
stop_words.update(custom_stop_words)

In [6]:
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class CBAPreprocessLemmatize(BaseEstimator, TransformerMixin):
    def __init__(self, preprocess=True, remove_punctuation=True,
                 replace_numbers=True, remove_stopwords=True, lemmatize=True):
        self.preprocess = preprocess
        self.remove_punctuation = remove_punctuation
        self.replace_numbers = replace_numbers
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        
    def preprocess_text(self, text):
        if self.remove_punctuation:
            text = re.sub(r'[^\w\s]|º|ª', '', text)
        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
        if self.remove_stopwords:
            words = text.split()
            words = [word for word in words if word.lower() not in stop_words]
            text = ' '.join(words)
        if self.lemmatize:
            doc = nlp(text)
            words = [token.lemma_ for token in doc]
            text = ' '.join(words)
        return text.lower()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = [
            self.preprocess_text(cba)
            for cba in X
        ]
        return np.array(X_transformed)

In [7]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

preprocess_pipeline = Pipeline([
    ("cba_preprocess", CBAPreprocessLemmatize()),
    ("cba_to_tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=0.0001))
])

groups = df[df['name'].isin(top_five_names)].groupby('name')

for name, group in groups:
    # applies preprocessing pipeline and calculates TFIDF
    X = preprocess_pipeline.fit_transform(group['text'])
    y = group['validity']
    feature_names = preprocess_pipeline.named_steps['cba_to_tfidf'].get_feature_names_out()

    # splits the data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=253)

    # trains a logistic regression model with L1 penalty and liblinear solver
    logreg = LogisticRegression(penalty='l1', solver='liblinear', random_state=253)

    # finds the best value of C 
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid = GridSearchCV(logreg, param_grid, cv=2)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # prints the best value of C and the test accuracy
    print(f'Name: {name}')
    print(f'Best value of C: {best_model.C}')
    print(f'Test accuracy: {best_model.score(X_test, y_test)}')

    # gets the top indicative features
    coefs = best_model.coef_[0]
    top_indicative_features = [feature_names[i] for i in coefs.argsort()[-10:][::-1]]
    print(f'Top 10 Indicative Features for {name}:', top_indicative_features)

Name: Other bargaining provisions
Best value of C: 1
Test accuracy: 0.7051307914733373
Top 10 Indicative Features for Other bargaining provisions: ['incentivar', 'dispositivo pactuado', 'convocar', 'emprego conformidade', 'trabalho visar', 'ata assembléia', 'dia término', 'devido processo', 'processo prorrogação', 'excesso']
Name: Union dues
Best value of C: 1
Test accuracy: 0.6949380319546066
Top 10 Indicative Features for Union dues: ['enviar respectivo', 'empregado beneficiar', 'outubronumber contribuição', 'repassar décimo', 'recolhimento mensalidade', 'afixar assembléia', 'mobiliário', 'ônus', 'antecedêncer pequeno', 'alimentação']
Name: Wage floors
Best value of C: 1
Test accuracy: 0.7091019663490776
Top 10 Indicative Features for Wage floors: ['garantir empregar', 'percebir', 'ocorrência', 'sindicatos patronais', 'político', 'auxiliar produção', 'dispensar', 'acordo nenhum', 'perceber', 'trezento']
Name: Wage increases
Best value of C: 1
Test accuracy: 0.7064346865506766
Top 10 