In [49]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [50]:
# sets the output directory
cba_path = os.path.join(".", "clause_analysis")
if not os.path.isdir(cba_path):
    os.mkdir(cba_path)

# sets the input directory
file_path = os.getcwd() + '/cbas'

In [51]:
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')
translation_dict = clause_groups['Translation'].to_dict()
themes = list(map(str, clause_groups['Theme'].unique()))
theme_dict = clause_groups['Theme'].to_dict()

def extract_clauses(file_path, clause_type):
    with io.open(file_path, 'r') as f:
        # removes white space from the ends of lines
        lines = (line.strip() for line in f)  

        # retrieves the validity
        validity_start_flag = dropwhile(lambda line: '<STARTofVALIDITY>' not in line, lines)
        next(validity_start_flag,"")
        validity_end_flag = takewhile(lambda line: '<ENDofVALIDITY>' not in line, validity_start_flag)
        validity = '\n'.join(validity_end_flag).strip()
    
        # extracts the types of clauses present
        clause_flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(clause_flag_start,"")
        clause_flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, clause_flag_start)
        themes = []
        titles = []
        for line in clause_flag_end:
            if not line: 
                continue  
            title = line.split('|')[0]
            translation = translation_dict[title]
            titles.append(translation)
            theme = theme_dict[title]
            themes.append(theme)

        # extracts the text of clauses
        text_flag_start = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        next(text_flag_start, "")
        texts = []
        text = []
        for line in text_flag_start:
            if '|' in line: 
                text.append(line.split('|')[0])
                texts.append((' ').join(text))
                text = []
            else:
                text.append(line)
        if text:
            texts.append((' ').join(text))

        # retains clauses of proper type
        indices_of_type = [i for i, theme in enumerate(themes) if theme == clause_type]
        titles_of_type = [titles[i] for i in indices_of_type]
        texts_of_type = [texts[i] for i in indices_of_type]

        return validity, titles_of_type, texts_of_type

In [52]:
def output_all(file_path_x, files_x, clause_type):
    # only considers files with start dates 2008-2017
    if files_x[0:4].isdigit() and 2008 <= int(files_x[0:4]) <= 2017:
        # contract identifier
        contract_id = [files_x[-15:-4]]
        if len(files_x[-15:-4]) != 11:
            pass
        validity, titles, texts = extract_clauses(os.path.join(file_path_x, files_x), clause_type)
        # saves info for contract as a single new line
        pairs = [(contract_id + [validity, title, text]) for title, text in zip(titles, texts)]
        with io.open(path_txt, 'a', encoding='utf8') as f:
            for pair in pairs:
                pair_line = '|'.join(str(x) for x in pair)
                f.write(pair_line + '\n')

In [53]:
for theme in themes: 
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')

    # rewrites output file
    path_txt = os.path.join(cba_path, f"{file_name}_text.csv")
    with io.open(path_txt,'w',encoding='utf8') as f:
        header = 'contract_id|validity|title|text'
        f.write(header + '\n')

    # loops over each contract
    [output_all(file_path, files, theme) for files in os.listdir(file_path)]

In [54]:
nlp = spacy.load('pt_core_news_sm')
stop_words = nlp.Defaults.stop_words

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

for theme in themes:
    # reads file as csv
    file_name = theme.lower().replace(' / ', '_').replace(' ', '_')
    df = pd.read_csv(f'clause_analysis/{file_name}_text.csv', sep='|')

    # cleans the text
    df['clean_text'] = df['text'].apply(preprocess_text)

    # calculatues TFIDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['clean_text'])
    tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    df = pd.concat([df, tfidf_df], axis=1)

    # splits the data for training and testing
    y = (df['validity']=='carimbo').to_numpy()
    y = df['validity']
    X = X.toarray()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # trains a logistic regression model with L1 penalty and liblinear solver
    logreg = LogisticRegression(penalty='l1', solver='liblinear')

    # finds the best value of C 
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid = GridSearchCV(logreg, param_grid, cv=2)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # prints the best value of C and the test accuracy
    print(f"Theme: {theme}")
    print(f"Best value of C: {best_model.C}")
    print(f"Test accuracy: {best_model.score(X_test, y_test)}")

    # gets the top indicative features
    feature_names = vectorizer.get_feature_names_out()
    coefs = best_model.coef_[0]
    top_indicative_features = [feature_names[i] for i in coefs.argsort()[-20:][::-1]]
    print("Top indicative features:", top_indicative_features)

    # saves to csv
    df.to_csv(f'clause_analysis/{file_name}_text.csv', sep='|', index=False)

Theme: Wages
Best value of C: 100
Test accuracy: 0.6285714285714286
Top indicative features: ['ocorrer', 'parágrafo', 'comprovantes', 'idade', 'ão', 'quarenta', 'trinto', 'teto', 'média', 'salario', 'empregador', 'admissão', 'feira', 'curso', 'cumprimento', 'assuma', 'salvo', 'motorista', 'cláusula', 'abono']
Theme: Health
Best value of C: 0.001
Test accuracy: 0.625
Top indicative features: ['útil', 'documento', 'doenças', 'drogaria', 'duração', 'duzento', 'econômica', 'efeito', 'efetivo', 'efetuar', 'eficácia', 'elaborar', 'eletrônico', 'em', 'email', 'emergência', 'emissão', 'emitente', 'emitido', 'emitir']
Theme: Union
Best value of C: 10
Test accuracy: 0.6153846153846154
Top indicative features: ['empregador', 'escrito', 'titular', 'cidade', 'disposto', 'ão', 'envolver', 'salarial', 'surgir', 'parágrafo', 'haver', 'data', 'sindical', 'recolhir', 'qualquer', 'imposto', 'confederativa', 'disposição', 'pagamento', 'demissão']
Theme: Safety / Injury / Disability
Best value of C: 0.001




Theme: Work Time
Best value of C: 0.001
Test accuracy: 0.7428571428571429
Top indicative features: ['útil', 'domingo', 'doméstico', 'drástico', 'dsr', 'dupla', 'durante', 'duração', 'duzento', 'débito', 'décimo', 'econômica', 'econômico', 'educacional', 'educação', 'efeito', 'efetivamente', 'efetivar', 'efetivação', 'efetiver']




Theme: Incentives
Best value of C: 100
Test accuracy: 0.8
Top indicative features: ['mesmo', 'certidão', 'ela', 'dezembro', 'dia', 'diferença', 'direito', 'disponivel', 'dobro', 'documentos', 'durante', 'efeito', 'único', 'devidamente', 'elegibilidade', 'empregada', 'empregado', 'empregador', 'empregadora', 'empregar']
Theme: Food / Education / Housing
Best value of C: 0.001
Test accuracy: 0.75
Top indicative features: ['útil', 'empregadora', 'empregado', 'empregada', 'emitir', 'eletrônico', 'ele', 'efetuar', 'efetivo', 'efetivamente', 'efeito', 'educação', 'edição', 'décimo', 'décima', 'duzento', 'duração', 'durante', 'dou', 'dotar']




Theme: Contract Agreement
Best value of C: 0.001
Test accuracy: 0.7777777777777778
Top indicative features: ['único', 'escrito', 'empregador', 'empregados', 'empregar', 'emprego', 'empresa', 'empresas', 'ensejar', 'entidade', 'envolver', 'equipamento', 'equivalente', 'esclarecimento', 'especificado', 'item', 'especificar', 'específico', 'estabelecem', 'estabelecer']
Theme: Retirement
Best value of C: 1000
Test accuracy: 0.25
Top indicative features: ['garantia', 'estabilidade', 'trabalhador', 'permanente', 'rescisão', 'normativo', 'ão', 'época', 'parágrafo', 'invalidez', 'documento', 'devir', 'documentação', 'efetuar', 'durante', 'dispensar', 'dispensa', 'discussão', 'efeito', 'direito']




Theme: Work Environment / Harassment
Best value of C: 100
Test accuracy: 0.6
Top indicative features: ['trabalhador', 'condição', 'exigir', 'pessoal', 'mesmo', 'ficar', 'feminino', 'empregador', 'potável', 'ônus', 'utilizar', 'exigência', 'exclusivo', 'exigido', 'exiger', 'filtrar', 'exemplarmente', 'formulário', 'expor', 'estabelecimento']




Theme: Family
Best value of C: 10
Test accuracy: 0.4444444444444444
Top indicative features: ['correspondente', 'hipótese', 'conforme', 'conceder', 'hora', 'seguro', 'ano', 'pedir', 'setenta', 'diário', 'divulgação', 'disponibilizar', 'doação', 'distrito', 'dobro', 'disposto', 'documentalista', 'disponível', 'útil', 'dispensar']
Theme: Dismissals / Transfers
Best value of C: 0.001
Test accuracy: 0.5833333333333334
Top indicative features: ['útil', 'dissolução', 'disposição', 'dispositivo', 'dispor', 'dispenso', 'dispensar', 'dispensado', 'dispensa', 'discussão', 'discriminar', 'dirimir', 'diretoria', 'diretamente', 'direito', 'dimensional', 'dificuldade', 'diferença', 'dia', 'disposto']


14 fits failed out of a total of 14.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/calvineng/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/calvineng/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1528, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/Users/calvineng/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 1143, in _fit_liblinear
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but th

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 'carimbo'