In [1]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# sets the output directory
cba_path = os.path.join(".", "clause_analysis")
if not os.path.isdir(cba_path):
    os.mkdir(cba_path)

# sets the input directory
file_path = os.getcwd() + '/cbas'

In [3]:
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')
translation_dict = clause_groups['Translation'].to_dict()
themes = list(map(str, clause_groups['Theme'].unique()))
theme_dict = clause_groups['Theme'].to_dict()

def extract_clauses(file_path):
    with io.open(file_path, 'r') as f:
        # removes white space from the ends of lines
        lines = (line.strip() for line in f)  

        # retrieves the validity
        validity_start_flag = dropwhile(lambda line: '<STARTofVALIDITY>' not in line, lines)
        next(validity_start_flag,"")
        validity_end_flag = takewhile(lambda line: '<ENDofVALIDITY>' not in line, validity_start_flag)
        validity = '\n'.join(validity_end_flag).strip()
    
        # extracts the types of clauses present
        clause_flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(clause_flag_start,"")
        clause_flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, clause_flag_start)
        themes = []
        titles = []
        for line in clause_flag_end:
            if not line: 
                continue  
            title = line.split('|')[0]
            translation = translation_dict[title]
            titles.append(translation)
            theme = theme_dict[title]
            themes.append(theme)

        # extracts the text of clauses
        text_flag_start = dropwhile(lambda line: '<STARTofTEXT>' not in line, lines)
        next(text_flag_start, "")
        texts = []
        text = []
        for line in text_flag_start:
            if '|' in line: 
                text.append(line.split('|')[0])
                texts.append((' ').join(text))
                text = []
            else:
                text.append(line)
        if text:
            texts.append((' ').join(text))

        return validity, themes, titles, texts

In [4]:
def output_all(file_path_x, files_x):
    # only considers files with start dates 2008-2017
    if files_x[0:4].isdigit() and 2008 <= int(files_x[0:4]) <= 2017:
        # contract identifier
        contract_id = [files_x[-15:-4]]
        if len(files_x[-15:-4]) != 11:
            pass
        validity, themes, titles, texts = extract_clauses(os.path.join(file_path_x, files_x))
        # saves info for contract as a single new line
        pairs = [(contract_id + [validity, theme, title, text]) for theme, title, text in zip(themes, titles, texts)]
        with io.open(path_txt, 'a', encoding='utf8') as f:
            for pair in pairs:
                pair_line = '|'.join(str(x) for x in pair)
                f.write(pair_line + '\n')

In [5]:
# rewrites output file
path_txt = os.path.join(cba_path, "data.csv")
with io.open(path_txt,'w',encoding='utf8') as f:
    header = 'contract_id|validity|theme|title|text'
    f.write(header + '\n')

# loops over each contract
for idx, files in enumerate(os.listdir(file_path)):
    print("Looping through file ", files)
    output_all(file_path, files)

Looping through file  2016_09_01__2016_082054.txt
Looping through file  2011_11_01__2012_002993.txt
Looping through file  2014_01_01__2014_081501.txt
Looping through file  2017_12_01__2017_084835.txt
Looping through file  2017_12_01__2017_084809.txt
Looping through file  2013_11_15__2013_055346.txt
Looping through file  2009_01_01__2009_016497.txt
Looping through file  2015_06_16__2015_060659.txt
Looping through file  2018_05_01__2018_044118.txt
Looping through file  2012_05_01__2012_042451.txt
Looping through file  2011_11_01__2012_002943.txt
Looping through file  2016_09_01__2016_082084.txt
Looping through file  2013_11_14__2014_009174.txt
Looping through file  2009_01_01__2009_016731.txt
Looping through file  2015_05_01__2015_043073.txt
Looping through file  2011_11_01__2012_003082.txt
Looping through file  2015_12_16__2015_084042.txt
Looping through file  2017_12_01__2017_084934.txt
Looping through file  2013_06_01__2013_073146.txt
Looping through file  2017_03_01__2017_039221.txt


In [6]:
# lemmatization package
nlp = spacy.load('pt_core_news_sm')
stop_words = nlp.Defaults.stop_words

# adds custom stop words
custom_stop_words = ['parágrafo', 'nº', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
for word in custom_stop_words:
    stop_words.add(word)

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

# reads data into dataframe
df = pd.read_csv(f'clause_analysis/data.csv', sep='|')

# cleans the text
df['clean_text'] = df['text'].apply(preprocess_text)

# creates dummies for validity
validity_dummies = pd.get_dummies(df['validity'])
df = pd.concat([df, validity_dummies], axis=1)

# saves csv file for analysis
df.to_csv('clause_analysis/data.csv', sep='|')

# displays head of csv
df.head()

Unnamed: 0,contract_id,validity,theme,title,text,clean_text,carimbo,semvalorlegal
0,2016_082054,carimbo,Wages,Minimum Wage,"- SALÁRIOS NORMATIVOS Fica assegurado, para o...",salários normativos ficar assegurar empregado ...,1,0
1,2016_082054,carimbo,Wages,Salary Adjustments / Corrections,Os salários dos empregados (as) serão corrigi...,salário empregado ser corrigir percentual vírg...,1,0
2,2016_082054,carimbo,Wages,Salary Adjustments / Corrections,COMPENSAÇÕES Serão antes COMPENSADOS DA APLIC...,compensações serão compensados aplicação reaju...,1,0
3,2016_082054,carimbo,Wages,Salary Adjustments / Corrections,- ADMISSÃO APÓS DATA-BASE O reajuste salarial...,admissão reajuste salarial empregado admitir o...,1,0
4,2016_082054,carimbo,Wages,Salary Payment,- PAGAMENTO DE SALÁRIOS A) A empresa deverá p...,pagamento salários empresa proporcionar empreg...,1,0


In [7]:
for theme in themes: 
    # reads data into dataframe
    df = pd.read_csv(f'clause_analysis/data.csv', sep='|')

    # filters for theme
    df = df[df['theme'] == theme]
    print(df.head())

    # calculatues TFIDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['clean_text'])
    tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    df = pd.concat([df, tfidf_df], axis=1)

    # splits the data for training and testing
    y = df[df['theme'] == theme]['carimbo']
    X = X.toarray()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=253)

    # trains a logistic regression model with L1 penalty and liblinear solver
    logreg = LogisticRegression(penalty='l1', solver='liblinear', random_state=253)

    # finds the best value of C 
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid = GridSearchCV(logreg, param_grid, cv=2)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # prints the best value of C and the test accuracy
    print(f"Theme: {theme}")
    print(f"Best value of C: {best_model.C}")
    print(f"Test accuracy: {best_model.score(X_test, y_test)}")

    # gets the top indicative features
    feature_names = vectorizer.get_feature_names_out()
    coefs = best_model.coef_[0]
    top_indicative_features = [feature_names[i] for i in coefs.argsort()[-20:][::-1]]
    print("Top indicative features:", top_indicative_features)

   Unnamed: 0  contract_id validity  theme                             title  \
0           0  2016_082054  carimbo  Wages                      Minimum Wage   
1           1  2016_082054  carimbo  Wages  Salary Adjustments / Corrections   
2           2  2016_082054  carimbo  Wages  Salary Adjustments / Corrections   
3           3  2016_082054  carimbo  Wages  Salary Adjustments / Corrections   
4           4  2016_082054  carimbo  Wages                    Salary Payment   

                                                text  \
0  - SALÁRIOS NORMATIVOS  Fica assegurado, para o...   
1   Os salários dos empregados (as) serão corrigi...   
2  COMPENSAÇÕES  Serão antes COMPENSADOS DA APLIC...   
3  - ADMISSÃO APÓS DATA-BASE  O reajuste salarial...   
4  - PAGAMENTO DE SALÁRIOS  A) A empresa deverá p...   

                                          clean_text  carimbo  semvalorlegal  
0  salários normativos ficar assegurar empregado ...        1              0  
1  salário empregado ser

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').