In [141]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [142]:
# sets the output directory
cba_path = os.path.join(".", "output")
if not os.path.isdir(cba_path):
    os.mkdir(cba_path)

# sets the input directory
file_path = os.getcwd() + '/cbas'

In [1]:
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')
themes = list(map(str, clause_groups['Theme'].unique()))
theme_dict = clause_groups['Theme'].to_dict()

def extract_all(file_path):
    with io.open(file_path, 'r') as f:
        # removes white space from the ends of lines
        lines = (line.strip() for line in f)  
        
        # retrieves the title
        title_start_flag = dropwhile(lambda line: '<STARTofTITLE>' not in line, lines)
        next(title_start_flag,"")
        title_end_flag = takewhile(lambda line: '<ENDofTITLE>' not in line, title_start_flag)
        title = '\n'.join(title_end_flag).strip()

        # retrieves the validity
        validity_start_flag = dropwhile(lambda line: '<STARTofVALIDITY>' not in line, lines)
        next(validity_start_flag,"")
        validity_end_flag = takewhile(lambda line: '<ENDofVALIDITY>' not in line, validity_start_flag)
        validity = '\n'.join(validity_end_flag).strip()

        # extract the number of clauses by theme
        flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(flag_start,"")
        flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, flag_start)
        clause_counts = 0
        theme_counts = {theme: 0 for theme in themes}
        for line in flag_end:
            if not line: 
                continue  
            key = line.split('|')[0]
            theme = theme_dict[key]
            if theme in themes:
                theme_counts[theme] += 1
            clause_counts += 1

        return [title, validity, clause_counts] +  list(theme_counts.values())

def output_all(file_path_x, files_x):
    # only consider files with start dates 2008-2017
    #if files_x[0:4]=='2008':
    if files_x[0:4]=='2008' or files_x[0:4]=='2009' or files_x[0:4]=='2010' or files_x[0:4]=='2011' or files_x[0:4]=='2012' \
    or files_x[0:4]=='2013' or files_x[0:4]=='2014' or files_x[0:4]=='2015' or files_x[0:4]=='2016' or files_x[0:4]=='2017':
        # contract identifier
        contract_id = [files_x[-15:-4]]
        if len(files_x[-15:-4])!=11:
            pass
        text = extract_all(os.path.join(file_path_x, files_x))
        output = contract_id + text
        pair_line = ('|').join(str(x) for x in output)
        # save info for contract as a single new line
        with io.open(path_txt,'a',encoding='utf8') as f:
            f.write(pair_line + "\n")

NameError: name 'pd' is not defined

In [144]:
# rewrites output file
path_txt = os.path.join(cba_path, "contract_data.txt")
with io.open(path_txt,'w',encoding='utf8') as f:
    header = 'contract_id|title|validity|clause_count|' + '|'.join(themes)
    f.write(header + '\n')
    
# loops over each contract
for idx, files in enumerate(os.listdir(file_path)):
    print("Looping through file ", files)
    output_all(file_path, files)

Looping through file  2016_09_01__2016_082054.txt
Looping through file  2011_11_01__2012_002993.txt
Looping through file  2014_01_01__2014_081501.txt
Looping through file  2017_12_01__2017_084835.txt
Looping through file  2017_12_01__2017_084809.txt
Looping through file  2013_11_15__2013_055346.txt
Looping through file  2009_01_01__2009_016497.txt
Looping through file  2015_06_16__2015_060659.txt
Looping through file  2018_05_01__2018_044118.txt
Looping through file  2012_05_01__2012_042451.txt
Looping through file  2011_11_01__2012_002943.txt
Looping through file  2016_09_01__2016_082084.txt
Looping through file  2013_11_14__2014_009174.txt
Looping through file  2009_01_01__2009_016731.txt
Looping through file  2015_05_01__2015_043073.txt
Looping through file  2011_11_01__2012_003082.txt
Looping through file  2015_12_16__2015_084042.txt
Looping through file  2017_12_01__2017_084934.txt
Looping through file  2013_06_01__2013_073146.txt
Looping through file  2017_03_01__2017_039221.txt


In [145]:
df = pd.read_csv('output/contract_data.txt', sep='|')
validity_dummies = pd.get_dummies(df['validity'])
df = pd.concat([df, validity_dummies], axis=1)
df.head()



Unnamed: 0,contract_id,title,validity,clause_count,Wages,Health,Union,Safety / Injury / Disability,Work Adaptation / Training,Work Time,...,Retirement,Work Environment / Harassment,Family,Dismissals / Transfers,Fees,Staffing / Hiring / Outsourcing,Other,Equality / Fairness,carimbo,semvalorlegal
0,2016_082054,Mediador - Extrato Acordo Coletivo,carimbo,69,11,2,6,8,4,9,...,2,1,6,4,0,5,4,2,1,0
1,2012_002993,Mediador - Extrato Acordo Coletivo,semvalorlegal,50,6,2,7,5,1,6,...,1,2,4,2,0,1,9,0,0,1
2,2014_081501,Mediador - Extrato Acordo Coletivo,semvalorlegal,4,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,1
3,2017_084835,Mediador - Extrato Acordo Coletivo,carimbo,10,0,0,0,0,0,0,...,0,0,0,0,0,0,10,0,1,0
4,2017_084809,Mediador - Extrato Acordo Coletivo,carimbo,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [146]:
# columns to be used for analysis
X = df[['clause_count']]
y = df[['carimbo']]

# # normalizes the predictor variables
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# splits the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

# performs OLS regression and summarizes regression
olsReg = sm.OLS(y_train,X_train).fit()
print(olsReg.summary())

# uses regression to predict with test data
y_pred = olsReg.predict(X_test)

# compares the predicted values to the actual values to evaluate model performance
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('R-squared:', r2)
print('Mean squared error:', mse)

                                 OLS Regression Results                                
Dep. Variable:                carimbo   R-squared (uncentered):                   0.287
Model:                            OLS   Adj. R-squared (uncentered):              0.268
Method:                 Least Squares   F-statistic:                              14.89
Date:                Thu, 06 Apr 2023   Prob (F-statistic):                    0.000440
Time:                        22:38:16   Log-Likelihood:                         -36.224
No. Observations:                  38   AIC:                                      74.45
Df Residuals:                      37   BIC:                                      76.09
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------