TODO: Fix for percentages

In [5]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [6]:
# sets the output directory
cba_path = os.path.join(".", "output")
if not os.path.isdir(cba_path):
    os.mkdir(cba_path)

# sets the input directory
file_path = '/Users/calvineng/Dropbox/Calvin_Eng/cba_text_analysis/cba_txt_2009'

In [7]:
clause_groups = pd.read_csv('clause_groups.csv', index_col='Clause Group')
themes = list(map(str, clause_groups['Theme'].unique()))
theme_dict = clause_groups['Theme'].to_dict()

def extract_all(file_path):
    with io.open(file_path, 'r') as f:
        # removes white space from the ends of lines
        lines = (line.strip() for line in f)  
        
        # retrieves the title
        title_start_flag = dropwhile(lambda line: '<STARTofTITLE>' not in line, lines)
        next(title_start_flag,"")
        title_end_flag = takewhile(lambda line: '<ENDofTITLE>' not in line, title_start_flag)
        title = '\n'.join(title_end_flag).strip()

        # retrieves the validity
        validity_start_flag = dropwhile(lambda line: '<STARTofVALIDITY>' not in line, lines)
        next(validity_start_flag,"")
        validity_end_flag = takewhile(lambda line: '<ENDofVALIDITY>' not in line, validity_start_flag)
        validity = '\n'.join(validity_end_flag).strip()

        # extract the number of clauses by theme
        flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(flag_start,"")
        flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, flag_start)
        clause_counts = 0
        theme_counts = {theme: 0 for theme in themes}
        for line in flag_end:
            if not line: 
                continue  
            key = line.split('|')[0]
            if key not in theme_dict:
                continue
            theme = theme_dict[key]
            if theme in themes:
                theme_counts[theme] += 1
            clause_counts += 1

        return [title, validity, clause_counts] +  list(theme_counts.values())

def output_all(file_path_x, files_x):
    # only consider files with start dates 2008-2017
    #if files_x[0:4]=='2008':
    if files_x[0:4]=='2008' or files_x[0:4]=='2009' or files_x[0:4]=='2010' or files_x[0:4]=='2011' or files_x[0:4]=='2012' \
    or files_x[0:4]=='2013' or files_x[0:4]=='2014' or files_x[0:4]=='2015' or files_x[0:4]=='2016' or files_x[0:4]=='2017':
        # contract identifier
        contract_id = [files_x[-15:-4]]
        if len(files_x[-15:-4])!=11:
            pass
        text = extract_all(os.path.join(file_path_x, files_x))
        output = contract_id + text
        pair_line = ('|').join(str(x) for x in output)
        # save info for contract as a single new line
        with io.open(path_txt,'a',encoding='utf8') as f:
            f.write(pair_line + "\n")

In [8]:
# rewrites output file
path_txt = os.path.join(cba_path, "contract_data.txt")
with io.open(path_txt,'w',encoding='utf8') as f:
    header = 'contract_id|title|validity|clause_count|' + '|'.join(themes)
    f.write(header + '\n')
    
# loops over each contract
for idx, files in enumerate(os.listdir(file_path)):
    if idx % 1000 == 0:
        print("Looping through file ", files)
    output_all(file_path, files)

Looping through file  2009_11_01__2010_033261.txt
Looping through file  2009_06_01__2010_009417.txt
Looping through file  2009_10_01__2009_057913.txt
Looping through file  2009_09_01__2010_055133.txt
Looping through file  2009_11_01__2009_061078.txt
Looping through file  2009_03_20__2009_040361.txt
Looping through file  2009_04_01__2009_030683.txt
Looping through file  2009_05_01__2009_040365.txt
Looping through file  2009_04_01__2009_014862.txt
Looping through file  2009_04_01__2009_010972.txt
Looping through file  2009_01_01__2009_052824.txt
Looping through file  2009_05_01__2009_023162.txt
Looping through file  2009_08_01__2009_064476.txt
Looping through file  2009_12_18__2010_051882.txt
Looping through file  2009_02_06__2009_003866.txt
Looping through file  2009_05_01__2009_018184.txt
Looping through file  2009_09_22__2009_046008.txt
Looping through file  2009_06_01__2009_031139.txt
Looping through file  2009_04_01__2009_043786.txt
Looping through file  2009_03_24__2009_031070.txt


In [9]:
df = pd.read_csv('output/contract_data.txt', sep='|')
validity_dummies = pd.get_dummies(df['validity'])
df = pd.concat([df, validity_dummies], axis=1)
df = df[df['title'].str.contains('Extrato Acordo')]
df.head()



Unnamed: 0,contract_id,title,validity,clause_count,Wages,Health,Union,Safety / Injury / Disability,Work Adaptation / Training,Work Time,...,Retirement,Work Environment / Harassment,Family,Dismissals / Transfers,Fees,Staffing / Hiring / Outsourcing,Other,Equality / Fairness,carimbo,semvalorlegal
0,2010_033261,Mediador - Extrato Acordo Coletivo,semvalorlegal,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2009_055971,Mediador - Extrato Acordo Coletivo,carimbo,13,3,1,1,1,0,3,...,0,0,0,0,0,0,2,0,1,0
3,2010_000037,Mediador - Extrato Acordo Coletivo,semvalorlegal,58,11,4,5,5,3,10,...,1,1,3,1,0,7,2,0,0,1
4,2009_061635,Mediador - Extrato Acordo Coletivo,carimbo,12,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,1,0
6,2010_031954,Mediador - Extrato Acordo Coletivo,carimbo,13,2,0,3,0,0,3,...,0,0,1,0,0,0,0,0,1,0


In [13]:
# columns to be used for analysis
X = df[['clause_count']]
y = df[['carimbo']]

# splits the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=253)

# performs OLS regression and summarizes regression
olsReg = sm.OLS(y_train,X_train).fit()
print(olsReg.summary())

# uses regression to predict with test data
y_pred = olsReg.predict(X_test)

# compares the predicted values to the actual values to evaluate model performance
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('R-squared:', r2)
print('Mean squared error:', mse)

                                 OLS Regression Results                                
Dep. Variable:                carimbo   R-squared (uncentered):                -238.367
Model:                            OLS   Adj. R-squared (uncentered):           -238.373
Method:                 Least Squares   F-statistic:                         -3.760e+04
Date:                Tue, 18 Apr 2023   Prob (F-statistic):                        1.00
Time:                        10:48:32   Log-Likelihood:                         -38236.
No. Observations:               37755   AIC:                                  7.647e+04
Df Residuals:                   37754   BIC:                                  7.648e+04
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------

In [16]:
from sklearn.discriminant_analysis import StandardScaler

# columns to be used for analysis
X = df[['clause_count', 'Health', 'Union', 'Safety / Injury / Disability', 'Work Adaptation / Training', 'Work Time', 'Incentives', 'Food / Education / Housing', 'Contract Agreement', 'Retirement', 'Work Environment / Harassment', 'Family', 'Dismissals / Transfers', 'Fees', 'Staffing / Hiring / Outsourcing', 'Equality / Fairness']]
y = df[['carimbo']]

# normalizes the predictor variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# splits the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.20, random_state=253)

# performs OLS regression and summarizes regression
olsReg = sm.OLS(y_train,X_train).fit()
print(olsReg.summary())

# uses regression to predict with test data
y_pred = olsReg.predict(X_test)

# compares the predicted values to the actual values to evaluate model performance
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('R-squared:', r2)
print('Mean squared error:', mse)

                                 OLS Regression Results                                
Dep. Variable:                carimbo   R-squared (uncentered):                -357.523
Model:                            OLS   Adj. R-squared (uncentered):           -357.675
Method:                 Least Squares   F-statistic:                             -2352.
Date:                Tue, 18 Apr 2023   Prob (F-statistic):                        1.00
Time:                        10:50:18   Log-Likelihood:                         -45863.
No. Observations:               37755   AIC:                                  9.176e+04
Df Residuals:                   37739   BIC:                                  9.189e+04
Df Model:                          16                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------