In [59]:
import pandas as pd
import os
import io
from itertools import dropwhile, takewhile
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [60]:
# sets the output directory
cba_path = os.path.join(".", "clause_data")
if not os.path.isdir(cba_path):
    os.mkdir(cba_path)

# sets the input directory
file_path = '/Users/calvineng/Dropbox/Calvin_Eng/cba_text_analysis/cba_txt_2009'

In [61]:
clause_groups = pd.read_csv('clause_groups/clause_groups_OLD_1.csv', index_col='Clause Group')
themes = list(map(str, clause_groups['Theme'].unique()))
theme_dict = clause_groups['Theme'].to_dict()

def extract_all(file_path):
    with io.open(file_path, 'r') as f:
        # removes white space from the ends of lines
        lines = (line.strip() for line in f)  
        
        # retrieves the title
        title_start_flag = dropwhile(lambda line: '<STARTofTITLE>' not in line, lines)
        next(title_start_flag,"")
        title_end_flag = takewhile(lambda line: '<ENDofTITLE>' not in line, title_start_flag)
        title = '\n'.join(title_end_flag).strip()

        # retrieves the validity
        validity_start_flag = dropwhile(lambda line: '<STARTofVALIDITY>' not in line, lines)
        next(validity_start_flag,"")
        validity_end_flag = takewhile(lambda line: '<ENDofVALIDITY>' not in line, validity_start_flag)
        validity = '\n'.join(validity_end_flag).strip()

        # extract the number of clauses by theme
        flag_start = dropwhile(lambda line: '<STARTofCLAUSES>' not in line, lines)
        next(flag_start,"")
        flag_end = takewhile(lambda line: '<ENDofCLAUSES>' not in line, flag_start)
        clause_counts = 0
        theme_counts = {theme: 0 for theme in themes}
        for line in flag_end:
            if not line: 
                continue  
            key = line.split('|')[0]
            if key not in theme_dict:
                continue
            theme = theme_dict[key]
            if theme in themes:
                theme_counts[theme] += 1
            clause_counts += 1

        return [title, validity, clause_counts] +  list(theme_counts.values())

def output_all(file_path_x, files_x):
    # only consider files with start dates 2008-2017
    #if files_x[0:4]=='2008':
    if files_x[0:4]=='2008' or files_x[0:4]=='2009' or files_x[0:4]=='2010' or files_x[0:4]=='2011' or files_x[0:4]=='2012' \
    or files_x[0:4]=='2013' or files_x[0:4]=='2014' or files_x[0:4]=='2015' or files_x[0:4]=='2016' or files_x[0:4]=='2017':
        # contract identifier
        contract_id = [files_x[-15:-4]]
        if len(files_x[-15:-4])!=11:
            pass
        text = extract_all(os.path.join(file_path_x, files_x))
        output = contract_id + text
        pair_line = ('|').join(str(x) for x in output)
        # save info for contract as a single new line
        with io.open(path_txt,'a',encoding='utf8') as f:
            f.write(pair_line + "\n")

In [62]:
# rewrites output file
path_txt = os.path.join(cba_path, "clause_counts.csv")
with io.open(path_txt,'w',encoding='utf8') as f:
    header = 'contract_id|title|validity|Clause Count|' + '|'.join(themes)
    f.write(header + '\n')
    
# loops over each contract
for idx, files in enumerate(os.listdir(file_path)):
    if idx % 1000 == 0:
        print("Looping through file ", files)
    output_all(file_path, files)

Looping through file  2009_11_01__2010_033261.txt
Looping through file  2009_06_01__2010_009417.txt
Looping through file  2009_10_01__2009_057913.txt
Looping through file  2009_09_01__2010_055133.txt
Looping through file  2009_11_01__2009_061078.txt
Looping through file  2009_03_20__2009_040361.txt
Looping through file  2009_04_01__2009_030683.txt
Looping through file  2009_05_01__2009_040365.txt
Looping through file  2009_04_01__2009_014862.txt
Looping through file  2009_04_01__2009_010972.txt
Looping through file  2009_01_01__2009_052824.txt
Looping through file  2009_05_01__2009_023162.txt
Looping through file  2009_08_01__2009_064476.txt
Looping through file  2009_12_18__2010_051882.txt
Looping through file  2009_02_06__2009_003866.txt
Looping through file  2009_05_01__2009_018184.txt
Looping through file  2009_09_22__2009_046008.txt
Looping through file  2009_06_01__2009_031139.txt
Looping through file  2009_04_01__2009_043786.txt
Looping through file  2009_03_24__2009_031070.txt


In [63]:
# reads in the data
df = pd.read_csv('clause_data/clause_counts.csv', sep='|')

# filters the data
df = df[df['title'].str.contains('Extrato Acordo Coletivo')]
df = df.loc[(df['Clause Count'] > 2)]

# calculates percentage of clause count for each theme
for theme in themes:
    df[theme + ' Percent'] = df[theme] / df['Clause Count']

# creates dummy variables for the 'validity' column
validity_dummies = pd.get_dummies(df['validity'])
df = pd.concat([df, validity_dummies], axis=1)

# reindexes the dataframe with the default integer index
df = df.reset_index(drop=True)

validity_counts = df['validity'].value_counts()
print(validity_counts)
print()

print(f'There are {len(df.index)} documents in the dataframe.')
df.head(10)

carimbo          26838
semvalorlegal    11563
Name: validity, dtype: int64

There are 38401 documents in the dataframe.


Unnamed: 0,contract_id,title,validity,Clause Count,Wages,Health,Union,Safety / Injury / Disability,Work Adaptation / Training,Work Time,...,Retirement Percent,Work Environment / Harassment Percent,Family Percent,Dismissals / Transfers Percent,Fees Percent,Staffing / Hiring / Outsourcing Percent,Other Percent,Equality / Fairness Percent,carimbo,semvalorlegal
0,2009_055971,Mediador - Extrato Acordo Coletivo,carimbo,13,3,1,1,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,1,0
1,2010_000037,Mediador - Extrato Acordo Coletivo,semvalorlegal,58,11,4,5,5,3,10,...,0.017241,0.017241,0.051724,0.017241,0.0,0.12069,0.034483,0.0,0,1
2,2009_061635,Mediador - Extrato Acordo Coletivo,carimbo,12,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0
3,2010_031954,Mediador - Extrato Acordo Coletivo,carimbo,13,2,0,3,0,0,3,...,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,1,0
4,2009_032012,Mediador - Extrato Acordo Coletivo,carimbo,7,0,0,0,0,0,3,...,0.0,0.0,0.0,0.0,0.0,0.142857,0.285714,0.0,1,0
5,2010_008949,Mediador - Extrato Acordo Coletivo,semvalorlegal,35,5,1,8,2,1,6,...,0.0,0.0,0.142857,0.0,0.0,0.0,0.057143,0.0,0,1
6,2009_058078,Mediador - Extrato Acordo Coletivo,semvalorlegal,8,8,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
7,2010_008791,Mediador - Extrato Acordo Coletivo,carimbo,33,5,1,8,1,1,6,...,0.0,0.030303,0.121212,0.0,0.0,0.0,0.060606,0.0,1,0
8,2009_029474,Mediador - Extrato Acordo Coletivo,carimbo,81,9,3,3,7,0,13,...,0.0,0.012346,0.024691,0.049383,0.0,0.012346,0.444444,0.0,1,0
9,2009_032829,Mediador - Extrato Acordo Coletivo,carimbo,8,1,0,0,0,0,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


In [64]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# columns to be used for analysis
X = df[['Clause Count']]
X = sm.add_constant(X)
y = df[['carimbo']]

# splits the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=253)

# performs logistic regression and summarizes regression
logReg = sm.Logit(y_train, X_train).fit()
print(logReg.summary())

# uses regression to predict with test data
y_pred = logReg.predict(X_test)
y_pred_class = [1 if p >= 0.5 else 0 for p in y_pred]

# compares the predicted values to actual values
accuracy = accuracy_score(y_test, y_pred_class)
print('Accuracy:', accuracy)

Optimization terminated successfully.
         Current function value: 0.250529
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                carimbo   No. Observations:                28800
Model:                          Logit   Df Residuals:                    28798
Method:                           MLE   Df Model:                            1
Date:                Fri, 28 Apr 2023   Pseudo R-squ.:                     inf
Time:                        12:36:47   Log-Likelihood:                -7215.2
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.9218      0.020     46.805      0.000       0.883       0.960
Clause Count    -0.0036

  return 1 - self.llf/self.llnull


In [65]:
from sklearn.preprocessing import MinMaxScaler

# columns to be used for analysis (removed 'Wages Percent' and 'Other Percent')
X = df[['Clause Count', 'Health Percent', 'Union Percent', 'Safety / Injury / Disability Percent',
       'Work Adaptation / Training Percent', 'Work Time Percent', 'Incentives Percent', 'Food / Education / Housing Percent',
       'Contract Agreement Percent', 'Retirement Percent', 'Work Environment / Harassment Percent', 'Family Percent',
       'Dismissals / Transfers Percent', 'Fees Percent', 'Staffing / Hiring / Outsourcing Percent', 'Equality / Fairness Percent']]
y = df['carimbo']

# normalizes the predictor variables
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# splits the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state=253)

# performs logistic regression and summarizes regression
logitReg = sm.Logit(y_train, X_train).fit()
print(logitReg.summary())

# uses regression to predict with test data
y_pred = logitReg.predict(X_test)
y_pred_class = [1 if p >= 0.5 else 0 for p in y_pred]

# compares the predicted values to actual values
accuracy = accuracy_score(y_test, y_pred_class)
print('Accuracy:', accuracy)

Optimization terminated successfully.
         Current function value: 0.281074
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                carimbo   No. Observations:                28800
Model:                          Logit   Df Residuals:                    28784
Method:                           MLE   Df Model:                           15
Date:                Fri, 28 Apr 2023   Pseudo R-squ.:                     inf
Time:                        12:36:47   Log-Likelihood:                -8094.9
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                              coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------
Clause Count                                0.3659

  return 1 - self.llf/self.llnull


In [66]:
# reads in the data
df = pd.read_csv('clause_data/clause_counts.csv', sep='|')

# filters the data
df = df[df['title'].str.contains('Extrato Acordo Coletivo')]
df = df.loc[(df['Clause Count'] > 2)]

# calculates percentage of clause count for each theme
for theme in themes:
    df[theme + ' Percent'] = df[theme] / df['Clause Count']

# creates dummy variables for the 'validity' column
validity_dummies = pd.get_dummies(df['validity'])
df = pd.concat([df, validity_dummies], axis=1)

# create two separate dataframes for rows with 'validity' as 'carimbo' and 'semvalorlegal'
df_carimbo = df[df['validity'] == 'carimbo']
df_semvalorlegal = df[df['validity'] == 'semvalorlegal']

# find the length of each dataframe
len_carimbo = len(df_carimbo)
len_semvalorlegal = len(df_semvalorlegal)

# if the length of the two dataframes is not equal, randomly drop rows from the larger dataframe until they have the same length
if len_carimbo > len_semvalorlegal:
    df_carimbo = df_carimbo.sample(n=len_semvalorlegal, random_state=253)
else:
    df_semvalorlegal = df_semvalorlegal.sample(n=len_carimbo, random_state=253)

# concatenate the two dataframes
df = pd.concat([df_carimbo, df_semvalorlegal])

# reindexes the dataframe with the default integer index
df = df.reset_index(drop=True)

validity_counts = df['validity'].value_counts()
print(validity_counts)
print()

print(f'There are {len(df.index)} documents in the dataframe.')
df.head()

carimbo          11563
semvalorlegal    11563
Name: validity, dtype: int64

There are 23126 documents in the dataframe.


Unnamed: 0,contract_id,title,validity,Clause Count,Wages,Health,Union,Safety / Injury / Disability,Work Adaptation / Training,Work Time,...,Retirement Percent,Work Environment / Harassment Percent,Family Percent,Dismissals / Transfers Percent,Fees Percent,Staffing / Hiring / Outsourcing Percent,Other Percent,Equality / Fairness Percent,carimbo,semvalorlegal
0,2009_012553,Mediador - Extrato Acordo Coletivo,carimbo,13,3,0,0,0,0,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
1,2009_036326,Mediador - Extrato Acordo Coletivo,carimbo,36,6,1,4,3,1,3,...,0.055556,0.027778,0.055556,0.111111,0.0,0.083333,0.055556,0.0,1,0
2,2009_058102,Mediador - Extrato Acordo Coletivo,carimbo,5,5,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
3,2009_026066,Mediador - Extrato Acordo Coletivo,carimbo,5,2,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
4,2009_012655,Mediador - Extrato Acordo Coletivo,carimbo,4,0,0,0,0,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


In [67]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# columns to be used for analysis
X = df[['Clause Count']]
X = sm.add_constant(X)
y = df[['carimbo']]

# splits the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=253)

# performs logistic regression and summarizes regression
logReg = sm.Logit(y_train, X_train).fit_regularized()
print(logReg.summary())

# uses regression to predict with test data
y_pred = logReg.predict(X_test)
y_pred_class = [1 if p >= 0.5 else 0 for p in y_pred]

# compares the predicted values to actual values
accuracy = accuracy_score(y_test, y_pred_class)
print('Accuracy:', accuracy)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.6931472105643791
            Iterations: 1
            Function evaluations: 12
            Gradient evaluations: 1
                           Logit Regression Results                           
Dep. Variable:                carimbo   No. Observations:                17344
Model:                          Logit   Df Residuals:                    17342
Method:                           MLE   Df Model:                            1
Date:                Fri, 28 Apr 2023   Pseudo R-squ.:                     inf
Time:                        12:36:47   Log-Likelihood:                -12022.
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                   coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
  return 1 - self.llf/self.llnull


In [68]:
from sklearn.preprocessing import MinMaxScaler

# columns to be used for analysis (removed 'Wages Percent' and 'Other Percent')
X = df[['Clause Count', 'Health Percent', 'Union Percent', 'Safety / Injury / Disability Percent',
       'Work Adaptation / Training Percent', 'Work Time Percent', 'Incentives Percent', 'Food / Education / Housing Percent',
       'Contract Agreement Percent', 'Retirement Percent', 'Work Environment / Harassment Percent', 'Family Percent',
       'Dismissals / Transfers Percent', 'Fees Percent', 'Staffing / Hiring / Outsourcing Percent', 'Equality / Fairness Percent']]
y = df['carimbo']

# normalizes the predictor variables
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# splits the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state=253)

# performs logistic regression and summarizes regression
logitReg = sm.Logit(y_train, X_train).fit_regularized()
print(logitReg.summary())

# uses regression to predict with test data
y_pred = logitReg.predict(X_test)
y_pred_class = [1 if p >= 0.5 else 0 for p in y_pred]

# compares the predicted values to actual values
accuracy = accuracy_score(y_test, y_pred_class)
print('Accuracy:', accuracy)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.6931471805599766
            Iterations: 1
            Function evaluations: 12
            Gradient evaluations: 1
                           Logit Regression Results                           
Dep. Variable:                carimbo   No. Observations:                17344
Model:                          Logit   Df Residuals:                    17328
Method:                           MLE   Df Model:                           15
Date:                Fri, 28 Apr 2023   Pseudo R-squ.:                     inf
Time:                        12:36:47   Log-Likelihood:                -12022.
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                              coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------

  return 1 - self.llf/self.llnull
