## Load Libraries

In [1]:
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm

warnings.filterwarnings('ignore')

## Import Cleaned Investigation Data

In [2]:
inv_df = pd.read_csv('Cleaned Investigation Data.csv')

In [3]:
# Check
rows = len(inv_df)
print(f"There are {rows} observations")

There are 1116 observations


## Create Exogenous and Engodenous variables

### 1. Exogenous (Features)

In [4]:
X = inv_df[[
    # General
    'investigation_duration', 
    'time_between_deferral_start_and_investigation', 
    
    # Number of (critical) actions & completed (critical) actions
    'number_of_actions', 
    'actions_completed_count',
    'critical_actions_count', 
    'critical_actions_completed_count',
    
    # Cause type
#     '5whys_cause_type_count',
    'systemic_cause_identified', 
    'physical_cause_identified', 
#     'human_cause_identified',
    
    # Volume
    'deferral_volume_net',
    
    # Years - Exclude 2013
#     'year_2014', 'year_2015', 
#     'year_2016', 'year_2017', 'year_2018', 
    'year_2019', 'year_2020', 'year_2021', 
    'year_2022', 'year_2023',
    
    # Region - Drop 'APAC'
    'AGT','GOMC', 
    'ME', 'NS', 'TT',
    
    # Choke - Drop 'Export'
    'Plant Facility', 'Reservoir', 'Well',
    
    # Investigation Type
    '5 Whys','Cause Map',
]]

In [5]:
X = sm.add_constant(X)

### 2. Endogenous (Target)

### 1. Binarize recurrent counts

In [6]:
columns_to_binarize = ['recurrent_count_1_365', 'recurrent_count_90_365', 'recurrent_count_180_365',
                       'recurrent_count_1_730', 'recurrent_count_90_730', 'recurrent_count_180_730']
inv_df[columns_to_binarize] = (inv_df[columns_to_binarize] > 0).astype(int)

### 2. Define variables for each target

In [7]:
y_1_365 = inv_df['recurrent_count_1_365']
y_90_365 = inv_df['recurrent_count_90_365']
y_180_365 = inv_df['recurrent_count_180_365']
y_1_730 = inv_df['recurrent_count_1_730']
y_90_730 = inv_df['recurrent_count_90_730']
y_180_730 = inv_df['recurrent_count_180_730']

# Regression Models

## 1-365 days

### 1. Log-Odds

In [8]:
model = sm.Logit(y_1_365, X)
result = model.fit(maxiter=1000)

Optimization terminated successfully.
         Current function value: 0.651718
         Iterations 7


In [9]:
summary_table = result.summary()
summary_table

0,1,2,3
Dep. Variable:,recurrent_count_1_365,No. Observations:,1116.0
Model:,Logit,Df Residuals:,1091.0
Method:,MLE,Df Model:,24.0
Date:,"Mon, 07 Aug 2023",Pseudo R-squ.:,0.05406
Time:,15:23:39,Log-Likelihood:,-727.32
converged:,True,LL-Null:,-768.88
Covariance Type:,nonrobust,LLR p-value:,1.913e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6427,0.702,-0.915,0.360,-2.019,0.733
investigation_duration,-0.0017,0.001,-2.943,0.003,-0.003,-0.001
time_between_deferral_start_and_investigation,-0.0016,0.001,-3.141,0.002,-0.003,-0.001
number_of_actions,0.1546,0.108,1.430,0.153,-0.057,0.367
actions_completed_count,-0.1914,0.113,-1.700,0.089,-0.412,0.029
critical_actions_count,-0.1784,0.139,-1.287,0.198,-0.450,0.093
critical_actions_completed_count,-0.2503,0.611,-0.409,0.682,-1.449,0.948
systemic_cause_identified,-2.2096,1.290,-1.713,0.087,-4.737,0.318
physical_cause_identified,2.1612,1.213,1.782,0.075,-0.216,4.539


### 2. Probabilities - at $\mu$

In [10]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='mean')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_1_365
Method:,dydx
At:,mean

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0004,0.0,-2.946,0.003,-0.001,-0.0
time_between_deferral_start_and_investigation,-0.0004,0.0,-3.172,0.002,-0.001,-0.0
number_of_actions,0.038,0.027,1.429,0.153,-0.014,0.09
actions_completed_count,-0.0471,0.028,-1.699,0.089,-0.101,0.007
critical_actions_count,-0.0439,0.034,-1.287,0.198,-0.111,0.023
critical_actions_completed_count,-0.0616,0.15,-0.409,0.682,-0.356,0.233
systemic_cause_identified,-0.5436,0.317,-1.713,0.087,-1.165,0.078
physical_cause_identified,0.5317,0.298,1.781,0.075,-0.053,1.117
deferral_volume_net,-0.0002,0.0,-1.143,0.253,-0.0,0.0
year_2019,0.0475,0.045,1.045,0.296,-0.042,0.136


### 3. Probabilities - at 0

In [11]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='zero')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_1_365
Method:,dydx
At:,zero

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0004,0.0,-2.402,0.016,-0.001,-6.96e-05
time_between_deferral_start_and_investigation,-0.0004,0.0,-2.636,0.008,-0.001,-9.13e-05
number_of_actions,0.0349,0.025,1.384,0.166,-0.015,0.084
actions_completed_count,-0.0432,0.027,-1.604,0.109,-0.096,0.01
critical_actions_count,-0.0403,0.032,-1.254,0.21,-0.103,0.023
critical_actions_completed_count,-0.0565,0.139,-0.408,0.683,-0.328,0.215
systemic_cause_identified,-0.4991,0.312,-1.6,0.11,-1.111,0.112
physical_cause_identified,0.4881,0.297,1.644,0.1,-0.094,1.07
deferral_volume_net,-0.0001,0.0,-1.076,0.282,-0.0,0.0
year_2019,0.0436,0.042,1.029,0.304,-0.039,0.127


## 90-365 days

### 1. Log-Odds

In [12]:
model = sm.Logit(y_90_365, X)
result = model.fit(maxiter=1000)

Optimization terminated successfully.
         Current function value: 0.582402
         Iterations 8


In [13]:
summary_table = result.summary()
summary_table

0,1,2,3
Dep. Variable:,recurrent_count_90_365,No. Observations:,1116.0
Model:,Logit,Df Residuals:,1091.0
Method:,MLE,Df Model:,24.0
Date:,"Mon, 07 Aug 2023",Pseudo R-squ.:,0.04918
Time:,15:23:39,Log-Likelihood:,-649.96
converged:,True,LL-Null:,-683.58
Covariance Type:,nonrobust,LLR p-value:,5.69e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.8720,0.748,-1.166,0.244,-2.338,0.594
investigation_duration,-0.0011,0.001,-1.770,0.077,-0.002,0.000
time_between_deferral_start_and_investigation,-0.0026,0.001,-3.283,0.001,-0.004,-0.001
number_of_actions,-0.0275,0.111,-0.247,0.805,-0.246,0.191
actions_completed_count,-0.0370,0.117,-0.316,0.752,-0.266,0.192
critical_actions_count,0.0400,0.142,0.281,0.779,-0.239,0.319
critical_actions_completed_count,-1.1877,0.951,-1.249,0.212,-3.052,0.677
systemic_cause_identified,-1.5724,1.275,-1.233,0.217,-4.071,0.927
physical_cause_identified,1.1832,1.113,1.063,0.288,-0.998,3.364


### 2. Probabilities - at $\mu$

In [14]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='mean')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_90_365
Method:,dydx
At:,mean

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0002,0.0,-1.773,0.076,-0.0,2.22e-05
time_between_deferral_start_and_investigation,-0.0005,0.0,-3.543,0.0,-0.001,-0.0
number_of_actions,-0.0054,0.022,-0.247,0.805,-0.048,0.037
actions_completed_count,-0.0072,0.023,-0.316,0.752,-0.052,0.037
critical_actions_count,0.0078,0.028,0.281,0.779,-0.047,0.062
critical_actions_completed_count,-0.2313,0.184,-1.258,0.208,-0.592,0.129
systemic_cause_identified,-0.3062,0.248,-1.237,0.216,-0.791,0.179
physical_cause_identified,0.2304,0.216,1.065,0.287,-0.193,0.654
deferral_volume_net,-0.0001,0.0,-0.928,0.353,-0.0,0.0
year_2019,0.0687,0.038,1.826,0.068,-0.005,0.142


### 3. Probabilities - at 0

In [15]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='zero')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_90_365
Method:,dydx
At:,zero

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0002,0.0,-1.509,0.131,-0.001,6.74e-05
time_between_deferral_start_and_investigation,-0.0006,0.0,-2.34,0.019,-0.001,-8.95e-05
number_of_actions,-0.0057,0.023,-0.245,0.806,-0.051,0.04
actions_completed_count,-0.0077,0.024,-0.315,0.752,-0.056,0.04
critical_actions_count,0.0083,0.03,0.279,0.781,-0.05,0.067
critical_actions_completed_count,-0.2469,0.212,-1.167,0.243,-0.662,0.168
systemic_cause_identified,-0.3269,0.285,-1.149,0.251,-0.885,0.231
physical_cause_identified,0.246,0.246,0.999,0.318,-0.236,0.728
deferral_volume_net,-0.0001,0.0,-0.863,0.388,-0.0,0.0
year_2019,0.0733,0.045,1.62,0.105,-0.015,0.162


# 180-365 days

### 1. Log-Odds

In [16]:
model = sm.Logit(y_180_365, X)
result = model.fit(maxiter=1000)

         Current function value: 0.461947
         Iterations: 1000


In [17]:
summary_table = result.summary()
summary_table

0,1,2,3
Dep. Variable:,recurrent_count_180_365,No. Observations:,1116.0
Model:,Logit,Df Residuals:,1091.0
Method:,MLE,Df Model:,24.0
Date:,"Mon, 07 Aug 2023",Pseudo R-squ.:,0.06223
Time:,15:23:39,Log-Likelihood:,-515.53
converged:,False,LL-Null:,-549.75
Covariance Type:,nonrobust,LLR p-value:,3.777e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.9019,0.947,-2.009,0.045,-3.758,-0.046
investigation_duration,-0.0011,0.001,-1.457,0.145,-0.003,0.000
time_between_deferral_start_and_investigation,-0.0015,0.001,-1.899,0.058,-0.003,4.73e-05
number_of_actions,0.0343,0.128,0.269,0.788,-0.216,0.284
actions_completed_count,-0.0766,0.134,-0.573,0.567,-0.339,0.185
critical_actions_count,0.2344,0.244,0.961,0.337,-0.244,0.713
critical_actions_completed_count,-0.9332,1.235,-0.755,0.450,-3.354,1.488
systemic_cause_identified,-33.6394,1.02e+07,-3.29e-06,1.000,-2e+07,2e+07
physical_cause_identified,0.6694,1.323,0.506,0.613,-1.924,3.263


### 2. Probabilities - at $\mu$

In [18]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='mean')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_180_365
Method:,dydx
At:,mean

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-4.453e-05,9.809,-4.54e-06,1.0,-19.225,19.225
time_between_deferral_start_and_investigation,-5.986e-05,13.188,-4.54e-06,1.0,-25.847,25.847
number_of_actions,0.0014,307.257,4.54e-06,1.0,-602.211,602.214
actions_completed_count,-0.0031,687.106,-4.54e-06,1.0,-1346.707,1346.701
critical_actions_count,0.0095,2103.003,4.54e-06,1.0,-4121.8,4121.82
critical_actions_completed_count,-0.038,8370.632,-4.54e-06,1.0,-16400.0,16400.0
systemic_cause_identified,-1.3698,270000.0,-5.06e-06,1.0,-530000.0,530000.0
physical_cause_identified,0.0273,6004.807,4.54e-06,1.0,-11800.0,11800.0
deferral_volume_net,-1.034e-06,0.228,-4.54e-06,1.0,-0.446,0.446
year_2019,0.0046,1002.878,4.54e-06,1.0,-1965.601,1965.61


### 3. Probabilities - at 0

In [19]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='zero')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_180_365
Method:,dydx
At:,zero

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0001,0.0,-0.991,0.322,-0.0,0.0
time_between_deferral_start_and_investigation,-0.0002,0.0,-1.156,0.248,-0.0,0.0
number_of_actions,0.0039,0.015,0.266,0.79,-0.025,0.032
actions_completed_count,-0.0087,0.016,-0.536,0.592,-0.04,0.023
critical_actions_count,0.0265,0.034,0.78,0.435,-0.04,0.093
critical_actions_completed_count,-0.1055,0.158,-0.669,0.503,-0.414,0.203
systemic_cause_identified,-3.802,1160000.0,-3.29e-06,1.0,-2270000.0,2270000.0
physical_cause_identified,0.0757,0.16,0.472,0.637,-0.239,0.39
deferral_volume_net,-2.869e-06,6.92e-05,-0.041,0.967,-0.0,0.0
year_2019,0.0126,0.026,0.477,0.633,-0.039,0.065


# 1-730 days

In [20]:
model = sm.Logit(y_1_730, X)
result = model.fit(maxiter=1000)

Optimization terminated successfully.
         Current function value: 0.647358
         Iterations 7


### 1. Log-Odds

In [21]:
summary_table = result.summary()
summary_table

0,1,2,3
Dep. Variable:,recurrent_count_1_730,No. Observations:,1116.0
Model:,Logit,Df Residuals:,1091.0
Method:,MLE,Df Model:,24.0
Date:,"Mon, 07 Aug 2023",Pseudo R-squ.:,0.06557
Time:,15:23:39,Log-Likelihood:,-722.45
converged:,True,LL-Null:,-773.15
Covariance Type:,nonrobust,LLR p-value:,1.736e-11

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.3957,0.679,-0.583,0.560,-1.726,0.934
investigation_duration,-0.0016,0.001,-2.952,0.003,-0.003,-0.001
time_between_deferral_start_and_investigation,-0.0021,0.001,-3.887,0.000,-0.003,-0.001
number_of_actions,0.1124,0.108,1.038,0.299,-0.100,0.325
actions_completed_count,-0.1570,0.113,-1.393,0.164,-0.378,0.064
critical_actions_count,-0.1376,0.137,-1.005,0.315,-0.406,0.131
critical_actions_completed_count,-0.1729,0.537,-0.322,0.747,-1.225,0.880
systemic_cause_identified,-1.8499,1.281,-1.444,0.149,-4.361,0.662
physical_cause_identified,1.7700,1.210,1.462,0.144,-0.602,4.142


### 2. Probabilities - at $\mu$

In [22]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='mean')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_1_730
Method:,dydx
At:,mean

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0004,0.0,-2.952,0.003,-0.001,-0.0
time_between_deferral_start_and_investigation,-0.0005,0.0,-3.891,0.0,-0.001,-0.0
number_of_actions,0.0281,0.027,1.038,0.299,-0.025,0.081
actions_completed_count,-0.0392,0.028,-1.392,0.164,-0.094,0.016
critical_actions_count,-0.0344,0.034,-1.005,0.315,-0.101,0.033
critical_actions_completed_count,-0.0432,0.134,-0.322,0.747,-0.306,0.22
systemic_cause_identified,-0.4624,0.32,-1.444,0.149,-1.09,0.165
physical_cause_identified,0.4424,0.303,1.462,0.144,-0.151,1.035
deferral_volume_net,-8.592e-05,0.0,-0.688,0.492,-0.0,0.0
year_2019,0.0876,0.047,1.868,0.062,-0.004,0.18


### 3. Probabilities - at 0

In [23]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='mean')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_1_730
Method:,dydx
At:,mean

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0004,0.0,-2.952,0.003,-0.001,-0.0
time_between_deferral_start_and_investigation,-0.0005,0.0,-3.891,0.0,-0.001,-0.0
number_of_actions,0.0281,0.027,1.038,0.299,-0.025,0.081
actions_completed_count,-0.0392,0.028,-1.392,0.164,-0.094,0.016
critical_actions_count,-0.0344,0.034,-1.005,0.315,-0.101,0.033
critical_actions_completed_count,-0.0432,0.134,-0.322,0.747,-0.306,0.22
systemic_cause_identified,-0.4624,0.32,-1.444,0.149,-1.09,0.165
physical_cause_identified,0.4424,0.303,1.462,0.144,-0.151,1.035
deferral_volume_net,-8.592e-05,0.0,-0.688,0.492,-0.0,0.0
year_2019,0.0876,0.047,1.868,0.062,-0.004,0.18


## 90-730 days

In [24]:
model = sm.Logit(y_90_730, X)
result = model.fit(maxiter=1000)

Optimization terminated successfully.
         Current function value: 0.619796
         Iterations 8


### 1. Log-Odds

In [25]:
summary_table = result.summary()
summary_table

0,1,2,3
Dep. Variable:,recurrent_count_90_730,No. Observations:,1116.0
Model:,Logit,Df Residuals:,1091.0
Method:,MLE,Df Model:,24.0
Date:,"Mon, 07 Aug 2023",Pseudo R-squ.:,0.06599
Time:,15:23:39,Log-Likelihood:,-691.69
converged:,True,LL-Null:,-740.56
Covariance Type:,nonrobust,LLR p-value:,7.263e-11

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6994,0.694,-1.007,0.314,-2.060,0.662
investigation_duration,-0.0011,0.001,-1.847,0.065,-0.002,6.52e-05
time_between_deferral_start_and_investigation,-0.0035,0.001,-4.272,0.000,-0.005,-0.002
number_of_actions,-0.0233,0.102,-0.229,0.819,-0.223,0.176
actions_completed_count,-0.0151,0.107,-0.142,0.887,-0.224,0.194
critical_actions_count,0.0186,0.137,0.135,0.892,-0.251,0.288
critical_actions_completed_count,-0.8612,0.761,-1.131,0.258,-2.353,0.631
systemic_cause_identified,-0.9523,1.202,-0.792,0.428,-3.308,1.404
physical_cause_identified,0.4609,1.066,0.432,0.666,-1.629,2.551


### 2. Probabilities - at $\mu$

In [26]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='mean')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_90_730
Method:,dydx
At:,mean

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0002,0.0,-1.849,0.064,-0.0,1.42e-05
time_between_deferral_start_and_investigation,-0.0008,0.0,-4.606,0.0,-0.001,-0.0
number_of_actions,-0.0052,0.023,-0.229,0.819,-0.049,0.039
actions_completed_count,-0.0033,0.024,-0.142,0.887,-0.05,0.043
critical_actions_count,0.0041,0.03,0.135,0.892,-0.055,0.064
critical_actions_completed_count,-0.1907,0.168,-1.136,0.256,-0.52,0.138
systemic_cause_identified,-0.2109,0.266,-0.793,0.428,-0.732,0.31
physical_cause_identified,0.1021,0.236,0.432,0.665,-0.361,0.565
deferral_volume_net,-9.746e-06,0.0,-0.087,0.931,-0.0,0.0
year_2019,0.1186,0.042,2.854,0.004,0.037,0.2


### 3. Probabilities - at 0

In [27]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='zero')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

0,1
Dep. Variable:,recurrent_count_90_730
Method:,dydx
At:,zero

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
investigation_duration,-0.0002,0.0,-1.647,0.1,-0.001,4.5e-05
time_between_deferral_start_and_investigation,-0.0008,0.0,-3.067,0.002,-0.001,-0.0
number_of_actions,-0.0052,0.023,-0.228,0.819,-0.05,0.039
actions_completed_count,-0.0034,0.024,-0.142,0.887,-0.05,0.043
critical_actions_count,0.0041,0.031,0.135,0.893,-0.056,0.064
critical_actions_completed_count,-0.191,0.174,-1.096,0.273,-0.532,0.15
systemic_cause_identified,-0.2112,0.272,-0.777,0.437,-0.744,0.321
physical_cause_identified,0.1022,0.239,0.428,0.668,-0.365,0.57
deferral_volume_net,-9.76e-06,0.0,-0.087,0.931,-0.0,0.0
year_2019,0.1187,0.049,2.414,0.016,0.022,0.215


## 180-730

In [28]:
model = sm.Logit(y_180_730, X)
result = model.fit(maxiter=1000)

         Current function value: inf
         Iterations: 1000


LinAlgError: Singular matrix

### 1. Log-Odds

In [None]:
summary_table = result.summary()
summary_table

### 2. Probabilities - at $\mu$

In [None]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='mean')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()

### 3. Probabilities - at 0

In [None]:
# Get the marginal effects
marginal_effects = result.get_margeff(at='zero')

# Display both the original summary table and the marginal effects summary table
marginal_effects.summary()