In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Read the raw dataset from disk.
file_path = 'input.csv'
data = pd.read_csv(file_path)

# Natural-log transform each raw column into a new "Log_*" column. A tiny
# offset is added first so that zero entries do not evaluate to log(0) = -inf.
# NOTE(review): with this offset an exact zero becomes log(1e-10) ~= -23.03,
# which may sit far from the rest of the values -- confirm this is intended.
for _log_col, _raw_col in (
    ('Log_MPA', 'MPA'),
    ('Log_Fishery_Consumption', 'FisheryConsumption'),
    ('Log_Share_Plastic_Pollution', 'Share of global plastics emitted to ocean'),
):
    data[_log_col] = np.log(data[_raw_col] + 1e-10)

# Preview the first rows of the three transformed columns.
print(data[['Log_MPA', 'Log_Fishery_Consumption', 'Log_Share_Plastic_Pollution']].head())

# Design matrix for the collinearity check: the two log-transformed
# predictors with an intercept column added by statsmodels.
X = sm.add_constant(data[['Log_MPA', 'Log_Share_Plastic_Pollution']])

# One variance inflation factor per design-matrix column (intercept included),
# collected into a two-column table: feature name and its VIF.
_design = X.values
vif_data = pd.DataFrame({
    'feature': X.columns,
    'VIF': [
        variance_inflation_factor(_design, col_idx)
        for col_idx in range(_design.shape[1])
    ],
})

print(vif_data)

# Standardise the two log-transformed predictors with scikit-learn's
# StandardScaler before fitting, so their coefficients are on a common scale.
scaler = StandardScaler()
predictor_names = ['Log_MPA', 'Log_Share_Plastic_Pollution']
X_scaled = scaler.fit_transform(data[predictor_names])

# Add the intercept column to the scaled predictors (prepended by default).
X_scaled = sm.add_constant(X_scaled)

# Fit ordinary least squares of log fishery consumption on the scaled predictors.
model_scaled = sm.OLS(data['Log_Fishery_Consumption'], X_scaled).fit()

# The scaler hands back an unlabelled array, so the summary would otherwise
# list the predictors under generic names (x1, x2). Supplying the names
# explicitly keeps the printed coefficient table readable without changing
# X_scaled or the fitted model itself.
print(model_scaled.summary(xname=['const'] + predictor_names))


    Log_MPA  Log_Fishery_Consumption  Log_Share_Plastic_Pollution
0 -5.298317                12.903451                    -2.432652
1  1.043452                 9.616572                    -1.833943
2  2.422144                11.229992                    -6.550527
3  2.465384                13.627598                    -0.861858
4 -1.703749                 8.066208                    -8.496435
                       feature       VIF
0                        const  1.689234
1                      Log_MPA  1.000010
2  Log_Share_Plastic_Pollution  1.000010
                               OLS Regression Results                              
Dep. Variable:     Log_Fishery_Consumption   R-squared:                       0.111
Model:                                 OLS   Adj. R-squared:                  0.097
Method:                      Least Squares   F-statistic:                     7.981
Date:                     Tue, 30 Jul 2024   Prob (F-statistic):           0.000541
Time:               