# OLS example

In [8]:
# Standard library
import sys
# The path tweak must run BEFORE the reg_stat_inference import below:
# presumably that module lives in the parent directory of this notebook
# (TODO confirm the package location).
sys.path.append('..')  # Add the parent directory to sys.path

# Project-local helpers exercised by the cells in this notebook
from reg_stat_inference import (
    treat_regression_model,
    treat_multicollinearity,
    treat_pvalue
)

# Third-party libraries.
# NOTE(review): `sm` is not referenced in the cells visible here.
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import (fetch_california_housing, load_diabetes)


In [2]:
# Fetch the California housing dataset from scikit-learn as pandas objects
data = fetch_california_housing(as_frame=True)

# Feature matrix and target consumed by the OLS cells that follow
X, y = data.data, data.target

# Preview the first rows of the features
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [3]:
# Multicollinearity treatment with a VIF threshold of 5, OLS regression.
# The keyword is spelled `threshhold_vif` (double "h") on purpose: that is
# the spelling the call is made with in this notebook, so it is kept as-is.
result_multicollinearity = treat_multicollinearity(
    X,
    y,
    threshhold_vif=5,
    reg_type='OLS',
)

# Display the summary table of the resulting model
print(result_multicollinearity.model.summary())

                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                     3498.
Date:                Wed, 23 Aug 2023   Prob (F-statistic):               0.00
Time:                        19:13:56   Log-Likelihood:                -24167.
No. Observations:               20640   AIC:                         4.835e+04
Df Residuals:                   20632   BIC:                         4.841e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6855      0.100      6.853      0.0

In [4]:
# P-value treatment with a 0.05 threshold, OLS regression
result_p_values = treat_pvalue(
    X,
    y,
    threshold_pval=0.05,
    reg_type='OLS',
)

# Display the summary table of the resulting model
print(result_p_values.model.summary())

                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     4538.
Date:                Wed, 23 Aug 2023   Prob (F-statistic):               0.00
Time:                        19:13:56   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.526e+04
Df Residuals:                   20632   BIC:                         4.533e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -36.9175      0.658    -56.085      0.0

In [5]:
# Apply both treatments in a single call (VIF threshold 5, p-value
# threshold 0.05), still with an OLS regression
result = treat_regression_model(
    X,
    y,
    threshhold_vif=5,
    threshold_pval=0.05,
    reg_type='OLS',
)

# Display the summary table of the resulting model
print(result.model.summary())

                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                     3498.
Date:                Wed, 23 Aug 2023   Prob (F-statistic):               0.00
Time:                        19:13:57   Log-Likelihood:                -24167.
No. Observations:               20640   AIC:                         4.835e+04
Df Residuals:                   20632   BIC:                         4.841e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6855      0.100      6.853      0.0

# Logistic regression case

In [10]:
# Diabetes dataset: wrap the raw feature array in a DataFrame with named columns.
# NOTE: this cell rebinds `data`, `X`, `y` and `result` from the OLS section;
# re-run the earlier cells before going back to the OLS results.
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Binarise the target for logistic regression: 1 where the value is above
# the mean of y, 0 otherwise
y_binary = (y > y.mean()).astype(int)

# Combined treatment (VIF threshold 10, p-value threshold 0.1) with a
# logistic regression; the binary labels are passed as a one-column frame
result = treat_regression_model(
    X,
    pd.DataFrame({'target': y_binary}),
    threshhold_vif=10,
    threshold_pval=0.1,
    reg_type='logit',
)

print(result.model.summary())


                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  442
Model:                          Logit   Df Residuals:                      436
Method:                           MLE   Df Model:                            5
Date:                Wed, 23 Aug 2023   Pseudo R-squ.:                  0.2990
Time:                        19:21:36   Log-Likelihood:                -212.63
converged:                       True   LL-Null:                       -303.31
Covariance Type:            nonrobust   LLR p-value:                 2.749e-37
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.3411      0.121     -2.815      0.005      -0.579      -0.104
sex           -9.1309      2.939     -3.107      0.002     -14.891      -3.371
bmi           12.1910      3.110      3.919      0.0