## Import libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import boxcox
from scipy.special import inv_boxcox
pd.set_option('display.float_format', '{:.3f}'.format)

## Helper function to generate report

In [2]:
def getQoF(mod, cname, key, lam=0.2):
    
    y_true = mod.model.data.frame[cname].values
    y_pred = mod.fittedvalues
    
    if key == "orig":
        pass
    elif key == "sqrt":
        y_pred = y_pred**2
    elif key == "log1p":
        y_pred = np.expm1(y_pred)
    elif key == "boxcox":
        y_pred = inv_boxcox(y_pred, lam)

    mae = np.mean(np.abs(y_true - y_pred))
    
    smape = 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

    metrics = {
        "rSq": mod.rsquared,
        "rSqBar": mod.rsquared_adj,
        "sst": mod.centered_tss,
        "sse": mod.ssr,
        "sde": np.sqrt(mod.mse_resid),
        "mse0": mod.mse_resid,
        "rmse": np.sqrt(mod.mse_resid),
        "mae": mae,
        "smape": smape,
        "m": mod.nobs,
        "dfr": mod.df_model,
        "df": mod.df_resid,
        "fStat": mod.fvalue,
        "aic": mod.aic,
        "bic": mod.bic
    }
    return metrics

# Auto MPG: Square-Root, Log1p, and Box–Cox Transformations

### Data Loading and Preprocessing

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import boxcox
from scipy.special import inv_boxcox

auto_mpg = pd.read_csv("dt/auto-mpg.csv",na_values="?", on_bad_lines="skip").dropna()
auto_mpg.rename(columns={"model year": "model_year"}, inplace=True)
auto_mpg.drop(columns=['car name'], inplace=True)
auto_mpg["mpg"] = auto_mpg.pop("mpg")

auto_mpg['mpg_sqrt']  = np.sqrt(auto_mpg['mpg'])
auto_mpg['mpg_log1p'] = np.log1p(auto_mpg['mpg'])
auto_mpg['mpg_boxcox'] = boxcox(auto_mpg['mpg'], 0.20)

auto_mpg = sm.add_constant(auto_mpg)

auto_mpg.head()

Unnamed: 0,const,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg,mpg_sqrt,mpg_log1p,mpg_boxcox
0,1.0,8,307.0,130.0,3504,12.0,70,1,18.0,4.243,2.944,3.913
1,1.0,8,350.0,165.0,3693,11.5,70,1,15.0,3.873,2.773,3.594
2,1.0,8,318.0,150.0,3436,11.0,70,1,18.0,4.243,2.944,3.913
3,1.0,8,304.0,150.0,3433,12.0,70,1,16.0,4.0,2.833,3.706
4,1.0,8,302.0,140.0,3449,10.5,70,1,17.0,4.123,2.89,3.812


### Auto MPG Regression: Square-Root Transformation

In [4]:
model_sqrt = smf.ols(formula = 'mpg_sqrt ~ const + cylinders + displacement + horsepower + weight + acceleration + model_year', data=auto_mpg).fit()
print(model_sqrt.summary())

                            OLS Regression Results                            
Dep. Variable:               mpg_sqrt   R-squared:                       0.848
Model:                            OLS   Adj. R-squared:                  0.845
Method:                 Least Squares   F-statistic:                     357.1
Date:                Sun, 15 Feb 2026   Prob (F-statistic):          6.43e-154
Time:                        00:30:30   Log-Likelihood:                -100.91
No. Observations:                 392   AIC:                             215.8
Df Residuals:                     385   BIC:                             243.6
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.6521      0.219      2.978   

### Auto MPG Regression: Log1p Transformation

In [5]:
model_log1p = smf.ols(formula = 'mpg_log1p ~ const + cylinders + displacement + horsepower + weight + acceleration + model_year', data=auto_mpg).fit()
print(model_log1p.summary())

                            OLS Regression Results                            
Dep. Variable:              mpg_log1p   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.871
Method:                 Least Squares   F-statistic:                     439.2
Date:                Sun, 15 Feb 2026   Prob (F-statistic):          8.87e-169
Time:                        00:30:30   Log-Likelihood:                 289.19
No. Observations:                 392   AIC:                            -564.4
Df Residuals:                     385   BIC:                            -536.6
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.9573      0.081     11.825   

### Auto MPG Regression: Box-Cox Transformation 

In [6]:
model_boxcox = smf.ols(formula = 'mpg_boxcox ~ const + cylinders + displacement + horsepower + weight + acceleration + model_year', data=auto_mpg).fit()
print(model_boxcox.summary())

                            OLS Regression Results                            
Dep. Variable:             mpg_boxcox   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.863
Method:                 Least Squares   F-statistic:                     411.9
Date:                Sun, 15 Feb 2026   Prob (F-statistic):          4.01e-164
Time:                        00:30:30   Log-Likelihood:                 17.852
No. Observations:                 392   AIC:                            -21.70
Df Residuals:                     385   BIC:                             6.096
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.9032      0.162      5.583   

## Report 

In [7]:
results = pd.DataFrame({
    "Sqrt":   getQoF(model_sqrt,  "mpg", "sqrt"),
    "Log1p":  getQoF(model_log1p, "mpg", "log1p"),
    "BoxCox": getQoF(model_boxcox,"mpg", "boxcox", lam=0.2)
})

print(results)

          Sqrt    Log1p  BoxCox
rSq      0.848    0.873   0.865
rSqBar   0.845    0.871   0.863
sst    252.161   41.173 155.463
sse     38.407    5.248  20.953
sde      0.316    0.117   0.233
mse0     0.100    0.014   0.054
rmse     0.316    0.117   0.233
mae      2.347    2.184   2.233
smape   10.205    9.320   9.562
m      392.000  392.000 392.000
dfr      6.000    6.000   6.000
df     385.000  385.000 385.000
fStat  357.118  439.218 411.913
aic    215.825 -564.387 -21.703
bic    243.623 -536.589   6.096


# Boston House Price: Square-Root, Log1p, and Box–Cox Transformations

### Data Loading and Preprocessing

In [8]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import boxcox
from scipy.special import inv_boxcox

house_price = pd.read_csv("boston_house_prices.csv").dropna()
house_price['House_Price_Sqrt'] = np.sqrt(house_price['House_Price'])
house_price['House_Price_Log1p'] = np.log1p(house_price['House_Price'])
house_price['House_Price_Boxcox'] = boxcox(house_price['House_Price'], 0.85)
house_price = sm.add_constant(house_price)

house_price.head()

Unnamed: 0,const,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price,House_Price_Sqrt,House_Price_Log1p,House_Price_Boxcox
0,1.0,1360,2,1,1981,0.6,0,5,262382.852,512.233,12.478,47496.92
1,1.0,4272,3,3,2016,4.753,1,6,985260.854,992.603,13.801,146250.084
2,1.0,3592,1,2,2016,3.635,0,9,777977.39,882.03,13.564,119646.151
3,1.0,966,1,2,1977,2.731,1,8,229698.919,479.269,12.345,42418.386
4,1.0,4926,2,1,1993,4.699,0,8,1041740.859,1020.657,13.856,153346.365


### Boston House Price Regression: Square-Root Transformation

In [9]:
model_sqrt = smf.ols(formula = 'House_Price_Sqrt ~ const + Square_Footage + Num_Bedrooms + \
                                Num_Bathrooms + Year_Built + Lot_Size + Garage_Size + \
                                Neighborhood_Quality', data=house_price).fit()
print(model_sqrt.summary())

                            OLS Regression Results                            
Dep. Variable:       House_Price_Sqrt   R-squared:                       0.986
Model:                            OLS   Adj. R-squared:                  0.985
Method:                 Least Squares   F-statistic:                     9663.
Date:                Sun, 15 Feb 2026   Prob (F-statistic):               0.00
Time:                        00:30:30   Log-Likelihood:                -4445.7
No. Observations:                1000   AIC:                             8907.
Df Residuals:                     992   BIC:                             8947.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept             -543.3017 

### Boston House Price Regression: Log1p Transformation

In [10]:
model_log1p = smf.ols(formula = 'House_Price_Log1p ~ const + Square_Footage + Num_Bedrooms + \
                                Num_Bathrooms + Year_Built + Lot_Size + Garage_Size + \
                                Neighborhood_Quality', data=house_price).fit()
print(model_log1p.summary())

                            OLS Regression Results                            
Dep. Variable:      House_Price_Log1p   R-squared:                       0.941
Model:                            OLS   Adj. R-squared:                  0.941
Method:                 Least Squares   F-statistic:                     2280.
Date:                Sun, 15 Feb 2026   Prob (F-statistic):               0.00
Time:                        00:30:30   Log-Likelihood:                 710.22
No. Observations:                1000   AIC:                            -1404.
Df Residuals:                     992   BIC:                            -1365.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept                3.8168 

### Boston House Price Regression: Box-Cox Transformation 

In [11]:
model_boxcox = smf.ols(formula = 'House_Price_Boxcox ~ const + Square_Footage + Num_Bedrooms + \
                                Num_Bathrooms + Year_Built + Lot_Size + Garage_Size + \
                                Neighborhood_Quality', data=house_price).fit()
print(model_boxcox.summary())

                            OLS Regression Results                            
Dep. Variable:     House_Price_Boxcox   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.997
Method:                 Least Squares   F-statistic:                 5.648e+04
Date:                Sun, 15 Feb 2026   Prob (F-statistic):               0.00
Time:                        00:30:30   Log-Likelihood:                -8879.9
No. Observations:                1000   AIC:                         1.778e+04
Df Residuals:                     992   BIC:                         1.782e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept            -1.335e+05 

## Report 

In [12]:
results = pd.DataFrame({
    "Sqrt":   getQoF(model_sqrt,   "House_Price", "sqrt"),
    "Log1p":  getQoF(model_log1p,  "House_Price", "log1p"),
    "BoxCox": getQoF(model_boxcox, "House_Price", "boxcox", lam=0.85)
})
print(results)

               Sqrt     Log1p            BoxCox
rSq           0.986     0.941             0.997
rSqBar        0.985     0.941             0.997
sst    29445342.066   241.749 1208110405890.069
sse      425598.998    14.146    3023790748.106
sde          20.713     0.119          1745.903
mse0        429.031     0.014       3048176.157
rmse         20.713     0.119          1745.903
mae       24296.468 52989.293         10078.898
smape         4.858     9.212             2.124
m          1000.000  1000.000          1000.000
dfr           7.000     7.000             7.000
df          992.000   992.000           992.000
fStat      9662.880  2280.114         56478.111
aic        8907.375 -1404.442         17775.899
bic        8946.637 -1365.180         17815.161


# Medical Cost : Square-Root, Log1p, and Box–Cox Transformations

### Data Loading and Preprocessing

In [13]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import boxcox
from scipy.special import inv_boxcox

insurance = pd.read_csv("insurance_cat2num.csv").dropna()
insurance['charges_sqrt'] = np.sqrt(insurance['charges'])
insurance['charges_log1p'] = np.log1p(insurance['charges'])
insurance['charges_boxcox'] = boxcox(insurance['charges'], 1.31)

insurance.head()

Unnamed: 0,intercept,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,charges,charges_sqrt,charges_log1p,charges_boxcox
0,1,19,27.9,0,0,1,0,0,1,16884.924,129.942,9.734,263482.2
1,1,18,33.77,1,1,0,0,1,0,1725.552,41.54,7.454,13276.377
2,1,28,33.0,3,1,0,0,1,0,4449.462,66.704,8.401,45920.257
3,1,33,22.705,0,1,0,1,0,0,21984.471,148.272,9.998,372305.79
4,1,32,28.88,0,1,0,1,0,0,3866.855,62.184,8.26,38208.403


### Medical Cost Regression: Square-Root Transformation

In [14]:
model_sqrt = smf.ols(formula = 'charges_sqrt ~ intercept + age + bmi + \
                     children + sex_male + smoker_yes + region_northwest +\
                     region_southeast + region_southwest', data=insurance).fit()
print(model_sqrt.summary())

                            OLS Regression Results                            
Dep. Variable:           charges_sqrt   R-squared:                       0.780
Model:                            OLS   Adj. R-squared:                  0.778
Method:                 Least Squares   F-statistic:                     587.4
Date:                Sun, 15 Feb 2026   Prob (F-statistic):               0.00
Time:                        00:30:30   Log-Likelihood:                -6059.7
No. Observations:                1338   AIC:                         1.214e+04
Df Residuals:                    1329   BIC:                         1.218e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.3218      1.833  

### Medical Cost Regression: Log1p Transformation

In [15]:
model_log1p = smf.ols(formula = 'charges_log1p ~ intercept + age + bmi + \
                     children + sex_male + smoker_yes + region_northwest +\
                     region_southeast + region_southwest', data=insurance).fit()
print(model_log1p.summary())

                            OLS Regression Results                            
Dep. Variable:          charges_log1p   R-squared:                       0.768
Model:                            OLS   Adj. R-squared:                  0.767
Method:                 Least Squares   F-statistic:                     549.8
Date:                Sun, 15 Feb 2026   Prob (F-statistic):               0.00
Time:                        00:30:30   Log-Likelihood:                -808.27
No. Observations:                1338   AIC:                             1635.
Df Residuals:                    1329   BIC:                             1681.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            3.5155      0.036  

### Medical Cost Regression: Box-Cox  Transformation

In [16]:
model_boxcox = smf.ols(formula = 'charges_boxcox ~ intercept + age + bmi + \
                     children + sex_male + smoker_yes + region_northwest +\
                     region_southeast + region_southwest', data=insurance).fit()
print(model_boxcox.summary())

                            OLS Regression Results                            
Dep. Variable:         charges_boxcox   R-squared:                       0.721
Model:                            OLS   Adj. R-squared:                  0.720
Method:                 Least Squares   F-statistic:                     430.3
Date:                Sun, 15 Feb 2026   Prob (F-statistic):               0.00
Time:                        00:30:30   Log-Likelihood:                -17739.
No. Observations:                1338   AIC:                         3.550e+04
Df Residuals:                    1329   BIC:                         3.554e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept        -1.567e+05   1.13e+04  

## Report 

In [17]:
results = pd.DataFrame({
    "Sqrt":   getQoF(model_sqrt,   "charges", "sqrt"),
    "Log1p":  getQoF(model_log1p,  "charges", "log1p"),
    "BoxCox": getQoF(model_boxcox, "charges", "boxcox", lam=1.31)
})
print(results)

              Sqrt    Log1p             BoxCox
rSq          0.780    0.768              0.721
rSqBar       0.778    0.767              0.720
sst    3051091.513 1130.110 92203700874799.344
sse     672635.971  262.232 25679991017120.289
sde         22.497    0.444         139006.447
mse0       506.122    0.197    19322792337.939
rmse        22.497    0.444         139006.447
mae       3613.896 4219.512           4805.541
smape       27.690   26.289             39.520
m         1338.000 1338.000           1338.000
dfr          8.000    8.000              8.000
df        1329.000 1329.000           1329.000
fStat      587.422  549.805            430.345
aic      12137.477 1634.536          35495.979
bic      12184.268 1681.327          35542.769
