In [5]:
from pathlib import Path

import matplotlib.pylab as plt
import pandas as pd
import statsmodels.api as sm_api
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
from sklearn.model_selection import train_test_split

import dmba
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

In [8]:
# Read the Toyota Corolla data through dmba.load_data, then print the column
# index followed by the first rows (DataFrame.head default) as plain text.
car_df = dmba.load_data('ToyotaCorolla.csv')
print(car_df.columns, car_df.head(), sep='\n')


Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',
       'Fuel_Type', 'HP', 'Met_Color', 'Color', 'Automatic', 'CC', 'Doors',
       'Cylinders', 'Gears', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee',
       'BOVAG_Guarantee', 'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2',
       'Airco', 'Automatic_airco', 'Boardcomputer', 'CD_Player',
       'Central_Lock', 'Powered_Windows', 'Power_Steering', 'Radio',
       'Mistlamps', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',
       'Radio_cassette', 'Parking_Assistant', 'Tow_Bar'],
      dtype='object')
   Id                                          Model  Price  Age_08_04  \
0   1  TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors  13500         23   
1   2  TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors  13750         23   
2   3  TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors  13950         24   
3   4  TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors  14950         26   
4   5    TOYOTA Corolla 2.0 D4D HATCHB SOL 2

In [14]:
# Model specification: the response column and the candidate predictor columns.
outcome = 'Price'
predictors = [
    'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic', 'Doors',
    'Quarterly_Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Airco',
    'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model',
    'Tow_Bar',
]

# Build the response and the design matrix. get_dummies expands categorical
# predictors into indicator columns; drop_first=True omits the first level of
# each so the indicators are not collinear with the intercept.
y = car_df[outcome]
X = pd.get_dummies(car_df[predictors], drop_first=True)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition reproducible across runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Fit the linear model on the training partition. fit() returns the estimator
# itself, so the trailing expression displays the same repr as calling
# car_lm.fit(...) as the last statement.
car_lm = LinearRegression().fit(train_X, train_y)
car_lm



LinearRegression()

In [15]:
# Show the fitted intercept, then one coefficient per design-matrix column.
print('intercept ', car_lm.intercept_)
coef_table = pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_})
print(coef_table)

# Error measures computed on the training partition (in-sample fit).
train_pred = car_lm.predict(train_X)
regressionSummary(train_y, train_pred)

intercept  9069.847844083184
           Predictor  coefficient
0          Age_08_04  -109.758421
1                 KM    -0.018441
2                 HP    37.803435
3          Automatic   617.139156
4              Doors   144.384691
5      Quarterly_Tax    16.624936
6      Mfr_Guarantee   152.904242
7   Guarantee_Period    84.352082
8              Airco   145.472000
9    Automatic_airco  2997.221195
10         CD_Player   284.677044
11   Powered_Windows   462.393436
12       Sport_Model   391.596482
13           Tow_Bar  -221.049096
14  Fuel_Type_Diesel  2160.912503
15  Fuel_Type_Petrol  2154.276224

Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 1206.8732
            Mean Absolute Error (MAE) : 908.3384
          Mean Percentage Error (MPE) : -0.9840
Mean Absolute Percentage Error (MAPE) : 8.9836


In [16]:
# Rank the predictors by absolute coefficient value, largest first.
# The predictors are not rescaled anywhere above, so a large magnitude reflects
# the column's units as well as its effect.
coef_by_size = pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_})
coef_by_size = coef_by_size.sort_values(by='coefficient', key=abs, ascending=False)
print(coef_by_size)

           Predictor  coefficient
9    Automatic_airco  2997.221195
14  Fuel_Type_Diesel  2160.912503
15  Fuel_Type_Petrol  2154.276224
3          Automatic   617.139156
11   Powered_Windows   462.393436
12       Sport_Model   391.596482
10         CD_Player   284.677044
13           Tow_Bar  -221.049096
6      Mfr_Guarantee   152.904242
8              Airco   145.472000
4              Doors   144.384691
0          Age_08_04  -109.758421
7   Guarantee_Period    84.352082
2                 HP    37.803435
5      Quarterly_Tax    16.624936
1                 KM    -0.018441


In [17]:
# Predictions on the training partition; the scores below are in-sample.
pred_y = car_lm.predict(train_X)

# Each dmba scorer is called as (y_true, y_pred, model); print them in a fixed
# order, one labelled value per line.
for label, score_fn in (
    ('adjusted r2 : ', adjusted_r2_score),
    ('AIC : ', AIC_score),
    ('BIC : ', BIC_score),
):
    print(label, score_fn(train_y, pred_y, car_lm))

adjusted r2 :  0.8855698263332518
AIC :  19585.812410144987
BIC :  19676.63638852882


In [None]:

# Refit the regression with statsmodels to obtain the full inference table
# (standard errors, t-statistics, p-values, confidence intervals).
#
# `sm` in this notebook is bound to statsmodels.formula.api, which exposes the
# formula interface but not `add_constant`; the array-style API lives in
# statsmodels.api, imported at the top of the notebook as `sm_api`.
#
# NOTE(review): this fit uses every row of X / y, whereas car_lm was fitted on
# train_X / train_y only, so the coefficients are not expected to match the
# scikit-learn output exactly. Confirm that fitting on the full data is intended.
#
# The float cast guards against indicator columns that pd.get_dummies produces
# as booleans: mixed with the integer columns they would otherwise reach
# statsmodels as an object-dtype array, which it rejects. Casting does not
# change the fitted values.
X_sm = sm_api.add_constant(X.astype(float))
model = sm_api.OLS(y, X_sm).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.889
Model:                            OLS   Adj. R-squared:                  0.888
Method:                 Least Squares   F-statistic:                     712.5
Date:                Wed, 21 May 2025   Prob (F-statistic):               0.00
Time:                        20:52:40   Log-Likelihood:                -12226.
No. Observations:                1436   AIC:                         2.449e+04
Df Residuals:                    1419   BIC:                         2.458e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             9674.4087    531.252  