In [9]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
import statsmodels.formula.api as sm
import matplotlib.pylab as plt

import dmba
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

%matplotlib inline

In [8]:
pip install dmba

Collecting dmba
  Downloading dmba-0.2.3-py3-none-any.whl (11.8 MB)
     --------------------------------------- 11.8/11.8 MB 11.5 MB/s eta 0:00:00
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 47.0/47.0 kB 2.3 MB/s eta 0:00:00
Installing collected packages: graphviz, dmba
Successfully installed dmba-0.2.3 graphviz-0.20.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Remote location of the ToyotaCorolla.csv dataset used throughout the notebook.
TOYOTA_COROLLA_CSV = (
    'https://raw.githubusercontent.com/Fangda2023/schulich_data_science/main/ToyotaCorolla.csv'
)

# Read the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(TOYOTA_COROLLA_CSV)

In [4]:
# Preview the first five rows to sanity-check the load and the column names.
df.head(n=5)

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,0,1,0,1,0,0,0,0


In [6]:
# Columns of `df` used as model inputs, in the order they are fed to the model.
# Fuel_Type holds text labels (e.g. 'Diesel'), so it has to be dummy-encoded
# before fitting; the remaining columns are used as-is.
predictors = [
    'Age_08_04',
    'KM',
    'Fuel_Type',
    'HP',
    'Met_Color',
    'Automatic',
    'Doors',
    'Quarterly_Tax',
    'Mfr_Guarantee',
    'Guarantee_Period',
    'Airco',
    'Automatic_airco',
    'CD_Player',
    'Powered_Windows',
    'Sport_Model',
    'Tow_Bar',
]

# Column of `df` that the regression predicts.
outcome = 'Price'

In [10]:
# Build the design matrix: non-numeric predictors are expanded into dummy
# columns, dropping the first level of each so the dummies are not collinear.
X = pd.get_dummies(df[predictors], drop_first=True)
y = df[outcome]

# Hold out 20% of the rows for validation; the fixed seed keeps the partition
# identical from run to run.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Ordinary least squares fitted on the training partition only.
# fit() returns the estimator itself, so construction and fitting are chained.
car_lm = LinearRegression().fit(train_X, train_y)

# Show the fitted intercept followed by one coefficient per design-matrix column.
coef_table = pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_})
print('intercept ', car_lm.intercept_)
print(coef_table)

# Error metrics computed on the training partition (in-sample fit, not
# validation performance).
train_pred = car_lm.predict(train_X)
regressionSummary(train_y, train_pred)


intercept  9080.074614390975
           Predictor  coefficient
0          Age_08_04  -109.741849
1                 KM    -0.018450
2                 HP    37.809864
3          Met_Color   -20.558356
4          Automatic   615.425879
5              Doors   144.971812
6      Quarterly_Tax    16.600914
7      Mfr_Guarantee   155.703050
8   Guarantee_Period    84.445082
9              Airco   145.489158
10   Automatic_airco  2995.728348
11         CD_Player   288.465590
12   Powered_Windows   464.602239
13       Sport_Model   392.370247
14           Tow_Bar  -217.784386
15  Fuel_Type_Diesel  2162.437648
16  Fuel_Type_Petrol  2151.692542

Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 1206.8383
            Mean Absolute Error (MAE) : 908.2321
          Mean Percentage Error (MPE) : -0.9839
Mean Absolute Percentage Error (MAPE) : 8.9827


In [11]:
# In-sample predictions from the fitted linear model.
pred_y = car_lm.predict(train_X)

# Each dmba scorer takes (actual, predicted, model); print them in a fixed
# order, each with its own label.
for label, score_fn in (
    ('adjusted r2 : ', adjusted_r2_score),
    ('AIC : ', AIC_score),
    ('BIC : ', BIC_score),
):
    print(label, score_fn(train_y, pred_y, car_lm))

adjusted r2 :  0.8854751703807532
AIC :  19587.74615636583
BIC :  19683.61591132654
