In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error as MSE
import xgboost as xgb

In [2]:
df = pd.read_csv('data/auto.csv')
df.head()

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [3]:
df.isna().sum()

mpg       0
displ     0
hp        0
weight    0
accel     0
origin    0
size      0
dtype: int64

In [4]:
df.origin.value_counts()

US        245
Asia       79
Europe     68
Name: origin, dtype: int64

In [5]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Asia,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,0,1
2,36.1,91.0,60,1800,16.4,10.0,1,0,0
3,18.5,250.0,98,3525,19.0,15.0,0,0,1
4,34.3,97.0,78,2188,15.8,10.0,0,1,0


In [6]:
X = df.drop('mpg', axis=1).values
y = df['mpg'].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(313, 8)
(79, 8)
(313,)
(79,)


In [9]:
dt = DecisionTreeRegressor(max_depth=8, min_samples_leaf=0.13)
dt.fit(X_train, y_train)

In [10]:
y_pred = dt.predict(X_test)
y_pred

array([34.32181818, 24.85714286, 19.27848101, 19.27848101, 14.19722222,
       24.85714286, 19.27848101, 19.27848101, 19.27848101, 14.19722222,
       34.32181818, 34.32181818, 28.74318182, 14.19722222, 19.27848101,
       28.74318182, 19.27848101, 19.27848101, 28.74318182, 19.27848101,
       24.85714286, 28.74318182, 28.74318182, 19.27848101, 24.85714286,
       24.85714286, 19.27848101, 34.32181818, 28.74318182, 24.85714286,
       14.19722222, 24.85714286, 19.27848101, 34.32181818, 34.32181818,
       28.74318182, 24.85714286, 19.27848101, 34.32181818, 34.32181818,
       19.27848101, 19.27848101, 14.19722222, 28.74318182, 19.27848101,
       34.32181818, 24.85714286, 24.85714286, 24.85714286, 24.85714286,
       34.32181818, 34.32181818, 34.32181818, 34.32181818, 24.85714286,
       19.27848101, 34.32181818, 28.74318182, 14.19722222, 19.27848101,
       28.74318182, 19.27848101, 24.85714286, 34.32181818, 14.19722222,
       24.85714286, 14.19722222, 24.85714286, 14.19722222, 28.74

In [11]:
mse_dt = MSE(y_test, y_pred)
mse_dt

18.265266610833958

In [12]:
rmse_dt = mse_dt**(1/2)
rmse_dt

4.273788320779816

### Evaluate the 10-fold CV error

In [13]:
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.26)

In [14]:
MSE_CV_scores = - cross_val_score(dt, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
MSE_CV_scores

array([21.32310243, 46.43684117, 16.81237709, 18.15261489, 20.36743003,
       19.54918249, 44.85048344, 21.56019491, 20.8579366 , 23.77877111])

In [15]:
RMSE_CV = (MSE_CV_scores.mean())**(1/2)
RMSE_CV

5.036754253967449

### Evaluate training error

In [16]:
dt.fit(X_train, y_train)

In [17]:
y_pred_train = dt.predict(X_train)
RMSE_train = (MSE(y_train, y_pred_train))**(1/2)
RMSE_train

5.1100364273296

### Evaluating model complexity

In [18]:
y_pred_test = dt.predict(X_test)

In [19]:
print('CV MSE: {:.2f}'.format(MSE_CV_scores.mean()))
print('Train MSE: {:.2f}'.format(MSE(y_train, y_pred_train)))
print('Test MSE: {:.2f}'.format(MSE(y_test, y_pred_test)))

CV MSE: 25.37
Train MSE: 26.11
Test MSE: 23.49


### XGBoost

In [20]:
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10)
xg_cl.fit(X_train, y_train)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112], got [ 9.  10.  11.  12.  13.  14.  14.5 15.  15.5 16.  16.2 16.5 16.9 17.
 17.5 17.6 17.7 18.  18.1 18.2 18.5 18.6 19.  19.2 19.4 19.8 20.  20.2
 20.3 20.5 20.6 20.8 21.  21.1 21.5 21.6 22.  22.4 22.5 23.  23.2 23.5
 23.7 23.8 23.9 24.  24.2 24.3 25.  25.1 25.4 25.5 25.8 26.  26.4 26.5
 26.6 26.8 27.  27.2 27.4 27.9 28.  28.1 28.4 28.8 29.  29.5 29.8 29.9
 30.  30.5 30.9 31.  31.3 31.5 31.6 31.8 32.  32.1 32.3 32.4 32.7 32.8
 33.  33.5 33.7 33.8 34.  34.1 34.7 35.  35.1 35.7 36.  36.1 36.4 37.
 37.2 37.3 38.  38.1 39.  39.1 39.4 40.8 41.5 43.1 43.4 44.  44.3 44.6
 46.6]