In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error as MSE

In [8]:
# Load the automobile dataset; the path is relative to the notebook's working directory.
auto_csv_path = 'data/auto.csv'
df = pd.read_csv(auto_csv_path)
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [9]:
# Count missing entries per column (isnull is the pandas alias of isna).
df.isnull().sum(axis=0)

mpg       0
displ     0
hp        0
weight    0
accel     0
origin    0
size      0
dtype: int64

In [10]:
# Frequency of each region in the categorical `origin` column.
df['origin'].value_counts()

US        245
Asia       79
Europe     68
Name: origin, dtype: int64

In [11]:
# One-hot encode the categorical column(s) (here only `origin`); numeric columns pass through.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 defaults to bool,
# and a frame that mixes bool with numeric columns turns `.values` into an object array.
df = pd.get_dummies(df, dtype=int)
df.head()

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Asia,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,0,1
2,36.1,91.0,60,1800,16.4,10.0,1,0,0
3,18.5,250.0,98,3525,19.0,15.0,0,0,1
4,34.3,97.0,78,2188,15.8,10.0,0,1,0


In [12]:
# Regression target: fuel efficiency in the `mpg` column.
y = df.loc[:, 'mpg'].values
# Feature matrix: every remaining column, as a plain NumPy array.
X = df.drop(columns=['mpg']).values

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [15]:
# Report the shape of each split, in the order: train/test features, then train/test targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(313, 8)
(79, 8)
(313,)
(79,)


In [17]:
# Regression tree capped at depth 8; a float min_samples_leaf is read by scikit-learn as a
# fraction of the training samples, so every leaf must hold at least 13% of them.
# random_state is fixed because scikit-learn permutes the features at each split, so ties
# between equally good splits could otherwise be broken differently from run to run.
dt = DecisionTreeRegressor(max_depth=8, min_samples_leaf=0.13, random_state=1)
dt.fit(X_train, y_train)

In [19]:
# Predict mpg for the held-out rows; the bare name on the last line renders the array.
y_pred = dt.predict(X=X_test)
y_pred

array([34.32181818, 24.85714286, 19.27848101, 19.27848101, 14.19722222,
       24.85714286, 19.27848101, 19.27848101, 19.27848101, 14.19722222,
       34.32181818, 34.32181818, 28.74318182, 14.19722222, 19.27848101,
       28.74318182, 19.27848101, 19.27848101, 28.74318182, 19.27848101,
       24.85714286, 28.74318182, 28.74318182, 19.27848101, 24.85714286,
       24.85714286, 19.27848101, 34.32181818, 28.74318182, 24.85714286,
       14.19722222, 24.85714286, 19.27848101, 34.32181818, 34.32181818,
       28.74318182, 24.85714286, 19.27848101, 34.32181818, 34.32181818,
       19.27848101, 19.27848101, 14.19722222, 28.74318182, 19.27848101,
       34.32181818, 24.85714286, 24.85714286, 24.85714286, 24.85714286,
       34.32181818, 34.32181818, 34.32181818, 34.32181818, 24.85714286,
       19.27848101, 34.32181818, 28.74318182, 14.19722222, 19.27848101,
       28.74318182, 19.27848101, 24.85714286, 34.32181818, 14.19722222,
       24.85714286, 14.19722222, 24.85714286, 14.19722222, 28.74

In [20]:
# Mean squared error of the tree's predictions on the test split.
mse_dt = MSE(y_true=y_test, y_pred=y_pred)
mse_dt

18.26526661083397

In [21]:
# Root of the test MSE, which puts the error back in the units of the target.
rmse_dt = mse_dt ** 0.5
rmse_dt

4.2737883207798175

### Evaluate the 10-fold CV error

In [23]:
# Shallower, more constrained tree (depth <= 4, each leaf >= 26% of the training samples)
# used for the cross-validation / train / test error comparison below.
# random_state is fixed so the CV folds and the later fit build the same tree on every run;
# scikit-learn permutes features at each split, which can otherwise change tie-breaking.
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.26, random_state=1)

In [24]:
# 10-fold cross-validation on the training split, using all available cores.
# The scorer returns negated MSE values, so the sign is flipped to get per-fold MSE.
neg_mse_folds = cross_val_score(
    dt,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
MSE_CV_scores = -neg_mse_folds
MSE_CV_scores

array([21.32310243, 46.43684117, 16.81237709, 18.15261489, 20.36743003,
       19.54918249, 44.85048344, 21.56019491, 20.8579366 , 23.77877111])

In [25]:
# Cross-validated RMSE: square root of the mean of the per-fold MSE values.
mean_cv_mse = MSE_CV_scores.mean()
RMSE_CV = mean_cv_mse ** 0.5
RMSE_CV

5.036754253967449

### Evaluate training error

In [26]:
# Fit the tree on the full training split (cross_val_score above only fitted clones).
dt.fit(X=X_train, y=y_train)

In [28]:
# Training error: predict on the same rows the tree was fitted on, then take the RMSE.
y_pred_train = dt.predict(X_train)
mse_train = MSE(y_train, y_pred_train)
RMSE_train = mse_train ** 0.5
RMSE_train

5.110036427329599

### Evaluating model complexity

In [29]:
# Predictions of the fitted tree on the held-out test split.
y_pred_test = dt.predict(X=X_test)

In [39]:
# Print the cross-validated, training and test MSE one after another so they can be compared.
mse_report = (
    ('CV MSE: {:.2f}', MSE_CV_scores.mean()),
    ('Train MSE: {:.2f}', MSE(y_train, y_pred_train)),
    ('Test MSE: {:.2f}', MSE(y_test, y_pred_test)),
)
for template, value in mse_report:
    print(template.format(value))

CV MSE: 25.37
Train MSE: 26.11
Test MSE: 23.49
