In [23]:
import torch
import torch.nn as nn

**Mini Project: Predicting the fuel efficiency of a car:**

![auto_mpg](../../data/report_images/auto_mpg.png)

Load the dataset and preprocess the data:

In [1]:
import pandas as pd

In [2]:
# Location of the raw Auto MPG data file in the UCI repository.
# Adjacent string literals are concatenated into a single URL.
url = (
    'http://archive.ics.uci.edu/ml/'
    'machine-learning-databases/auto-mpg/auto-mpg.data'
)

In [3]:
# Column labels are supplied explicitly via `names=`, so no header row is
# read from the file.
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

# Parser settings:
#   sep=" " + skipinitialspace=True -> space-delimited fields, extra spaces
#                                      after a delimiter are skipped
#   na_values="?"                   -> "?" is read as a missing value
#   comment="\t"                    -> the rest of a line after a tab is ignored
df = pd.read_csv(
    url,
    names=column_names,
    na_values="?",
    comment="\t",
    sep=" ",
    skipinitialspace=True,
)

In [4]:
# Discard every row that contains a missing value, then renumber the index
# from 0 so it has no gaps where rows were removed (the old index is dropped).
df = df.dropna().reset_index(drop=True)

In [6]:
# train test splits
import sklearn
from sklearn.model_selection import train_test_split

In [7]:
# 80% of the rows go to training, the remainder to testing.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
df_train, df_test = train_test_split(
    df,
    train_size=0.8,
    random_state=1,
)

In [8]:
# Summary statistics of the training split, transposed so that each
# feature is a row and each statistic (mean, std, ...) is a column.
train_stats = df_train.describe().T

In [9]:
# Display the training-split summary table (one row per column of df_train).
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [10]:
# Columns handled as numeric features: they are standardised below and
# later stacked into the model's input tensor.
numeric_column_names = [
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
]

In [19]:
# Work on copies so the un-normalised splits remain available.
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

# Standardise every numeric feature to zero mean / unit variance.
# The mean and std always come from the TRAINING split (train_stats), and the
# same values are applied to the test split, so no test information leaks in.
for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    # Assign the result as a brand-new column rather than writing through
    # `.loc[:, col_name]`. The `.loc` form writes float z-scores into the
    # existing column in place; for an integer column (e.g. 'Cylinders') that
    # is an incompatible-dtype upcast, which pandas warns about and is slated
    # to reject. Replacing the column lets it take the float dtype cleanly.
    df_train_norm[col_name] = (df_train_norm[col_name] - mean) / std
    df_test_norm[col_name] = (df_test_norm[col_name] - mean) / std

df_train_norm.tail()

  0.3511267  -0.8243028  -0.8243028  -0.8243028   0.3511267  -0.8243028
  0.3511267   1.52655621  1.52655621  1.52655621  0.3511267   1.52655621
 -0.8243028   0.3511267   1.52655621 -0.8243028  -0.8243028   0.3511267
 -0.8243028  -0.8243028  -0.8243028   0.3511267  -0.8243028   1.52655621
  0.3511267  -0.8243028   0.3511267  -0.8243028  -0.8243028   1.52655621
 -0.8243028   1.52655621  1.52655621 -0.8243028  -0.8243028  -0.8243028
 -0.8243028   0.3511267  -0.8243028   1.52655621 -0.8243028  -0.8243028
  1.52655621 -0.8243028  -0.8243028  -0.8243028   1.52655621  1.52655621
  0.3511267   0.3511267   1.52655621 -0.8243028  -0.8243028   1.52655621
  1.52655621 -0.8243028  -0.8243028   0.3511267   1.52655621 -0.8243028
  0.3511267  -0.8243028   1.52655621  1.52655621 -0.8243028  -0.8243028
 -1.41201755  1.52655621  0.3511267   1.52655621 -0.8243028  -0.8243028
 -0.8243028   1.52655621  1.52655621  0.3511267   0.3511267   1.52655621
 -0.8243028   1.52655621 -0.23658805 -0.8243028  -0.824302

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


We have just standardised the numeric columns: they were measured on very different scales, so each one was rescaled to zero mean and unit variance using the mean and standard deviation of the training set.

Next, let's group the rather fine-grained model year (`Model Year`) information into buckets to simplify the learning task for the model.

In [21]:
# Range of model years present in the training split: (earliest, latest).
model_year = df_train_norm['Model Year']
(model_year.min(), model_year.max())

(70, 82)

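We assign the model years to buckets as follows (this is done here purely for illustrative purposes):

$$
\text{bucket} =
\begin{cases}
0 & \text{if year} < 73 \\
1 & \text{if } 73 \le \text{year} < 76 \\
2 & \text{if } 76 \le \text{year} < 79 \\
3 & \text{if year} \ge 79
\end{cases}
$$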

In [24]:
# Bucket edges for the model year. With right=True a value equal to an edge
# falls into the upper bucket:
#   year < 73 -> 0, 73 <= year < 76 -> 1, 76 <= year < 79 -> 2, year >= 79 -> 3
boundaries = torch.tensor([73, 76, 79])

# Training split: copy the year column into a tensor and map each year to
# its bucket index.
v = torch.tensor(df_train_norm['Model Year'].to_numpy())
df_train_norm['Model Year Bucketed'] = torch.bucketize(
    v,
    boundaries,
    right=True,
)

In [27]:
# Test split: apply the same bucket edges that were used for the training split.
v = torch.tensor(df_test_norm['Model Year'].to_numpy())
df_test_norm['Model Year Bucketed'] = torch.bucketize(
    v,
    boundaries,
    right=True,
)

In [28]:
# Register the bucketed year as an additional numeric input feature.
# The membership check keeps this cell idempotent: without it, re-running the
# cell would append the name again and the column would be duplicated in the
# feature matrix built from numeric_column_names.
if 'Model Year Bucketed' not in numeric_column_names:
    numeric_column_names.append('Model Year Bucketed')

We added this bucketized feature column to the Python list `numeric_column_names`.
- Next, we define the encoding for the unordered categorical feature `Origin`. PyTorch offers two ways of working with a categorical feature: an embedding layer via `nn.Embedding`, or one-hot-encoded vectors.
- We choose one-hot encoding here.

In [29]:
from torch.nn.functional import one_hot

In [36]:
# Number of distinct origin codes present in the training split.
total_origin = len(set(df_train_norm['Origin']))

# `Origin % total_origin` folds the raw codes into the range
# 0 .. total_origin-1, which is what one_hot expects as class indices.
#
# num_classes is passed explicitly so that BOTH splits always produce exactly
# total_origin one-hot columns. Without it, one_hot infers the width from the
# largest index in its input, so a split that happens to lack the highest
# folded code would yield fewer columns and no longer match the model input.
origin_encoded = one_hot(
    torch.from_numpy(df_train_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)
# Feature matrix = numeric features followed by the one-hot origin columns.
x_train = torch.cat([x_train_numeric, origin_encoded], 1).float()

# Same construction for the test split, using the training-derived class count.
origin_encoded = one_hot(
    torch.from_numpy(df_test_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()

In [37]:
# Regression targets (MPG) for each split, stored as float32 tensors.
y_train = torch.tensor(df_train_norm['MPG'].to_numpy(), dtype=torch.float32)
y_test = torch.tensor(df_test_norm['MPG'].to_numpy(), dtype=torch.float32)

**Train a DNN regression model:**

In [39]:
from torch.utils.data import TensorDataset, DataLoader

In [40]:
# Pair every feature row with its target so they are batched together.
train_ds = TensorDataset(x_train, y_train)

batch_size = 8
# Seed PyTorch's global RNG before creating the shuffling loader.
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

Next, we build the model with two fully connected hidden layers, the first with 8 units and the second with 4, followed by a single-unit output layer:

In [41]:
# Width of each hidden layer, in order.
hidden_units = [8, 4]
# The first layer consumes one input per feature column of x_train.
input_size = x_train.shape[1]
all_layers = []

# Stack Linear -> ReLU once per hidden layer; each layer's output width
# becomes the next layer's input width.
for hidden_unit in hidden_units:
    layer = nn.Linear(in_features=input_size, out_features=hidden_unit)
    # Overwrite the default weight initialisation with Kaiming-uniform.
    nn.init.kaiming_uniform_(layer.weight)
    all_layers.extend([layer, nn.ReLU()])
    input_size = hidden_unit

In [42]:
# Final layer: maps the last hidden layer to a single regression output.
output_layer = nn.Linear(in_features=hidden_units[-1], out_features=1)
all_layers.append(output_layer)

# Chain all layers into one feed-forward module and display its structure.
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [43]:
# Regression objective: mean squared error between prediction and target.
loss_fn = nn.MSELoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

Now, we train the model for 200 epochs, and display the loss every 20 epochs:

In [44]:
# Training schedule: total number of passes over the data, and how often
# (in epochs) the running loss is reported.
num_epochs = 200
log_epochs = 20

# Re-seed PyTorch's global RNG right before training.
torch.manual_seed(1)

In [45]:
for epoch in range(num_epochs):
    # Running sum of the per-batch losses for this epoch.
    loss_hist_train = 0

    for x_batch, y_batch in train_dl:
        # Forward pass; take column 0 of the model output so the prediction
        # is 1-D like the target batch.
        pred = model(x_batch)[:, 0]
        loss = loss_fn(pred, y_batch)

        # Backward pass, parameter update, then clear the gradients so they
        # do not accumulate into the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loss_hist_train += loss.item()

    # Report only on every log_epochs-th epoch.
    if epoch % log_epochs != 0:
        continue
    print(f'Epoch {epoch} Loss {loss_hist_train/len(train_dl):.4f}')

Epoch 0 Loss 231.3126
Epoch 20 Loss 8.5054
Epoch 40 Loss 7.7574
Epoch 60 Loss 7.6146
Epoch 80 Loss 6.9847
Epoch 100 Loss 6.9285
Epoch 120 Loss 6.3054
Epoch 140 Loss 6.2580
Epoch 160 Loss 7.0860
Epoch 180 Loss 6.0028


In [46]:
# Evaluate on the held-out split; gradient tracking is switched off because
# no parameter update happens here.
with torch.no_grad():
    pred = model(x_test.float())[:, 0]

    # Mean squared error (same criterion as used for training).
    loss = loss_fn(pred, y_test)
    print(f'Test MSE: {loss.item():.4f}')

    # Mean absolute error, reported in the same units as the target.
    mae = nn.L1Loss()(pred, y_test)
    print(f'Test MAE: {mae.item():.4f}')


Test MSE: 9.6803
Test MAE: 2.2332
