<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from skimpy import skim
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

### Data load

In [122]:
df = pd.read_csv('/Users/alexeyfilichkin/Desktop/PyTorch/data/vehicle_cost.csv', sep=';')

In [123]:
df = df.drop(['Unnamed: 0'], axis=1)

In [124]:
df.head(5)

Unnamed: 0,registration_year,power,kilometer,model,gearbox,fuel_type,repaired,price
0,1993,105.0,150000,golf,manual,gasoline,no,480
1,2011,190.0,125000,no_info,manual,gasoline,yes,18300
2,2004,163.0,125000,grand,auto,gasoline,no,9800
3,2001,75.0,150000,golf,manual,gasoline,no,1500
4,2008,69.0,90000,fabia,manual,gasoline,no,3600


In [125]:
skim(df)

## Data encoding and normalization

In [126]:
ohe_columns = ['model', 'gearbox', 'fuel_type','repaired']
num_columns = ['registration_year', 'power', 'kilometer']

In [127]:
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', sparse_output=False)

In [128]:
df_ohe = encoder.fit_transform(df[ohe_columns])
encoder_col_names = encoder.get_feature_names_out()
df_ohe = pd.DataFrame(df_ohe, columns=encoder_col_names)

In [129]:
df_ohe.head(5)

Unnamed: 0,model_145,model_147,model_156,model_159,model_1_reihe,model_1er,model_200,model_2_reihe,model_300c,model_3_reihe,...,model_zafira,gearbox_manual,gearbox_no_info,fuel_type_electric,fuel_type_gasoline,fuel_type_hybrid,fuel_type_lpg,fuel_type_no_info,fuel_type_other,repaired_yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [130]:
df_scaled = scaler.fit_transform(df[num_columns])
df_scaled = pd.DataFrame(df_scaled, columns=num_columns)

In [133]:
df_scaled.head(5)

Unnamed: 0,registration_year,power,kilometer
0,-1.351624,-0.288239,0.608635
1,1.002098,1.351743,-0.038537
2,0.086762,0.830807,-0.038537
3,-0.305525,-0.867057,0.608635
4,0.609811,-0.98282,-0.944577


In [134]:
df_coded = pd.concat([df_scaled, df_ohe, df['price']], axis=1)

In [135]:
df_coded.head(5)

Unnamed: 0,registration_year,power,kilometer,model_145,model_147,model_156,model_159,model_1_reihe,model_1er,model_200,...,gearbox_manual,gearbox_no_info,fuel_type_electric,fuel_type_gasoline,fuel_type_hybrid,fuel_type_lpg,fuel_type_no_info,fuel_type_other,repaired_yes,price
0,-1.351624,-0.288239,0.608635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,480
1,1.002098,1.351743,-0.038537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,18300
2,0.086762,0.830807,-0.038537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9800
3,-0.305525,-0.867057,0.608635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1500
4,0.609811,-0.98282,-0.944577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3600


In [136]:
df_coded.shape

(282653, 263)

In [137]:
df_coded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282653 entries, 0 to 282652
Columns: 263 entries, registration_year to price
dtypes: float64(262), int64(1)
memory usage: 567.2 MB


### Data preparation for PyTorch

In [138]:
df_torch = df_coded.to_numpy()

In [210]:
df_torch.shape

(282653, 263)

In [140]:
autos = torch.from_numpy(df_torch)

In [141]:
autos = autos.to(torch.double)

In [142]:
autos.dtype

torch.float64

In [143]:
autos.shape

torch.Size([282653, 263])

In [144]:
features = autos[:, :-1]
features[:-1], features.shape

(tensor([[-1.3516, -0.2882,  0.6086,  ...,  0.0000,  0.0000,  0.0000],
         [ 1.0021,  1.3517, -0.0385,  ...,  0.0000,  0.0000,  1.0000],
         [ 0.0868,  0.8308, -0.0385,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.2175, -0.2882, -2.7567,  ...,  1.0000,  0.0000,  0.0000],
         [-0.4363, -0.3654, -0.0385,  ...,  0.0000,  0.0000,  0.0000],
         [-0.9593, -0.3461,  0.6086,  ...,  0.0000,  0.0000,  0.0000]],
        dtype=torch.float64),
 torch.Size([282653, 262]))

In [145]:
features = features.to(torch.double)

In [146]:
features.dtype

torch.float64

In [147]:
target = autos[:, -1].unsqueeze(1)
target,  target.shape

(tensor([[  480.],
         [18300.],
         [ 9800.],
         ...,
         [ 1199.],
         [ 9200.],
         [ 3400.]], dtype=torch.float64),
 torch.Size([282653, 1]))

In [148]:
target = target.to(torch.double)

In [149]:
target.dtype

torch.float64

In [150]:
n_samples = features.shape[0]
n_val = int(0.2 * n_samples)
n_samples, n_val

(282653, 56530)

features_mean = torch.mean(features, dim=0)
features_var = torch.var(features, dim=0)
features_mean, features_var

features_normalized = (features - features_mean) / torch.sqrt(features_var)
features_normalized, features_normalized.shape

In [151]:
shuffled_indices = torch.randperm(n_samples)
train_indices = shuffled_indices[:-n_val]
val_indices = shuffled_indices[-n_val:]

In [152]:
y_train = target[train_indices]
X_train = features[train_indices]

y_val = target[val_indices]
X_val = features[val_indices]

In [153]:
y_train.shape, X_train.shape

(torch.Size([226123, 1]), torch.Size([226123, 262]))

In [154]:
y_val.shape, X_val.shape

(torch.Size([56530, 1]), torch.Size([56530, 262]))

In [155]:
X_train

tensor([[ 1.3944, -0.6548, -2.2389,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.6559, -0.2882,  0.6086,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.7867, -0.2882,  0.6086,  ...,  1.0000,  0.0000,  0.0000],
        ...,
        [ 0.2175, -0.3654, -1.4623,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3055, -0.7320,  0.6086,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0868, -0.8671, -0.0385,  ...,  0.0000,  0.0000,  1.0000]],
       dtype=torch.float64)

### Model creation and train

In [189]:
device = (torch.device('mps') if torch.backends.mps.is_available()
          else torch.device('cpu'))
print(f'Training on device {device}')

Training on device mps


In [190]:
torch.has_mps

  torch.has_mps


True

In [191]:
def training_loop(n_epochs, optimizer, model, loss_fn, X_train, X_val,
              y_train, y_val):
    model.to(device=device)
    X_train = X_train.to(device=device)
    X_val = X_val.to(device=device)
    y_train = y_train.to(device=device)
    y_val = y_val.to(device=device)
    for epoch in range(1, n_epochs + 1):
        pred_train = model(X_train)
        loss_train = loss_fn(pred_train, y_train)
        
        pred_val = model(X_val)
        loss_val = loss_fn(pred_val, y_val)
        
        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()
        if epoch == 1 or epoch % 500 == 0:
            print(f'Epoch {epoch}, Training loss {loss_train.item():.4f},'
                  f'Validation loss {loss_val.item():.4f}')

In [196]:
seq_model_autos = nn.Sequential(
    nn.Linear(262, 500),
    nn.Sigmoid(),
    nn.Linear(500, 500),
    nn.BatchNorm1d(500),
    nn.ReLU(),
    nn.Linear(500, 1)
)
optimizer = optim.SGD(seq_model_autos.parameters(), lr=0.01)

In [197]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()

    def forward(self, x, y):
        criterion = nn.MSELoss()
        loss = torch.sqrt(criterion(x, y))
        return loss

In [198]:
seq_model_autos

Sequential(
  (0): Linear(in_features=262, out_features=500, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=500, out_features=500, bias=True)
  (3): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): ReLU()
  (5): Linear(in_features=500, out_features=1, bias=True)
)

In [199]:
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    model=seq_model_autos,
    loss_fn=RMSELoss(),
    X_train=X_train.float(),
    X_val=X_val.float(),
    y_train=y_train.float(),
    y_val=y_val.float()
)

Epoch 1, Training loss 6585.5625,Validation loss 6599.3320
Epoch 500, Training loss 2282.4141,Validation loss 2307.5881
Epoch 1000, Training loss 2032.6580,Validation loss 2060.1404
Epoch 1500, Training loss 1950.7810,Validation loss 1983.1288
Epoch 2000, Training loss 1926.8907,Validation loss 1968.0931
Epoch 2500, Training loss 1915.1644,Validation loss 1954.7045
Epoch 3000, Training loss 1875.3187,Validation loss 1926.0717
Epoch 3500, Training loss 1866.2634,Validation loss 1920.2349
Epoch 4000, Training loss 1858.9139,Validation loss 1913.3127
Epoch 4500, Training loss 1845.7375,Validation loss 1906.8220
Epoch 5000, Training loss 1841.5635,Validation loss 1903.2480


In [200]:
torch.save(seq_model_autos.state_dict(), 'vehicle_cost.pt')

In [201]:
model = torch.load('vehicle_cost.pt', weights_only=True)

In [209]:
for layer_name, weights in model.items():
    print(layer_name, weights.shape)

0.weight torch.Size([500, 262])
0.bias torch.Size([500])
2.weight torch.Size([500, 500])
2.bias torch.Size([500])
3.weight torch.Size([500])
3.bias torch.Size([500])
3.running_mean torch.Size([500])
3.running_var torch.Size([500])
3.num_batches_tracked torch.Size([])
5.weight torch.Size([1, 500])
5.bias torch.Size([1])
