In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('vehicles_1.csv',
                 dtype={
                     'region': 'object',
                     'price': 'int32',
                     'year': 'Int16',
                     'manufacturer': 'object',
                     'model': 'object',
                     'condition': 'object',
                     'cylinders': 'Int16',
                     'fuel': 'object',
                     'odometer': 'Int32',
                     'title_status': 'object',
                     'transmission': 'object',
                     'drive': 'object',
                     'type': 'object',
                     'paint_color': 'object',
                     'state': 'object',
                     'lat': 'float64',
                     'long': 'float64',
                     'posting_date': 'int64',
                     'VIN_country': 'object',
                     'VIN_manufacturer': 'object',
                     'VIN_model_engine_type': 'object',
                     'VIN_security_code': 'object',
                     'VIN_year': 'object',
                     'VIN_plant': 'object'
                 }
                )

In [2]:
def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

s = (df.dtypes == 'object')
object_cols = list(s[s].index)
low_cardinality_cols = [col for col in object_cols if df[col].nunique() < 10]
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

df[high_cardinality_cols] = ordinal_encoder.fit_transform(df[high_cardinality_cols])

for col in low_cardinality_cols:
    df = onehot_encode(df, col, col+'_')

In [3]:
print(low_cardinality_cols)
print(high_cardinality_cols)

['condition', 'fuel', 'title_status', 'transmission', 'drive']
['paint_color', 'VIN_plant', 'state', 'model', 'region', 'VIN_security_code', 'VIN_year', 'type', 'VIN_model_engine_type', 'manufacturer', 'VIN_country', 'VIN_manufacturer']


# Train Test Split, Scale

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = df.dropna()
df = df.reset_index()

X = df.drop(['price'], axis = 1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

scaler = StandardScaler()

X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

In [5]:
print(X_train.values)
print(df.info())
print(X_train.info())

[[52670 371.0 2016 ... True False False]
 [36788 42.0 2009 ... False True False]
 [40607 61.0 2013 ... False True False]
 ...
 [16941 369.0 2018 ... True False False]
 [18624 288.0 2017 ... True False False]
 [8763 373.0 2018 ... True False False]]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57030 entries, 0 to 57029
Data columns (total 43 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     57030 non-null  int64  
 1   region                    57030 non-null  float64
 2   price                     57030 non-null  int32  
 3   year                      57030 non-null  Int16  
 4   manufacturer              57030 non-null  float64
 5   model                     57030 non-null  float64
 6   cylinders                 57030 non-null  Int16  
 7   odometer                  57030 non-null  Int32  
 8   type                      57030 non-null  float64
 9   paint_color               57030 

In [6]:
X_train.head()

Unnamed: 0,index,region,year,manufacturer,model,cylinders,odometer,type,paint_color,state,...,title_status__missing,title_status__parts only,title_status__rebuilt,title_status__salvage,transmission__automatic,transmission__manual,transmission__other,drive__4wd,drive__fwd,drive__rwd
52384,52670,371.0,2016,36.0,4097.0,4,85875,0.0,9.0,46.0,...,False,False,False,False,True,False,False,True,False,False
36577,36788,42.0,2009,8.0,4835.0,6,120000,9.0,3.0,34.0,...,False,False,False,False,True,False,False,False,True,False
40375,40607,61.0,2013,13.0,3002.0,4,114478,9.0,10.0,35.0,...,False,False,False,False,True,False,False,False,True,False
56634,56944,213.0,2014,7.0,3733.0,4,81597,9.0,5.0,48.0,...,False,False,False,False,True,False,False,False,True,False
2359,2368,360.0,2004,23.0,4659.0,6,183912,0.0,9.0,3.0,...,False,False,False,False,True,False,False,False,True,False


# TensorFlow

import tensorflow as tf

X_shape = X_train.shape[1]

inputs = tf.keras.Input(shape=(X_shape,))
hidden = tf.keras.layers.Dense(64, activation='relu')(inputs)
hidden = tf.keras.layers.Dense(64, activation='relu')(hidden)
outputs = tf.keras.layers.Dense(1, activation='linear')(hidden)

tf_model = tf.keras.Model(inputs, outputs)

tf_model.compile(
    optimizer='adam',
    loss='mse'
)

history = tf_model.fit(
    X_train,
    y_train, 
    validation_split=0.12,
    batch_size=32,
    epochs=10
)

tf_rmse = np.sqrt(tf_model.evaluate(X_test, y_test))

# PyTorch

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch_X_train = torch.tensor(X_train.values.astype(np.float32)).type(torch.float32)
torch_X_test = torch.tensor(X_test.values.astype(np.float32)).type(torch.float32)
torch_y_train = torch.tensor(np.array(y_train)).type(torch.float32)
torch_y_test = torch.tensor(np.array(y_test)).type(torch.float32)

shape = torch_X_train.shape[1]

class SimpleNet(nn.Module):
    def __init__(self, size):
        super(SimpleNet, self).__init__()
        self.layer1 = nn.Linear(size, 64)
        self.layer2 = nn.Linear(64, 64)
        self.out = nn.Linear(64, 1)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.out(x)
        return x

simple_net = SimpleNet(torch_X_train.shape[1])

optimizer = torch.optim.Adam(simple_net.parameters(), lr=0.01)
criterion = nn.MSELoss()

for x, target in zip(torch_X_train, torch_y_train):
    optimizer.zero_grad()
    output = simple_net(x)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step
    
total_loss = 0

for x, target in zip(torch_X_test, torch_y_test):
    output = simple_net(x)
    loss = criterion(output, target)
    total_loss += loss
    
avg_loss = total_loss/len(torch_X_test)

torch_rmse = torch.sqrt(avg_loss).detach().numpy()

  return torch._C._cuda_getDeviceCount() > 0
  return F.mse_loss(input, target, reduction=self.reduction)


# Results

In [8]:
#print('TensorFlow RMSE: ', tf_rmse)
print('   PyTorch RMSE: ', torch_rmse)

   PyTorch RMSE:  30793.465
