In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report which device was selected.
if device.type == 'cuda':
    print(f'Using GPU: {torch.cuda.get_device_name()}')
else:
    print('Using CPU')

In [None]:
# Notebook-wide pandas display settings: two decimals for floats and
# generous limits on how many columns/rows are rendered.
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100000

## Зареждане на данните

In [None]:
# Read the prepared car-offers table (path is relative to the notebook's directory).
dataset = pd.read_csv(filepath_or_buffer='../data/prepared-car-offers.csv')

In [None]:
# Preview 20 randomly chosen rows (unseeded, so the rows differ between runs).
dataset.sample(n=20)

In [None]:
# Remove the 'Region' column from the working frame.
# `dataset` is reassigned in place, so without errors='ignore' a second run of
# this cell would raise KeyError because the column is already gone.
dataset = dataset.drop(columns='Region', errors='ignore')

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, TargetEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Numeric features: replace missing values with 0, then scale every column by
# its maximum absolute value.
max_abs_scaling_num_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('scaler', MaxAbsScaler()),
])

# Categorical features: replace missing values with the literal 'Unknown',
# then one-hot encode; categories unseen during fit are ignored at transform time.
ohe_cat_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value='Unknown', strategy='constant')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Object-dtype columns with more than 6 distinct values are additionally
# target-encoded.
columns_for_target_encoding = (
    dataset
    .select_dtypes(include=object)
    .nunique()
    .loc[lambda distinct_counts: distinct_counts > 6]
    .index
)

# Combined categorical step. The one-hot selector matches every object column,
# so the columns listed in `columns_for_target_encoding` are encoded by both
# transformers.
# NOTE(review): confirm that double-encoding the high-cardinality columns is intended.
target_ohe_enc_cat_preprocessor = ColumnTransformer(
    transformers=[
        (
            'target_encoding',
            TargetEncoder(target_type='continuous', smooth=0.2),
            columns_for_target_encoding,
        ),
        (
            'one_hot_encoding',
            ohe_cat_preprocessor,
            make_column_selector(dtype_include=object),
        ),
    ],
    remainder='passthrough',
)

# Top-level preprocessor: object columns go through the categorical step,
# numeric columns through imputation + scaling, anything else is passed through.
default_preprocessor = ColumnTransformer(
    transformers=[
        (
            'categorical',
            target_ohe_enc_cat_preprocessor,
            make_column_selector(dtype_include=object),
        ),
        (
            'numerical',
            max_abs_scaling_num_preprocessor,
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder='passthrough',
)

In [None]:
# Separate the regression target ('Price') from the feature columns.
X, y = dataset.drop(columns='Price'), dataset['Price']

# Fit the preprocessing pipeline on the features (the target is passed because
# the target encoder needs it) and replace X with the encoded matrix.
# NOTE(review): the preprocessor is fitted on all rows before any train/test
# split happens, so held-out rows influence the fitted encoders/scalers —
# confirm whether that is acceptable.
X = default_preprocessor.fit_transform(X, y)

# Width of the encoded feature matrix; displayed as the cell output.
n_features = X.shape[-1]
n_features

In [None]:
from torch import nn, optim
import copy

In [None]:
class CarPriceModel(nn.Module):
    """Fully connected regressor: two ReLU-activated hidden layers and one scalar output.

    Args:
        n_features: Size of each input feature vector.
        n_hidden: Width of both hidden layers (default 256).
    """

    def __init__(self, n_features, n_hidden=256):
        super().__init__()
        # Layers are registered in the order they are applied in `forward`.
        self.fc1 = nn.Linear(in_features=n_features, out_features=n_hidden)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=n_hidden, out_features=n_hidden)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=n_hidden, out_features=1)

    def forward(self, x):
        """Map a batch of feature vectors to one predicted value per row."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Seed PyTorch's random number generator before the model is built so the
# random weight initialisation (and therefore the training run) is repeatable
# across "Restart & Run All".
torch.manual_seed(42)

# Build the network and move its parameters to the selected device.
model = CarPriceModel(n_features).to(device)

# Mean squared error objective, optimised with Adam at a learning rate of 0.001.
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def _to_float_tensor(features):
    """Return `features` as a float32 tensor on `device`.

    Accepts either a SciPy sparse matrix (anything exposing `.toarray()`) or an
    already-dense array-like. The preprocessor may return either, depending on
    how sparse its stacked output is, so both cases are handled here.
    """
    dense = features.toarray() if hasattr(features, 'toarray') else np.asarray(features)
    return torch.tensor(dense, dtype=torch.float32).to(device)


# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Features become dense float32 tensors; targets become float32 column vectors
# of shape (n, 1) so they line up with the model's single-output predictions.
X_train = _to_float_tensor(X_train)
X_test = _to_float_tensor(X_test)
y_train = torch.tensor(y_train.to_numpy(dtype=np.float32)).reshape(-1, 1).to(device)
y_test = torch.tensor(y_test.to_numpy(dtype=np.float32)).reshape(-1, 1).to(device)

n_epochs = 50
batch_size = 1000
# Start offset of every mini-batch, including a trailing partial one.
batch_start = torch.arange(0, len(X_train), batch_size)
# Derived from batch_start so the trailing partial batch is counted too;
# floor division would silently drop it (and yield zero batches when there are
# fewer than batch_size training rows).
batches_per_epoch = len(batch_start)

In [None]:
# Train for n_epochs, tracking the held-out MSE after every epoch and keeping
# a copy of the weights that achieved the lowest value.
# NOTE(review): the same held-out split is used both to pick the best epoch
# and to report the final score — confirm whether a separate validation split
# is wanted.
best_mse = np.inf
best_weights = None
history = []

for epoch in range(n_epochs):
    model.train()
    for i in range(batches_per_epoch):
        start = i * batch_size

        # take a batch (slicing past the end simply yields a shorter batch)
        X_batch = X_train[start:start+batch_size]
        y_batch = y_train[start:start+batch_size]

        # forward pass
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # update weights
        optimizer.step()

    # evaluate held-out MSE at the end of each epoch; no_grad avoids building
    # an autograd graph over the whole evaluation set
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
        mse = loss_fn(y_pred, y_test).item()
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        # deepcopy so later optimizer steps do not modify the saved tensors
        best_weights = copy.deepcopy(model.state_dict())

# restore the best weights and report the best held-out error
if best_weights is None:
    # Happens when no epoch produced an MSE below infinity (e.g. n_epochs == 0
    # or the loss was NaN every epoch); fail with an explicit message.
    raise RuntimeError('No epoch improved on the initial MSE; there are no weights to restore.')
model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))

# learning curve: held-out MSE per epoch
plt.plot(history)
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('Held-out MSE per epoch')
plt.show()