In [1]:
import datetime as dt
import torch
import torch.nn as nn
import pandas as pd

connection_string = "postgresql://postgres:postgres@localhost:5432/property_db"

## Insert example data to database
https://www.kaggle.com/datasets/dawidcegielski/house-prices-in-poland

In [5]:
housing = pd.read_csv('Houses.csv', encoding='latin1', index_col=0)
display(housing)

Unnamed: 0,address,city,floor,id,latitude,longitude,price,rooms,sq,year
0,Podgórze Zab³ocie Stanis³awa Klimeckiego,Kraków,2.0,23918.0,50.049224,19.970379,749000.0,3.0,74.05,2021.0
1,Praga-Po³udnie Grochowska,Warszawa,3.0,17828.0,52.249775,21.106886,240548.0,1.0,24.38,2021.0
2,Krowodrza Czarnowiejska,Kraków,2.0,22784.0,50.066964,19.920025,427000.0,2.0,37.00,1970.0
3,Grunwald,Poznañ,2.0,4315.0,52.404212,16.882542,1290000.0,5.0,166.00,1935.0
4,Ochota Gotowy budynek. Stan deweloperski. Osta...,Warszawa,1.0,11770.0,52.212225,20.972630,996000.0,5.0,105.00,2020.0
...,...,...,...,...,...,...,...,...,...,...
23759,Stare Miasto Naramowice,Poznañ,0.0,3976.0,52.449649,16.949408,543000.0,4.0,77.00,2020.0
23760,W³ochy,Warszawa,4.0,10206.0,52.186109,20.948438,910000.0,3.0,71.00,2017.0
23761,Nowe Miasto Malta ul. Katowicka,Poznañ,0.0,4952.0,52.397345,16.961939,430695.0,3.0,50.67,2022.0
23762,Podgórze Duchackie Walerego S³awka,Kraków,6.0,24148.0,50.024231,19.959569,359000.0,2.0,38.86,2021.0


In [6]:
housing["region"] = housing["address"].map(lambda x: x.split(" ")[0])
housing["property_url"] = "https://www.kaggle.com/datasets/dawidcegielski/house-prices-in-poland/" + housing["id"].astype(str)
housing["price_per_square_meter"] = housing["price"] / housing["sq"]
housing.rename(columns={"sq": "area", "year": "year_built", "price": "total_price", "address": "title"}, inplace=True)
housing.drop(columns=["id"], inplace=True)
housing["date_added"] = dt.datetime(2021, 2, 1)
display(housing)

Unnamed: 0,title,city,floor,latitude,longitude,total_price,rooms,area,year_built,region,property_url,price_per_square_meter,date_added
0,Podgórze Zab³ocie Stanis³awa Klimeckiego,Kraków,2.0,50.049224,19.970379,749000.0,3.0,74.05,2021.0,Podgórze,https://www.kaggle.com/datasets/dawidcegielski...,10114.787306,2021-02-01
1,Praga-Po³udnie Grochowska,Warszawa,3.0,52.249775,21.106886,240548.0,1.0,24.38,2021.0,Praga-Po³udnie,https://www.kaggle.com/datasets/dawidcegielski...,9866.611977,2021-02-01
2,Krowodrza Czarnowiejska,Kraków,2.0,50.066964,19.920025,427000.0,2.0,37.00,1970.0,Krowodrza,https://www.kaggle.com/datasets/dawidcegielski...,11540.540541,2021-02-01
3,Grunwald,Poznañ,2.0,52.404212,16.882542,1290000.0,5.0,166.00,1935.0,Grunwald,https://www.kaggle.com/datasets/dawidcegielski...,7771.084337,2021-02-01
4,Ochota Gotowy budynek. Stan deweloperski. Osta...,Warszawa,1.0,52.212225,20.972630,996000.0,5.0,105.00,2020.0,Ochota,https://www.kaggle.com/datasets/dawidcegielski...,9485.714286,2021-02-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23759,Stare Miasto Naramowice,Poznañ,0.0,52.449649,16.949408,543000.0,4.0,77.00,2020.0,Stare,https://www.kaggle.com/datasets/dawidcegielski...,7051.948052,2021-02-01
23760,W³ochy,Warszawa,4.0,52.186109,20.948438,910000.0,3.0,71.00,2017.0,W³ochy,https://www.kaggle.com/datasets/dawidcegielski...,12816.901408,2021-02-01
23761,Nowe Miasto Malta ul. Katowicka,Poznañ,0.0,52.397345,16.961939,430695.0,3.0,50.67,2022.0,Nowe,https://www.kaggle.com/datasets/dawidcegielski...,8500.000000,2021-02-01
23762,Podgórze Duchackie Walerego S³awka,Kraków,6.0,50.024231,19.959569,359000.0,2.0,38.86,2021.0,Podgórze,https://www.kaggle.com/datasets/dawidcegielski...,9238.291302,2021-02-01


In [8]:
housing.to_sql("property_info", connection_string, schema="property_store", if_exists="append", index=False)

764

## Train model

In [2]:
import os
import datetime as dt
from pathlib import Path
from model import CustomNet
from feast import FeatureStore
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

from train import train_net, get_mae
from transform import get_ohe_encoding, normalize

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

fs = FeatureStore(repo_path="feature_repo_local")
queries_path = Path(os.getcwd()) / "queries"

Using device: cuda


In [3]:
from utils import load_query, load_config

config = load_config("cfgs/base_model.json")
entity_sql = load_query(queries_path / config["query_path"], **config["query_params"])
training_job = fs.get_historical_features(
    entity_sql, 
    features=config["model_metadata"]["features"],
)
training_df = training_job.to_df()
display(training_df)

  df = pd.read_sql(


Unnamed: 0,property_id,event_timestamp,city,region,floor,area,year_built,rooms,total_price
0,1,2024-09-01 22:01:06.548255,Kraków,Podgórze,2,74,2021,3,749000
1,2,2024-09-01 22:01:06.548255,Warszawa,Praga-Po³udnie,3,24,2021,1,240548
2,3,2024-09-01 22:01:06.548255,Kraków,Krowodrza,2,37,1970,2,427000
3,4,2024-09-01 22:01:06.548255,Poznañ,Grunwald,2,166,1935,5,1290000
4,5,2024-09-01 22:01:06.548255,Warszawa,Ochota,1,105,2020,5,996000
...,...,...,...,...,...,...,...,...,...
23759,23760,2024-09-01 22:01:06.548255,Poznañ,Stare,0,77,2020,4,543000
23760,23761,2024-09-01 22:01:06.548255,Warszawa,W³ochy,4,71,2017,3,910000
23761,23762,2024-09-01 22:01:06.548255,Poznañ,Nowe,0,51,2022,3,430695
23762,23763,2024-09-01 22:01:06.548255,Kraków,Podgórze,6,39,2021,2,359000


In [4]:
training_df = training_df.drop(columns=["event_timestamp", "property_id"])
training_df, ohe_encoder = get_ohe_encoding(training_df, config["model_metadata"]["ohe_features"])
training_df, scaler = normalize(training_df, config["model_metadata"]["numerical_features"])
training_df = training_df.astype("float32")
display(training_df)

Unnamed: 0,total_price,city_Kraków,city_Poznañ,city_Warszawa,region_Bemowo,region_Bia³o³êka,region_Bielany,region_Bie¿anów-Prokocim,region_Bieñczyce,region_Bronowice,...,region_mazowieckie,region_ma³opolskie,region_wielkopolskie,region_ródmiecie,region_£agiewniki-Borek,region_¯oliborz,floor,area,year_built,rooms
0,749000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.328302,-0.004400,0.423257,0.380557
1,240548.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.077638,-0.012053,0.423257,-1.626448
2,427000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.328302,-0.010063,-0.632359,-0.622945
3,1290000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.328302,0.009681,-1.356802,2.387563
4,996000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.734243,0.000345,0.402559,2.387563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23759,543000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.140183,-0.003941,0.402559,1.384060
23760,910000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.483579,-0.004859,0.340464,0.380557
23761,430695.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.140183,-0.007920,0.443956,0.380557
23762,359000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.295460,-0.009757,0.423257,-0.622945


In [5]:
train_df, test_df = train_test_split(training_df, test_size=0.2, random_state=42)
X_train = train_df.drop(columns=config["model_metadata"]["target"])
y_train = train_df[config["model_metadata"]["target"]]
train_loader = DataLoader(
    TensorDataset(
        torch.from_numpy(X_train.values).float().to(device),
        torch.from_numpy(y_train.values).float().to(device),
    ),
    batch_size=32,
    shuffle=True
)

test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
X_test = test_df.drop(columns=config["model_metadata"]["target"])
y_test = test_df[config["model_metadata"]["target"]]
X_val = val_df.drop(columns=config["model_metadata"]["target"])
y_val = val_df[config["model_metadata"]["target"]]
test_loader = DataLoader(
    TensorDataset(
        torch.from_numpy(X_test.values).float().to(device),
        torch.from_numpy(y_test.values).float().to(device),
    ),
    batch_size=32,
    shuffle=False
)
val_loader = DataLoader(
    TensorDataset(
        torch.from_numpy(X_val.values).float().to(device),
        torch.from_numpy(y_val.values).float().to(device),
    ),
    batch_size=32,
    shuffle=False
)

In [6]:
net = CustomNet(X_train.shape[1], 64, 3, 1).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

for i, (train_loss, eval) in enumerate(train_net(net, train_loader, val_loader, optimizer, criterion, get_mae, epochs=100, device=device, verbose=False)):
    if i % 10 == 0:
        print(f"Epoch: {i+1}, Training loss: {train_loss:.4f}, Validation MAE: {eval:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1, Training loss: 717815061853.3647, Validation MAE: 649125985.2800
Epoch: 11, Training loss: 712650554676.0605, Validation MAE: 643778082.4533
Epoch: 21, Training loss: 697701538834.9312, Validation MAE: 632686612.2667
Epoch: 31, Training loss: 677542213256.8202, Validation MAE: 616938899.2000
Epoch: 41, Training loss: 651291060089.7614, Validation MAE: 593933754.9867
Epoch: 51, Training loss: 620970515777.8286, Validation MAE: 568923762.0267
Epoch: 61, Training loss: 585842457467.4823, Validation MAE: 538637174.9333
Epoch: 71, Training loss: 549058920943.6504, Validation MAE: 499474856.9067
Epoch: 81, Training loss: 509410255770.4605, Validation MAE: 457090098.3467
Epoch: 91, Training loss: 469848826695.8521, Validation MAE: 418149280.0000


In [7]:
X, y = next(iter(test_loader))
y_pred = net(X)
print(f"Predicted: {y_pred[0].item()}, Actual: {y[0].item()}")

Predicted: 196827.125, Actual: 850000.0


In [9]:
from utils import save_model

save_model(net, config, ohe_encoder, None, scaler, connection_string=connection_string)

ProgrammingError: (psycopg2.ProgrammingError) can't adapt type 'OneHotEncoder'
[SQL: INSERT INTO model_store.models (model_name, model_version, model_description, model_metadata, model_binary, model_ohe_encoder, model_scaler) VALUES (%(model_name)s, %(model_version)s, %(model_description)s, %(model_metadata)s, %(model_binary)s, %(model_ohe_encoder)s, %(model_scaler)s)]
[parameters: {'model_name': 'base_model', 'model_version': '1.0', 'model_description': 'This is a base model for testing purposes', 'model_metadata': '{"features": ["property_info:city", "property_info:region", "property_info:floor", "property_info:area", "property_info:year_built", "property_info:r ... (40 characters truncated) ... rget": "total_price", "ohe_features": ["city", "region"], "categorical_features": [], "numerical_features": ["floor", "area", "year_built", "rooms"]}', 'model_binary': b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00\x14\x00model/data.pklFB\x10\x00ZZZZZZZZZZ ... (206458 characters truncated) ... K\x06\x07\x00\x00\x00\x00o!\x01\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x1d\x00\x1d\x00\xb7\x06\x00\x00\xb8\x1a\x01\x00\x00\x00', 'model_ohe_encoder': OneHotEncoder(), 'model_scaler': StandardScaler()}]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
from utils import load_model_from_directory

model, config, ohe_encoder, _, scaler = load_model_from_directory(config['model_name'], config['model_version'], Path(os.getcwd()) / "artifacts")
X, y = next(iter(test_loader))
y_pred = model(X)
print(f"Predicted: {y_pred[0].item()}, Actual: {y[0].item()}")