In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetRegressor
from functions import *

In [9]:
#apartment rental data
price_data_array : list = [
    './data/apartments_pl_2023_08.csv',
    './data/apartments_pl_2023_09.csv', 
    './data/apartments_pl_2023_10.csv',
    './data/apartments_pl_2023_11.csv', 
    './data/apartments_pl_2023_12.csv',
    './data/apartments_pl_2024_01.csv', 
    './data/apartments_pl_2024_02.csv',
    './data/apartments_pl_2024_03.csv', 
    './data/apartments_pl_2024_04.csv' 
]

price_data_array_rent : list = [
    './data/apartments_rent_pl_2023_11.csv', 
    './data/apartments_rent_pl_2023_12.csv',
    './data/apartments_rent_pl_2024_01.csv', 
    './data/apartments_rent_pl_2024_02.csv',
    './data/apartments_rent_pl_2024_03.csv', 
    './data/apartments_rent_pl_2024_04.csv' 
]

data = pd.concat([pd.read_csv(data_set)for data_set in price_data_array])
data.head()

Unnamed: 0,id,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
0,f8524536d4b09a0c8ccc0197ec9d7bde,szczecin,blockOfFlats,63.0,3.0,4.0,10.0,1980.0,53.378933,14.625296,...,0.413,condominium,concreteSlab,,yes,yes,yes,no,yes,415000
1,accbe77d4b360fea9735f138a50608dd,szczecin,blockOfFlats,36.0,2.0,8.0,10.0,,53.442692,14.55969,...,0.205,cooperative,concreteSlab,,no,yes,yes,no,yes,395995
2,8373aa373dbc3fe7ca3b7434166b8766,szczecin,tenement,73.02,3.0,2.0,3.0,,53.452222,14.553333,...,0.28,condominium,brick,,no,no,no,no,no,565000
3,0a68cd14c44ec5140143ece75d739535,szczecin,tenement,87.6,3.0,2.0,3.0,,53.4351,14.5329,...,0.087,condominium,brick,,yes,yes,no,no,yes,640000
4,f66320e153c2441edc0fe293b54c8aeb,szczecin,blockOfFlats,66.0,3.0,1.0,3.0,,53.410278,14.503611,...,0.514,condominium,,,no,no,no,no,no,759000


In [10]:
data = data.drop('id', axis=1)

numerical_columns = ['squareMeters', 'rooms', 'floor', 'floorCount', 'buildYear', 'latitude', 'longitude', 'centreDistance', 'poiCount', 'schoolDistance', 'clinicDistance', 'postOfficeDistance', 'kindergartenDistance', 'restaurantDistance', 'collegeDistance', 'pharmacyDistance']
categorical_columns = ['city', 'type', 'ownership', 'buildingMaterial', 'condition']
boolean_columns = ['hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom']
drop_columns = []
output_column = 'price'


data = data.drop(drop_columns, axis=1)

fill_na(data, numerical_columns, 'mean')
fill_na(data, boolean_columns, 'false')

data = pd.get_dummies(data, columns=categorical_columns)
data = pd.get_dummies(data, columns=boolean_columns, drop_first=True).astype(int)

normalize_numerical_columns(data, numerical_columns)

In [11]:
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[(data['price'] >= lower_bound)&(data['price'] <= upper_bound)]

In [12]:
X = data.drop('price', axis=1)
Y = data['price']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_validate, Y_train, Y_validate = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train = np.array(X_train, dtype=np.float32)
X_test  = np.array(X_test, dtype=np.float32)
X_validate = np.array(X_validate, dtype=np.float32)
Y_train = np.array(Y_train, dtype=np.float32).reshape(-1, 1)
Y_test  = np.array(Y_test, dtype=np.float32).reshape(-1, 1)
Y_validate  = np.array(Y_validate, dtype=np.float32).reshape(-1, 1)

In [13]:
tab_regressor = TabNetRegressor(
    n_d=64,
    n_a=64,
    n_steps = 5,
    gamma=1.5,
    lambda_sparse=0,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    scheduler_params={"step_size": 10, "gamma": 0.9}
)



In [14]:
tab_regressor.fit(
    X_train=X_train, y_train=Y_train,
    eval_set=[(X_validate, Y_validate), (X_train, Y_train)],
    max_epochs=15,
    patience=20,
    batch_size=128, 
    virtual_batch_size=16,
    num_workers=0,
    drop_last=False
)

epoch 0  | loss: 473369834339.98| val_0_mse: 300008243200.0| val_1_mse: 300252495872.0|  0:01:32s
epoch 1  | loss: 178067117757.56345| val_0_mse: 52504137728.0| val_1_mse: 52543705088.0|  0:03:09s
epoch 2  | loss: 40723104860.37562| val_0_mse: 19863191552.0| val_1_mse: 20014555136.0|  0:04:45s
epoch 3  | loss: 24665572242.06246| val_0_mse: 24988045312.0| val_1_mse: 24892733440.0|  0:06:17s
epoch 4  | loss: 21585620865.97568| val_0_mse: 16854514688.0| val_1_mse: 16822658048.0|  0:07:48s


KeyboardInterrupt: 

In [None]:
y_pred = tab_regressor.predict(Y_test)

evaluate_model(Y_test, y_pred)
notify(1000)