In [None]:
import psycopg2
from psycopg2.extensions import connection
from mes_proto_python.proto import offers_pb2
from google.protobuf.json_format import Parse
from typing import List

def read_proto_from_db(connection: connection) -> List[offers_pb2.Offer]:
    """Load every stored offer and decode it into an ``offers_pb2.Offer``.

    Runs ``SELECT details::TEXT FROM store.offers;`` and parses the first
    column of each returned row as protobuf JSON.

    Args:
        connection: An open psycopg2 connection. It is always closed before
            this function returns, whether or not the query succeeded.

    Returns:
        The decoded offers. If psycopg2 raises, the error is printed and the
        offers collected up to that point (possibly none) are returned.
    """
    offers = []
    try:
        with connection.cursor() as cur:
            cur.execute("""SELECT details::TEXT FROM store.offers;""")
            for record in cur.fetchall():
                offer = offers_pb2.Offer()
                Parse(record[0], offer)
                offers.append(offer)
    except psycopg2.Error as e:
        # Best effort: report the database error and fall through to the return.
        print(f"Error: {e}")
    finally:
        # The caller's connection is consumed by this call.
        connection.close()
    return offers


In [None]:
from typing import Dict, Any
import pandas as pd

def decimal_number_to_float(price: offers_pb2.Money):
    """Convert a scaled-integer decimal message into a plain Python number.

    Args:
        price: Any object exposing ``value`` (the unscaled number) and
            ``scale`` (the power of ten that ``value`` is divided by).
            NOTE(review): this notebook calls the function with
            ``offer.apartment.price.value`` and ``offer.apartment.size``, so
            the ``offers_pb2.Money`` annotation may not name the real argument
            type - confirm against the proto definition.

    Returns:
        ``price.value * 10 ** -price.scale``.
    """
    return price.value * 10 ** -price.scale


def offer_to_flat_dict(offer: offers_pb2.Offer) -> Dict[str, Any]:
    """Flatten one offer into a dict of plain values, one key per DataFrame column.

    Price and size are converted with ``decimal_number_to_float`` and then
    rescaled by two rules (presumably to repair mis-scaled listings - confirm
    with the data owner):

    * the price is divided by 100 when its unscaled integer is below 1000;
    * the size is multiplied by 100 when the converted size is not above 5.

    A ``year_built`` of 0 is reported as ``None``.
    """
    apartment = offer.apartment

    price = decimal_number_to_float(apartment.price.value)
    size = decimal_number_to_float(apartment.size)

    if apartment.price.value.value < 1000:
        price = price / 100
    if not size > 5:
        size = size * 100

    year_built = None if apartment.year_built == 0 else apartment.year_built

    return {
        'title': offer.title,
        'description': offer.description,
        'time_scraped': offer.time_scraped.ToDatetime(),
        'price': price,
        'size': size,
        'address': apartment.address,
        'latitude': apartment.location.latitude,
        'longitude': apartment.location.longitude,
        'year_built': year_built,
        'room_count': apartment.room_count,
        'floor': apartment.floor,
    }


def proto_list_to_dict_list(list: List[offers_pb2.Offer]):
    """Flatten each offer with ``offer_to_flat_dict``, skipping the ones that fail.

    An offer whose flattening raises is reported on stdout and left out of the
    result; the remaining offers keep their input order.
    """
    flattened = []
    for offer in list:
        try:
            row = offer_to_flat_dict(offer)
        except Exception as e:
            # Best effort: one bad offer must not abort the whole conversion.
            print(f"Error processing offer: {e}")
        else:
            flattened.append(row)
    return flattened


def proto_list_to_dataframe(list: List[offers_pb2.Offer]):
    """Flatten the offers with ``proto_list_to_dict_list`` and wrap the rows in a DataFrame."""
    rows = proto_list_to_dict_list(list)
    return pd.DataFrame(rows)

In [None]:
import matplotlib.pyplot as plt

def plot_loss(history):
    """Plot training and validation loss per epoch from a Keras ``History`` object.

    Reads ``history.history['loss']`` and ``history.history['val_loss']`` and
    draws both on the current matplotlib axes.
    """
    for series in ('loss', 'val_loss'):
        plt.plot(history.history[series], label=series)
    plt.xlabel('Epoch')
    plt.ylabel('Error [price]')
    plt.legend()
    plt.grid(True)

In [None]:
import psycopg2
from mes_proto_python.proto import offers_pb2
from google.protobuf.json_format import Parse

import os

# Connection settings are read from the standard libpq environment variables
# when they are set. The fallbacks are the local development values this
# notebook was written against; export PGPASSWORD (and the others) instead of
# editing this cell when pointing the notebook at a database with real
# credentials.
connection = psycopg2.connect(
    dbname=os.environ.get('PGDATABASE', 'admin'),
    user=os.environ.get('PGUSER', 'admin'),
    password=os.environ.get('PGPASSWORD', 'mysecretpassword'),
    host=os.environ.get('PGHOST', 'localhost'),
    port=os.environ.get('PGPORT', '6020'),
)

In [None]:
# Fetch and decode every stored offer. read_proto_from_db closes `connection`
# when it finishes, so the connection cell must be re-run before running this
# cell a second time.
offers_proto = read_proto_from_db(connection)

In [None]:
# One DataFrame row per offer; offers that fail to flatten are printed and
# left out by proto_list_to_dict_list.
offers_dataframe = proto_list_to_dataframe(offers_proto)

In [None]:
import numpy as np

def clear_dataframe(offers_dataframe):
    """Filter offers to the latitude/longitude box and prepare numeric features.

    Steps:
      1. Keep rows with latitude in [52, 53.2] and longitude in [20, 22].
      2. Drop the non-numeric columns ``time_scraped``, ``title``,
         ``description`` and ``address``.
      3. Replace implausible values with NaN: a ``room_count`` of 0 or above
         50, and a ``year_built`` below 1800.
      4. For every remaining column add a ``<column>_is_nan`` 0/1 indicator.

    Args:
        offers_dataframe: Frame containing at least the columns named above
            plus ``latitude``, ``longitude``, ``room_count`` and ``year_built``.
            It is not modified.

    Returns:
        A new DataFrame with the filtered rows, the numeric columns and their
        ``_is_nan`` indicator columns.
    """
    in_latitude = offers_dataframe['latitude'].between(52, 53.2)
    in_longitude = offers_dataframe['longitude'].between(20, 22)
    numeric = offers_dataframe[in_latitude & in_longitude].drop(
        columns=['time_scraped', 'title', 'description', 'address']
    )

    # Series.mask returns a new column that is upcast as needed. Assigning NaN
    # into an integer column through .loc relies on silent upcasting, which
    # pandas has deprecated.
    room_count = numeric['room_count']
    numeric['room_count'] = room_count.mask((room_count > 50) | (room_count == 0))
    year_built = numeric['year_built']
    numeric['year_built'] = year_built.mask(year_built < 1800)

    # Snapshot the column list first: the loop adds columns while it runs.
    for col in list(numeric.columns):
        numeric[col + '_is_nan'] = numeric[col].isna().astype(int)
    return numeric

def clear_dataframe2(offers_dataframe):
    """Run ``clear_dataframe`` and keep only rows priced at most 10,000,000."""
    cleaned = clear_dataframe(offers_dataframe)
    within_price_cap = cleaned['price'] <= 10_000_000
    return cleaned[within_price_cap]

In [None]:
import seaborn as sns

# Build the cleaned numeric frame that is split into train/test sets below,
# then plot pairwise relationships between the value columns (the *_is_nan
# indicator columns are left out of the plot).
clean_dataframe = clear_dataframe(offers_dataframe)
sns.pairplot(clean_dataframe[['latitude', 'longitude', 'size', 'price', 'year_built', 'room_count', 'floor']], diag_kind='kde')

In [None]:
def split(dataframe):
    """Split a frame 80/20 into train and test sets and separate the ``price`` label.

    The training rows are drawn with ``sample(frac=0.8, random_state=0)``; the
    test set is every row whose index is not in the training sample.

    Returns:
        ``(train_features, train_label, train_dataset, test_features,
        test_label, test_dataset)``. The ``*_dataset`` frames still contain
        ``price``; the ``*_features`` frames are copies without it.
    """
    def _features_and_label(frame):
        # Work on a copy so the dataset frame keeps its 'price' column.
        features = frame.copy()
        label = features.pop('price')
        return features, label

    train_dataset = dataframe.sample(frac=0.8, random_state=0)
    test_dataset = dataframe.drop(train_dataset.index)

    train_features, train_label = _features_and_label(train_dataset)
    test_features, test_label = _features_and_label(test_dataset)

    return train_features, train_label, train_dataset, test_features, test_label, test_dataset


# 80/20 split of the cleaned frame. The *_features / *_label pairs feed the
# Keras and KNN models; the *_dataset frames (price included) feed the TF-DF
# models, which take the label column by name.
train_features, train_label, train_dataset, test_features, test_label, test_dataset = split(clean_dataframe)

In [None]:

import tensorflow as tf
from tensorflow.keras import backend as K

def r_squared(y_true, y_pred):
    """Coefficient of determination computed with Keras backend ops.

    Returns ``1 - SS_res / (SS_tot + K.epsilon())`` where ``SS_res`` is the
    sum of squared residuals and ``SS_tot`` the sum of squared deviations of
    ``y_true`` from its mean, both taken over the tensors passed in.
    """
    residual_sum_of_squares = K.sum(K.square(y_true - y_pred))
    total_sum_of_squares = K.sum(K.square(y_true - K.mean(y_true)))
    # epsilon keeps the division finite when y_true is constant.
    return 1 - residual_sum_of_squares / (total_sum_of_squares + K.epsilon())

# Metrics passed to compile() for every Keras and TF-DF model in this notebook.
# The order matters: the evaluate* helpers read the values returned by
# model.evaluate() by position (mae, mape, r_squared, root mean squared error).
metrics=[
    'mae',
    'mape',
    r_squared,
    tf.metrics.RootMeanSquaredError(),
]

# Linear Regression

In [None]:
import tensorflow as tf

class MeanImputerLayer(tf.keras.layers.Layer):
    """Keras layer that replaces NaN inputs with stored per-column means.

    The means are computed once, at construction time, from ``data`` via
    ``data.mean().values`` and kept in a non-trainable float32 variable.
    """

    def __init__(self, data, **kwargs):
        super().__init__(**kwargs)
        self.mean_values = tf.Variable(
            data.mean().values, trainable=False, dtype=tf.float32
        )

    def call(self, inputs):
        """Return ``inputs`` with every NaN swapped for the stored mean."""
        missing = tf.math.is_nan(inputs)
        return tf.where(missing, self.mean_values, inputs)


In [None]:
def train_model(train_features, train_label, optimizer, loss_function, epochs, metrics):
    """Train a linear regression: mean imputation followed by one Dense unit.

    Args:
        train_features: Feature frame; also used to compute the imputation means.
        train_label: Target values aligned with ``train_features``.
        optimizer: Optimizer passed to ``compile``.
        loss_function: Loss passed to ``compile``.
        epochs: Number of training epochs.
        metrics: Metrics passed to ``compile``.

    Returns:
        ``(model, history)`` - the fitted model and the object returned by
        ``fit``. 20% of the training data is held out for validation.
    """
    layers = [
        MeanImputerLayer(train_features),
        tf.keras.layers.Dense(units=1),
    ]
    model = tf.keras.Sequential(layers)
    model.compile(optimizer=optimizer, loss=loss_function, metrics=metrics)

    fit_options = dict(epochs=epochs, verbose=0, validation_split=0.2)
    history = model.fit(train_features, train_label, **fit_options)

    return model, history


def evaluate(model, test_features, test_label):
    """Evaluate ``model`` on the test data and label the returned values.

    ``model.evaluate`` must return the loss followed by at least four metric
    values, in the order mae, mape, r_squared, root mean squared error.

    Returns:
        Dict with keys ``loss``, ``mae``, ``mape``, ``r_squared`` and
        ``root_mean_squared_error``.
    """
    loss, *scores = model.evaluate(test_features, test_label)
    report = {'loss': loss}
    metric_names = ('mae', 'mape', 'r_squared', 'root_mean_squared_error')
    for position, name in enumerate(metric_names):
        report[name] = scores[position]
    return report


In [None]:
# Grid search for the linear model: every combination of loss, epoch budget
# and Adam learning rate below trains a fresh model (2 x 3 x 4 = 24 runs) and
# scores it on the held-out test split.
optimizer = tf.keras.optimizers.legacy.Adam
loss_functions = [
    'mean_squared_error',
    'mean_absolute_error'
]
epochs = [
    2_000,
    5_000,
    10_000
]
learning_rates = [
    0.05,
    0.2,
    0.4,
    0.5
]

# Maps a textual description of each run to the dict returned by evaluate().
results_lr = {}

for loss_function in loss_functions:
    for epoch in epochs:
        for learning_rate in learning_rates:
            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            model, _ = train_model(train_features.copy(), train_label.copy(), optimizer(learning_rate=learning_rate), loss_function, epoch, metrics)
            print(optimizer, loss_function, epoch, learning_rate)
            results_lr[f"{optimizer}, {loss_function}, {epoch}, {learning_rate}"] = evaluate(model, test_features, test_label)
            print('\n\n\n')

In [None]:
# Print every linear-regression run, ordered by ascending test MAE
# (lowest-error run first).
sorted_keys_lr = sorted(results_lr, key=lambda k: results_lr[k]['mae'])
for key in sorted_keys_lr:
    print(f"Model: {key}")
    for metric, value in results_lr[key].items():
        print(f"{metric}: {value}")
    print("-----------------")

# DNN Regression

In [None]:
def train_model_dnn(train_features, train_label, optimizer, loss_function, epochs, metrics, activation):
    """Train a DNN regressor: mean imputation, three 128-unit layers, one output unit.

    Args:
        train_features: Feature frame; also used to compute the imputation means.
        train_label: Target values aligned with ``train_features``.
        optimizer: Optimizer passed to ``compile``.
        loss_function: Loss passed to ``compile``.
        epochs: Number of training epochs.
        metrics: Metrics passed to ``compile``.
        activation: Activation used by each of the three hidden layers.

    Returns:
        ``(model, history)`` - the fitted model and the object returned by
        ``fit``. 20% of the training data is held out for validation.
    """
    stack = [MeanImputerLayer(train_features)]
    stack.extend(tf.keras.layers.Dense(128, activation=activation) for _ in range(3))
    stack.append(tf.keras.layers.Dense(units=1))

    network = tf.keras.Sequential(stack)
    network.compile(optimizer=optimizer, loss=loss_function, metrics=metrics)

    fit_options = dict(epochs=epochs, verbose=0, validation_split=0.2)
    history = network.fit(train_features, train_label, **fit_options)

    return network, history


def evaluate_dnn(model, test_features, test_label):
    """Evaluate a DNN model on the test data and label the returned values.

    ``model.evaluate`` must return the loss followed by at least four metric
    values, in the order mae, mape, r_squared, root mean squared error.

    Returns:
        Dict with keys ``loss``, ``mae``, ``mape``, ``r_squared`` and
        ``root_mean_squared_error``.
    """
    loss, *scores = model.evaluate(test_features, test_label)
    labels = ['mae', 'mape', 'r_squared', 'root_mean_squared_error']
    return {'loss': loss, **{label: scores[i] for i, label in enumerate(labels)}}


In [None]:
# Grid search for the DNN: each combination of loss, optimizer, epoch budget,
# activation and learning rate trains a fresh model and scores it on the test
# split. With the lists below that is 2 x 3 x 4 x 10 x 4 = 960 training runs.
optimizers = [
    # tf.keras.optimizers.legacy.SGD,
    tf.keras.optimizers.legacy.RMSprop,
    tf.keras.optimizers.legacy.Adam,
    tf.keras.optimizers.legacy.Adagrad,
    # tf.keras.optimizers.legacy.Adadelta,
    # tf.keras.optimizers.legacy.Ftrl
]
loss_functions = [
    'mean_squared_error',
    'mean_absolute_error'
]
epochs = [
    1_000,
    2_000,
    5_000,
    10_000
]
learning_rates = [
    0.01,
    0.1,
    0.3,
    0.5
]
activation_functions = [
    "elu",
    "exponential",
    "relu",
    "selu",
    "sigmoid",
    "softmax",
    "softplus",
    "softsign",
    "tanh",
    "swish"
]

# Maps a textual description of each run to the dict returned by evaluate_dnn().
results_dnn = {}

for loss_function in loss_functions:
    for optimizer in optimizers:
        for epoch in epochs:
            for activation_function in activation_functions:
                for learning_rate in learning_rates:
                    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                    model, _ = train_model_dnn(train_features.copy(), train_label.copy(), optimizer(learning_rate=learning_rate), loss_function, epoch, metrics, activation_function)
                    print(optimizer, loss_function, epoch, learning_rate)
                    results_dnn[f"{optimizer}, {loss_function}, {epoch}, {learning_rate} {activation_function}"] = evaluate_dnn(model, test_features.copy(), test_label.copy())
                    print('\n\n\n')


In [None]:
# Print every DNN run, ordered by ascending test MAE (lowest-error run first).
sorted_keys_dnn = sorted(results_dnn, key=lambda k: results_dnn[k]['mae'])
for key in sorted_keys_dnn:
    print(f"Model: {key}")
    for metric, value in results_dnn[key].items():
        print(f"{metric}: {value}")
    print("-----------------")

# Random Forest

In [None]:
import tensorflow_decision_forests as tfdf

def train_model_rf(train_dataset, metrics, num_trees, max_num_nodes, max_depth, split_axis, growing_strategy):
    """Train a TF-DF random forest regressor on ``train_dataset``.

    Args:
        train_dataset: DataFrame that includes the ``price`` label column.
        metrics: Metrics passed to ``compile``.
        num_trees, max_num_nodes, max_depth, split_axis, growing_strategy:
            Forwarded unchanged to ``tfdf.keras.RandomForestModel``.

    Returns:
        ``(model, history)`` - the fitted model and the object returned by ``fit``.
    """
    regression = tfdf.keras.Task.REGRESSION
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_dataset, label='price', task=regression)

    hyperparameters = dict(
        num_trees=num_trees,
        max_depth=max_depth,
        max_num_nodes=max_num_nodes,
        split_axis=split_axis,
        growing_strategy=growing_strategy,
    )
    forest = tfdf.keras.RandomForestModel(task=regression, **hyperparameters)
    forest.compile(metrics=metrics)

    history = forest.fit(train_ds, verbose=0)

    return forest, history


def evaluate_rf(model, test_dataset):
    """Evaluate a TF-DF model on ``test_dataset`` and label the returned values.

    ``test_dataset`` must include the ``price`` label column. ``model.evaluate``
    must return the loss followed by at least four metric values, in the order
    mae, mape, r_squared, root mean squared error.
    """
    test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
        test_dataset, label='price', task=tfdf.keras.Task.REGRESSION
    )
    loss, *scores = model.evaluate(test_ds)
    report = {'loss': loss}
    for position, name in enumerate(('mae', 'mape', 'r_squared', 'root_mean_squared_error')):
        report[name] = scores[position]
    return report

In [None]:
# Random-forest grid search in two passes of 5 x 9 x 2 = 90 runs each: the
# first varies max_depth with the LOCAL growing strategy, the second varies
# max_num_nodes with BEST_FIRST_GLOBAL. The parameter that does not apply to a
# pass is passed as None.
nums_trees = [100, 300, 900, 1200, 2000] 
max_depths = [4, 8, 16, 32, 64, 128, 512, 1024, 256] # growing_strategy = LOCAL
max_nums_nodes = [4, 16, 64, 128, 256, 512, 1024, 2048, 5096] # growing_strategy = BEST_FIRST_GLOBAL
splits_axis = ['AXIS_ALIGNED', 'SPARSE_OBLIQUE'] 

# Maps "<num_trees>, <depth or node limit>, <split_axis>, <strategy>" to the
# dict returned by evaluate_rf().
results_rf = {}

for num_trees in nums_trees:
    for max_depth in max_depths:
            for split_axis in splits_axis:
                print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                model, _ = train_model_rf(train_dataset.copy(), metrics, num_trees, None, max_depth, split_axis, 'LOCAL')
                results_rf[f"{num_trees}, {max_depth}, {split_axis}, LOCAL"] = evaluate_rf(model, test_dataset.copy())
                print('\n\n\n')
            
for num_trees in nums_trees:
    for max_num_nodes in max_nums_nodes:
            for split_axis in splits_axis:
                print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                model, _ = train_model_rf(train_dataset.copy(), metrics, num_trees, max_num_nodes, None, split_axis, 'BEST_FIRST_GLOBAL')
                results_rf[f"{num_trees}, {max_num_nodes}, {split_axis}, BEST_FIRST_GLOBAL"] = evaluate_rf(model, test_dataset.copy())
                print('\n\n\n')

In [None]:
# Print every random-forest run, ordered by ascending test MAE
# (lowest-error run first).
sorted_keys_rf = sorted(results_rf, key=lambda k: results_rf[k]['mae'])
for key in sorted_keys_rf:
    print(f"Model: {key}")
    for metric, value in results_rf[key].items():
        print(f"{metric}: {value}")
    print("-----------------")

# Gradient Boosted Trees

In [None]:
import tensorflow_decision_forests as tfdf

def train_model_gbt(train_dataset, metrics, num_trees, max_num_nodes, max_depth, split_axis, growing_strategy):
    """Train a TF-DF gradient boosted trees regressor on ``train_dataset``.

    Args:
        train_dataset: DataFrame that includes the ``price`` label column.
        metrics: Metrics passed to ``compile``.
        num_trees, max_num_nodes, max_depth, split_axis, growing_strategy:
            Forwarded unchanged to ``tfdf.keras.GradientBoostedTreesModel``.

    Returns:
        ``(model, history)`` - the fitted model and the object returned by ``fit``.
    """
    regression = tfdf.keras.Task.REGRESSION
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_dataset, label='price', task=regression)

    hyperparameters = dict(
        num_trees=num_trees,
        max_depth=max_depth,
        max_num_nodes=max_num_nodes,
        split_axis=split_axis,
        growing_strategy=growing_strategy,
    )
    booster = tfdf.keras.GradientBoostedTreesModel(task=regression, **hyperparameters)
    booster.compile(metrics=metrics)

    history = booster.fit(train_ds, verbose=0)

    return booster, history


def evaluate_gbt(model, test_dataset):
    """Evaluate a TF-DF model on ``test_dataset`` and label the returned values.

    ``test_dataset`` must include the ``price`` label column. ``model.evaluate``
    must return the loss followed by at least four metric values, in the order
    mae, mape, r_squared, root mean squared error.
    """
    test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
        test_dataset, label='price', task=tfdf.keras.Task.REGRESSION
    )
    loss, *scores = model.evaluate(test_ds)
    labels = ['mae', 'mape', 'r_squared', 'root_mean_squared_error']
    return {'loss': loss, **{label: scores[i] for i, label in enumerate(labels)}}

In [None]:
# Gradient-boosted-trees grid search in two passes of 5 x 9 x 2 = 90 runs
# each: the first varies max_depth with the LOCAL growing strategy, the second
# varies max_num_nodes with BEST_FIRST_GLOBAL. The parameter that does not
# apply to a pass is passed as None.
nums_trees = [100, 300, 900, 1200, 2000] 
max_depths = [4, 8, 16, 32, 64, 128, 512, 1024, 256] # growing_strategy = LOCAL
max_nums_nodes = [4, 16, 64, 128, 256, 512, 1024, 2048, 5096] # growing_strategy = BEST_FIRST_GLOBAL
splits_axis = ['AXIS_ALIGNED', 'SPARSE_OBLIQUE'] 

# Maps "<num_trees>, <depth or node limit>, <split_axis>, <strategy>" to the
# dict returned by evaluate_gbt().
results_gbt = {}

for num_trees in nums_trees:
    for max_depth in max_depths:
            for split_axis in splits_axis:
                print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                model, _ = train_model_gbt(train_dataset.copy(), metrics, num_trees, None, max_depth, split_axis, 'LOCAL')
                results_gbt[f"{num_trees}, {max_depth}, {split_axis}, LOCAL"] = evaluate_gbt(model, test_dataset.copy())
                print('\n\n\n')
            
for num_trees in nums_trees:
    for max_num_nodes in max_nums_nodes:
            for split_axis in splits_axis:
                print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                model, _ = train_model_gbt(train_dataset.copy(), metrics, num_trees, max_num_nodes, None, split_axis, 'BEST_FIRST_GLOBAL')
                results_gbt[f"{num_trees}, {max_num_nodes}, {split_axis}, BEST_FIRST_GLOBAL"] = evaluate_gbt(model, test_dataset.copy())
                print('\n\n\n')

In [None]:
# Print every gradient-boosted-trees run, ordered by ascending r_squared, so
# the runs with the lowest R^2 come first and the highest come last.
# NOTE(review): the linear, DNN and random-forest sections rank by 'mae'
# instead - confirm whether the different sort key is intended.
sorted_keys_gbd = sorted(results_gbt, key=lambda k: results_gbt[k]['r_squared'])
for key in sorted_keys_gbd:
    print(f"Model: {key}")
    for metric, value in results_gbt[key].items():
        print(f"{metric}: {value}")
    print("-----------------")

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsRegressor


def train_model_knn(train_features, train_label, n_neighbors, weights, algorithm, metric):
    """Fit a ``KNeighborsRegressor`` on mean-imputed training features.

    Args:
        train_features: Feature frame; missing values are filled with the
            column means of this same frame. The frame itself is not modified.
        train_label: Target values aligned with ``train_features``.
        n_neighbors, weights, algorithm, metric: Forwarded unchanged to
            ``KNeighborsRegressor``.

    Returns:
        ``(knn_regressor, history)`` - the regressor and the value returned by
        its ``fit`` call.
    """
    # Impute on a new frame so the caller's DataFrame is left untouched.
    imputed_features = train_features.fillna(train_features.mean())

    knn_regressor = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
        algorithm=algorithm
    )
    history = knn_regressor.fit(imputed_features, train_label)

    return knn_regressor, history



In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_knn(model, test_features, test_label):
    """Score a fitted regressor on mean-imputed test features.

    Args:
        model: Object with a ``predict(features)`` method.
        test_features: Feature frame; missing values are filled with the
            column means of this same frame. The frame itself is not modified.
            NOTE(review): the Keras models in this notebook impute with
            training-set means (MeanImputerLayer), whereas this uses the test
            set's own means - confirm that the difference is intended.
        test_label: True target values aligned with ``test_features``.

    Returns:
        Dict with ``mae``, ``mape`` (in percent), ``r_squared`` and
        ``root_mean_squared_error``.
    """
    # Impute on a new frame so the caller's DataFrame is left untouched.
    imputed_features = test_features.fillna(test_features.mean())
    y_true = test_label

    y_pred = model.predict(imputed_features)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # Mean absolute percentage error, expressed in percent.
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    result = {
        'mae': mae,
        'mape': mape,
        'r_squared': r2,
        'root_mean_squared_error': rmse
    }
    return result


In [None]:
# KNN grid search over neighbour count, weighting, distance metric and search
# algorithm (5 x 2 x 5 x 4 = 200 combinations). A combination that raises
# during training or evaluation is reported and skipped, so result_knn only
# holds the combinations that ran to completion.
result_knn = {}

for n_neighbors in [3, 5, 10, 30, 50]:
    for weights in ['uniform', 'distance']:
        for metric in ['cityblock', 'cosine', 'euclidean', 'haversine', 'nan_euclidean']:
            for algorithm in ['auto', 'ball_tree', 'kd_tree', 'brute']:
                try:
                    model, _ = train_model_knn(train_features.copy(), train_label.copy(), n_neighbors, weights, algorithm, metric)
                    result_knn[f'n_neighbors: {n_neighbors}, weights: {weights}, metric: {metric}, algorithm: {algorithm}'] = \
                        evaluate_knn(model, test_features.copy(), test_label.copy())
                except Exception as e:
                    print('Outcome: error', e)
                    pass

In [None]:
# Print every successful KNN run, ordered by ascending r_squared, so the runs
# with the lowest R^2 come first and the highest come last.
sorted_keys_knn = sorted(result_knn, key=lambda k: result_knn[k]['r_squared'])
for key in sorted_keys_knn:
    print(f"Model: {key}")
    for metric, value in result_knn[key].items():
        print(f"{metric}: {value}")
    print("-----------------")