In [1]:
from sklearn.metrics import mean_pinball_loss
from sklearn.pipeline import Pipeline
import scripts.ml_utils as mlu
import pandas as pd
import numpy as np
import joblib
import pickle
import fs

In [2]:
# Open the interim data directory and resolve the system path of each CSV split.
# NOTE: "IMTERIM" looks like a typo of "INTERIM"; the name is kept so that any
# other cell referring to it keeps working. The *_CSV_DIR names hold file paths,
# not directories.
IMTERIM_DIR = fs.open_fs("../data/interim")
TRAIN_CSV_DIR, TEST_CSV_DIR, VALIDATION_CSV_DIR = (
    IMTERIM_DIR.getsyspath(csv_name)
    for csv_name in ("use_to_train.csv", "use_to_test.csv", "use_to_val.csv")
)

In [6]:
# Filesystem handle rooted at ../models/new; the saved model pipeline is read from here.
NEW_MODELS_DIR = fs.open_fs("../models/new")

In [3]:
# Read the train / test / validation splits from their CSV files.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_CSV_DIR, TEST_CSV_DIR, VALIDATION_CSV_DIR)
)

In [4]:
# Columns used as model inputs, split into numeric and categorical groups.
feature_names_number = [
    'horsepower',
    'displacement',
    'torque',
    'wheels',
    'km',
    'age',
]
feature_names_category = [
    'navigation_system',
    'rear_sensor',
    'push_start',
    'turbo',
    'body_type',
]

# Echo both groups, one item per line (label first, then the list).
print(
    'feature_names_number',
    feature_names_number,
    'feature_names_category',
    feature_names_category,
    sep='\n',
)

feature_names_number
['horsepower', 'displacement', 'torque', 'wheels', 'km', 'age']
feature_names_category
['navigation_system', 'rear_sensor', 'push_start', 'turbo', 'body_type']


In [5]:
# For each split: X is a copy of the selected feature columns, y is the 'price' column.
X_train, y_train = train[[*feature_names_number, *feature_names_category]].copy(), train['price']
X_test, y_test = test[[*feature_names_number, *feature_names_category]].copy(), test['price']
X_val, y_val = validation[[*feature_names_number, *feature_names_category]].copy(), validation['price']

In [11]:
# Load the saved model pipeline from the new-models directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
# NOTE: despite its _DIR suffix, CATBOOST_INTERVAL_DIR is the path of the .joblib file.
CATBOOST_INTERVAL_DIR = NEW_MODELS_DIR.getsyspath('3_catboost_bcu.joblib')
model_pipeline = joblib.load(CATBOOST_INTERVAL_DIR)

In [16]:
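# Disabled experiment, kept for reference: the cells below call
# `model_pipeline.predict` on the raw feature frames instead.
# NOTE(review): if this is ever re-enabled, the second line should assign to
# X_test (it currently overwrites X_val), and `fit_transform` would refit the
# loaded pipeline on each split -- presumably `transform` is what is wanted.
# X_train = pd.DataFrame(model_pipeline.fit_transform(X_train))
# X_val = pd.DataFrame(model_pipeline.fit_transform(X_test))
# X_val = pd.DataFrame(model_pipeline.fit_transform(X_val))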

In [23]:
# Predict on the training split and pair each true price with its prediction.
y_train_predict = model_pipeline.predict(X_train)
Stats_train = (
    y_train
    .to_frame(name="y_true")          # ground-truth column
    .assign(y_hat=y_train_predict)    # model prediction column
)
Stats_train

Unnamed: 0,y_true,y_hat
0,451999,423416.877140
1,281999,250151.957514
2,224999,231455.155491
3,171999,177460.656847
4,199999,219824.150187
...,...,...
932,319999,335163.732722
933,161999,162464.210669
934,377999,351408.596896
935,285999,266978.907198


In [25]:
# Predict on the test split and pair each true price with its prediction.
y_test_predict = model_pipeline.predict(X_test)
Stats_test = (
    y_test
    .to_frame(name="y_true")         # ground-truth column
    .assign(y_hat=y_test_predict)    # model prediction column
)
Stats_test

Unnamed: 0,y_true,y_hat
0,206999,206981.643827
1,192999,193261.758247
2,791999,644131.550521
3,418999,441067.760761
4,210999,214634.861015
...,...,...
463,593999,570587.077398
464,335999,369160.677163
465,285999,266201.872592
466,462999,457468.973702


In [26]:
# Predict on the validation split and pair each true price with its prediction.
y_val_predict = model_pipeline.predict(X_val)
Stats_val = (
    y_val
    .to_frame(name="y_true")        # ground-truth column
    .assign(y_hat=y_val_predict)    # model prediction column
)
Stats_val

Unnamed: 0,y_true,y_hat
0,172999,183895.838819
1,181999,171323.188939
2,390999,343850.792939
3,620999,709239.211448
4,229999,239513.401133
...,...,...
464,360999,326107.464669
465,125999,147843.881879
466,182999,208218.795742
467,206999,181853.033182


In [27]:
# Price-binning configuration consumed by get_df_summary below.
min_price = 100_000        # lower edge of the first bin
max_price = 1_000_000      # bins are generated up to this price
interval_range = 50_000    # width of each bin

In [28]:
def get_df_summary(min_price, max_price, interval_range):
    """Build a table of consecutive price bins.

    Parameters
    ----------
    min_price : number
        Lower edge of the first bin.
    max_price : number
        Bin lower edges are generated strictly below this value.
    interval_range : number
        Width of every bin.

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns ``'Starting'`` and ``'Ending'``.
    """
    lower_edges = np.arange(min_price, max_price, interval_range)
    # Upper edges are the lower edges shifted by one bin width.
    upper_edges = np.arange(
        min_price + interval_range,
        max_price + interval_range,
        interval_range,
    )
    starting = pd.DataFrame({'Starting': lower_edges})
    ending = pd.DataFrame({'Ending': upper_edges})
    return pd.concat([starting, ending], axis=1)

In [29]:
def get_count(row, df_values, price_column, sum_column):
    """Count the non-null ``sum_column`` entries whose price falls in the row's bin.

    The bin is half-open, ``[row['Starting'], row['Ending'])``. With both edges
    inclusive, a price lying exactly on an edge shared by two consecutive bins
    would be counted in both of them; here it is counted only in the upper bin.
    As a consequence a price equal to the very last ``Ending`` falls outside
    every bin.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'Starting'`` and ``'Ending'``.
    df_values : pandas.DataFrame
        Frame containing ``price_column`` and ``sum_column``.
    price_column : str
        Column compared against the bin edges.
    sum_column : str
        Column whose non-null values are counted.

    Returns
    -------
    int
        Number of non-null ``sum_column`` values inside the bin (0 if none).
    """
    lower = row['Starting']
    upper = row['Ending']
    prices = df_values[price_column]
    # Half-open interval: the upper edge belongs to the next bin.
    in_bin = (prices >= lower) & (prices < upper)
    # A count is already an integer, so no rounding is required.
    return int(df_values.loc[in_bin, sum_column].count())

In [30]:
def get_avg_error(row, df_values, price_column, sum_column):
    """Average ``sum_column`` over the rows whose price falls in the row's bin.

    The bin is half-open, ``[row['Starting'], row['Ending'])``. With both edges
    inclusive, a price lying exactly on an edge shared by two consecutive bins
    would contribute to both averages; here it contributes only to the upper
    bin. As a consequence a price equal to the very last ``Ending`` falls
    outside every bin.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'Starting'`` and ``'Ending'``.
    df_values : pandas.DataFrame
        Frame containing ``price_column`` and ``sum_column``.
    price_column : str
        Column compared against the bin edges.
    sum_column : str
        Column that is averaged.

    Returns
    -------
    float
        Mean of ``sum_column`` inside the bin, rounded to 2 decimals;
        NaN when the bin contains no rows.
    """
    lower = row['Starting']
    upper = row['Ending']
    prices = df_values[price_column]
    # Half-open interval: the upper edge belongs to the next bin.
    in_bin = (prices >= lower) & (prices < upper)
    return np.round(df_values.loc[in_bin, sum_column].mean(), 2)

In [65]:
def get_MAE_error(row, df_values, ytrue_column, yhat_column):
    """Mean absolute error of the predictions whose true value falls in the row's bin.

    The bin is half-open, ``[row['Starting'], row['Ending'])``. With both edges
    inclusive, a true value lying exactly on an edge shared by two consecutive
    bins would contribute to the MAE of both; here it contributes only to the
    upper bin. As a consequence a true value equal to the very last ``Ending``
    falls outside every bin.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'Starting'`` and ``'Ending'``.
    df_values : pandas.DataFrame
        Frame containing ``ytrue_column`` and ``yhat_column``.
    ytrue_column : str
        Column with the true values; also used to select the bin.
    yhat_column : str
        Column with the predicted values.

    Returns
    -------
    float
        ``mean(|y_hat - y_true|)`` over the bin, rounded to 2 decimals;
        NaN when the bin contains no rows.
    """
    lower = row['Starting']
    upper = row['Ending']
    y_true = df_values[ytrue_column]
    # Half-open interval: the upper edge belongs to the next bin.
    in_bin = (y_true >= lower) & (y_true < upper)
    selected = df_values.loc[in_bin]
    abs_errors = (selected[yhat_column] - selected[ytrue_column]).abs()
    return np.round(abs_errors.mean(), 2)

In [67]:
# MAE per price bin on the training split.
df_sum = get_df_summary(min_price, max_price, interval_range)
df_sum["MAE"] = df_sum.apply(
    get_MAE_error,
    axis=1,
    df_values=Stats_train,
    ytrue_column='y_true',
    yhat_column='y_hat',
)
df_sum

Unnamed: 0,Starting,Ending,MAE
0,100000,150000,12280.9
1,150000,200000,11295.4
2,200000,250000,14471.37
3,250000,300000,15923.55
4,300000,350000,19016.91
5,350000,400000,19627.39
6,400000,450000,19575.01
7,450000,500000,22948.06
8,500000,550000,27602.84
9,550000,600000,21962.87


In [70]:
# MAE per price bin on the test split.
df_sum = get_df_summary(min_price, max_price, interval_range)
df_sum["MAE"] = df_sum.apply(
    get_MAE_error,
    axis=1,
    df_values=Stats_test,
    ytrue_column='y_true',
    yhat_column='y_hat',
)
df_sum

Unnamed: 0,Starting,Ending,MAE
0,100000,150000,23178.85
1,150000,200000,11851.14
2,200000,250000,18932.56
3,250000,300000,26155.64
4,300000,350000,32709.03
5,350000,400000,29156.86
6,400000,450000,56976.87
7,450000,500000,58808.74
8,500000,550000,58980.25
9,550000,600000,56668.19


In [72]:
# MAE per bin expressed in thousands, rounded to 2 decimals.
# NOTE: `df_sum` is reassigned by several cells, so this reflects whichever
# summary was computed most recently.
list((df_sum["MAE"] / 1000).round(2))

[23.18,
 11.85,
 18.93,
 26.16,
 32.71,
 29.16,
 56.98,
 58.81,
 58.98,
 56.67,
 45.34,
 151.66,
 87.92,
 118.09,
 194.25,
 nan,
 186.36,
 246.3]

In [68]:
# MAE per price bin on the validation split.
df_sum = get_df_summary(min_price, max_price, interval_range)
df_sum["MAE"] = df_sum.apply(
    get_MAE_error,
    axis=1,
    df_values=Stats_val,
    ytrue_column='y_true',
    yhat_column='y_hat',
)
df_sum

Unnamed: 0,Starting,Ending,MAE
0,100000,150000,23012.8
1,150000,200000,15427.8
2,200000,250000,16663.9
3,250000,300000,25120.38
4,300000,350000,27010.65
5,350000,400000,54880.49
6,400000,450000,65724.63
7,450000,500000,61092.66
8,500000,550000,95622.19
9,550000,600000,51070.75
