In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import time
from sklearn.metrics import max_error,r2_score,mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from utilsFeatureSelection import filterPriceData
import pickle

In [2]:
# Read the semicolon-separated example prices and run them through the
# project's price filter (second argument 1, as in the original call)
df_test = filterPriceData(
    pd.read_csv('../../data/test-data/price-example.csv', sep=';'),
    1,
)
df_test

1 - Info True Across Multiple Platforms

2 - Info Produced by Vivino

3 - Info Produced by Vivino Users



Unnamed: 0,price_amount,vintage_year,wine_region_country_code,wine_region_seo_name,wine_winery_seo_name,wine_seo_name,wine_type_id,wine_is_natural,price_bottle_type_volume_ml,price_bottle_quantity,...,wine_statistics_ratings_count,wine_statistics_labels_count,wine_statistics_vintages_count,wine_taste_structure_acidity,wine_taste_structure_fizziness,wine_taste_structure_intensity,wine_taste_structure_sweetness,wine_taste_structure_tannin,wine_taste_structure_user_structure_count,wine_taste_structure_calculated_structure_count
0,165.50,2011.0,32,810,2397,7854,1.0,0.0,750.0,750.0,...,6129.0,38830.0,62.0,3.878774,1.063098,4.765940,1.608170,3.415815,111.0,604.0
1,165.50,2011.0,32,810,2397,7854,1.0,0.0,750.0,750.0,...,6130.0,38841.0,62.0,3.878774,1.063098,4.765940,1.608170,3.415815,111.0,604.0
2,171.94,2016.0,32,810,2397,7854,1.0,0.0,750.0,750.0,...,6129.0,38830.0,62.0,3.878774,1.110179,4.765940,1.608170,3.415815,111.0,604.0
3,171.94,2016.0,32,810,2397,7854,1.0,0.0,750.0,750.0,...,6130.0,38841.0,62.0,3.878774,1.110179,4.765940,1.608170,3.415815,111.0,604.0
4,5.21,2020.0,27,17,4272,2161,1.0,0.0,750.0,750.0,...,8035.0,50346.0,43.0,2.974334,1.085031,3.603781,2.145398,2.885380,696.0,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,275.00,2019.0,12,1018,191,9938,1.0,0.0,750.0,750.0,...,2895.0,12445.0,49.0,3.299123,1.123026,3.877869,1.776919,3.637073,146.0,172.0
177,31.90,2018.0,34,1062,1893,5561,1.0,0.0,750.0,750.0,...,238.0,1974.0,21.0,3.574042,1.150507,3.792505,2.034040,3.818561,27.4,16.0
178,11.95,2021.0,34,1062,1809,3816,1.0,0.0,750.0,750.0,...,9128.0,49305.0,42.0,4.014763,1.150507,3.103585,1.378479,2.033312,95.0,663.0
179,14.70,2020.0,34,1062,3442,1712,2.0,0.0,750.0,750.0,...,1297.0,5787.0,34.0,3.364162,3.978566,3.882908,1.880113,3.744854,80.0,112.0


In [3]:
# Pull the target column out of the frame (pop removes it in place) and keep
# its natural log, so that only feature columns are handed to the scaler
prices = np.log(df_test.pop('price_amount'))

In [4]:
# Restore the pickled scaler and scale the test features with it.
# pickle.load can execute arbitrary code: only load model files you trust.
with open('../../data/models/price/FS1/MinMaxScaler.priceFS1.pkl', 'rb') as scaler_file:
    scaler_df = pickle.load(scaler_file)

# transform() output is wrapped back into a DataFrame so the column names survive
scaled_values = scaler_df.transform(df_test)
df_test_cluster = pd.DataFrame(scaled_values, columns=df_test.columns)

In [5]:
# Load the persisted clustering model and assign a cluster to every scaled row
kmeans = joblib.load('../../data/models/price/FS1/kmeans_model.priceFS1.joblib')

# time.perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() follows the wall clock, which can be adjusted
# mid-run and may be too coarse for a call that takes a few milliseconds.
start_time = time.perf_counter()
clusters = kmeans.predict(df_test_cluster)
end_time = time.perf_counter()
time_taken_ms = (end_time - start_time) * 1000  # Convert to milliseconds
print(f'Time taken for prediction: {time_taken_ms:.2f} ms')

Time taken for prediction: 32.63 ms


In [6]:
# Attach the predicted cluster labels and put the (log) prices back on the
# test frame; `prices` is a Series, so it is aligned on the row index
df_test = df_test.assign(cluster=clusters, price_amount=prices)

In [7]:
# Lookup table: "cluster_<label>" -> class name of the regressor chosen for
# that cluster (the name is used to build the model file path)
regressor_models = dict(
    cluster_0="GradientBoostingRegressor",
    cluster_1="RandomForestRegressor",
    cluster_2="KNeighborsRegressor",
)

In [8]:
# Cluster labels changed between the research (development) phase and testing,
# so keep a translation table: labels 0 and 1 are swapped, label 2 is unchanged
mapping = dict(enumerate([1, 0, 2]))

In [9]:
# For each cluster, load its best regressor, predict the (log) price of the
# rows assigned to that cluster, and report error metrics on the original
# price scale.
# NOTE(review): both the model filename and the row filter use the mapped
# label, so `mapping` only changes the order in which clusters are processed
# (1, 0, 2). Confirm that no translation between the k-means labels and the
# labels in the stored model filenames was intended.
for i in range(0,3):
    cluster = mapping[i]

    model_name = regressor_models[f"cluster_{cluster}"]
    model = joblib.load(f'../../data/models/price/FS1/{model_name}.priceFS1-cluster{cluster}.joblib')

    df_test_aux = df_test[df_test['cluster'] == cluster].copy()
    if df_test_aux.empty:
        # predict() and the metrics below raise on zero samples; skip instead
        print(f'No test rows assigned to cluster {cluster}, skipping')
        print('######################################################')
        continue

    y_true = df_test_aux['price_amount']
    # The regressor must only see the feature columns
    df_test_aux = df_test_aux.drop(['cluster', 'price_amount'], axis=1)

    # perf_counter is monotonic and high-resolution, which suits timing a call
    # that lasts only a few milliseconds
    start_time = time.perf_counter()
    y_pred = model.predict(df_test_aux)
    end_time = time.perf_counter()
    time_taken_ms = (end_time - start_time) * 1000  # Convert to milliseconds
    # This interval covers model.predict only, so label it as prediction time
    print(f'Time taken for prediction: {time_taken_ms:.2f} ms')

    # Targets were stored as np.log(price); undo the log on both sides so the
    # metrics are expressed in price units
    y_true = np.exp(y_true)
    y_pred = np.exp(y_pred)
    df_test_aux['y_true'] = y_true
    df_test_aux['y_pred'] = y_pred
    display(df_test_aux[['y_true', 'y_pred']])

    mse = mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    print(f"MSE: {round(mse,2)}")
    print(f"RMSE: {round(np.sqrt(mse),2)}")
    print(f"MAE: {round(mean_absolute_error(y_true, y_pred),2)}")
    print(f"MAPE: {round(mean_absolute_percentage_error(y_true, y_pred),2)}")
    print(f"R2: {round(r2_score(y_true, y_pred),2)}")
    print(f"ME: {round(max_error(y_true, y_pred),2)}")
    print('######################################################')

Time taken for train: 9.38 ms


Unnamed: 0,y_true,y_pred
5,13.5,14.146636
10,26.98,30.972113
11,333.5,61.197191
12,10.95,9.723346
13,23.95,27.272732
14,19.95,16.39065
16,26.9,26.628598
20,18.33,16.550216
29,17.95,29.246107
32,12.95,12.078574


MSE: 20174.69
RMSE: 142.04
MAE: 80.22
MAPE: 0.36
R2: 0.2
ME: 345.96
######################################################
Time taken for train: 1.87 ms


Unnamed: 0,y_true,y_pred
4,5.21,5.074011
6,360.00,100.951189
7,314.00,317.408016
8,349.00,77.158601
9,5.50,8.898103
...,...,...
171,13.90,14.254353
172,112.00,74.392886
174,53.50,5.877115
175,12.50,14.806722


MSE: 12647.65
RMSE: 112.46
MAE: 67.52
MAPE: 0.47
R2: 0.52
ME: 291.17
######################################################
Time taken for train: 2.89 ms


Unnamed: 0,y_true,y_pred
0,165.5,50.0
1,165.5,50.0
2,171.94,50.0
3,171.94,50.0
27,179.95,48.4
28,377.47,280.0
43,11.5,24.0
55,21.27,23.5
59,38.57,9.9
61,350.0,380.0


MSE: 5039.59
RMSE: 70.99
MAE: 53.36
MAPE: 0.94
R2: 0.76
ME: 146.37
######################################################
