In [1]:
# Basic libraries
import pandas as pd
import re
import logging

# Machine Learning
from sklearn.ensemble import RandomForestRegressor

# Project libraries
# set path to local modules and submodules
import sys, os
sys.path.append(os.path.abspath("src")) # add src folder to path
# import local modules and submodules
import data_ravers_utils.file_handler as fl

# Settings
pd.set_option('display.max_columns', None) # display all columns
import warnings
warnings.filterwarnings('ignore') # ignore warnings

# setup logging level
logging.getLogger().setLevel(logging.INFO)

# Load model(s)

In [2]:
# List files and remove '.pkl' extension
model_dir = "../models"
file_names = os.listdir(model_dir)
model_names = [re.sub(r"\.pkl$", "", f) for f in file_names if f.endswith(".pkl")]

# Load models and store in a list of dicts
models_list = []

for name in model_names:
    model_var_name = f"model_{name}"
    model = fl.read_model_pickle(name)  # load model without .pkl
    globals()[model_var_name] = model  # assign to a dynamic variable name
    models_list.append({
        "model_name": name,
        "model": model
    })

# print out all models found and loaded
for entry in models_list:
    print(f"Loaded model: {entry['model_name']}")

Loaded model: linear_regressor_best_model
Loaded model: rf_classifier_best_model
Loaded model: rf_regressor_best_model
Loaded model: rf_regressor_base_model
Loaded model: rf_classifier_base_model
Loaded model: base_linear_regression_model


In [3]:
next(m for m in models_list if m["model_name"] == "base_linear_regression_model")["RMSE"] = 7.78
next(m for m in models_list if m["model_name"] == "linear_regressor_best_model")["RMSE"] = 7.78
next(m for m in models_list if m["model_name"] == "rf_regressor_base_model")["RMSE"] = 8.22
next(m for m in models_list if m["model_name"] == "rf_regressor_best_model")["RMSE"] = 6.73
next(m for m in models_list if m["model_name"] == "rf_classifier_base_model")["Accuracy"] = 0.66
next(m for m in models_list if m["model_name"] == "rf_classifier_best_model")["RMSE"] = 0.65


In [4]:
models_list

[{'model_name': 'linear_regressor_best_model',
  'model': LinearRegression(),
  'RMSE': 7.78},
 {'model_name': 'rf_classifier_best_model',
  'model': RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=10,
                         random_state=15),
  'RMSE': 0.65},
 {'model_name': 'rf_regressor_best_model',
  'model': RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=10,
                         random_state=15),
  'RMSE': 6.73},
 {'model_name': 'rf_regressor_base_model',
  'model': RandomForestRegressor(n_estimators=10, oob_score=True, random_state=15,
                        warm_start=True),
  'RMSE': 8.22},
 {'model_name': 'rf_classifier_base_model',
  'model': RandomForestClassifier(n_estimators=10, random_state=15),
  'Accuracy': 0.66},
 {'model_name': 'base_linear_regression_model',
  'model': LinearRegression(),
  'RMSE': 7.78}]

In [5]:
models_list = [
         {'model_name': 'rf_regressor_base_model',
  'model': RandomForestRegressor(n_estimators=10, oob_score=True, random_state=15,
                        warm_start=True),
  'RMSE': 8.22}
    ]

# Prepare mock data

This is a temporary solution for POC.

TODO: replace with pipeline.

In [6]:
df_filename = 'bandcamp-sales-v3-rfr'
data = fl.read_df_pickle(df_filename)
df = data.copy()
df.head(5)

Unnamed: 0,amount_paid_usd,amount_over_usd,item_price_usd,discount_usd,artist_encoded,media_type_details_encoded,discography_size,merch_type_encoded,is_bundle,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,is_weekend,is_weekday,cc_ae,cc_af,cc_ag,cc_ai,cc_al,cc_am,cc_an,cc_ar,cc_at,cc_au,cc_aw,cc_ax,cc_az,cc_ba,cc_bb,cc_bd,cc_be,cc_bg,cc_bh,cc_bi,cc_bm,cc_bn,cc_bo,cc_br,cc_bs,cc_bw,cc_by,cc_bz,cc_c2,cc_ca,cc_cf,cc_ch,cc_ci,cc_ck,cc_cl,cc_cm,cc_cn,cc_co,cc_cr,cc_cw,cc_cy,cc_cz,cc_de,cc_dk,cc_dm,cc_do,cc_dz,cc_ec,cc_ee,cc_eg,cc_es,cc_et,cc_fi,cc_fj,cc_fo,cc_fr,cc_ga,cc_gb,cc_gd,cc_ge,cc_gf,cc_gg,cc_gh,cc_gi,cc_gl,cc_gp,cc_gr,cc_gt,cc_gu,cc_gy,cc_hk,cc_hm,cc_hn,cc_hr,cc_hu,cc_hy,cc_id,cc_ie,cc_il,cc_im,cc_in,cc_is,cc_it,cc_je,cc_jm,cc_jo,cc_jp,cc_ke,cc_kg,cc_kh,cc_kn,cc_kr,cc_kw,cc_ky,cc_kz,cc_la,cc_lb,cc_lc,cc_li,cc_lk,cc_lr,cc_ls,cc_lt,cc_lu,cc_lv,cc_ly,cc_ma,cc_mc,cc_md,cc_me,cc_mk,cc_ml,cc_mm,cc_mn,cc_mo,cc_mq,cc_mt,cc_mu,cc_mv,cc_mw,cc_mx,cc_my,cc_mz,cc_na,cc_nc,cc_nf,cc_ng,cc_ni,cc_nl,cc_no,cc_np,cc_nz,cc_om,cc_pa,cc_pe,cc_pf,cc_pg,cc_ph,cc_pk,cc_pl,cc_pr,cc_ps,cc_pt,cc_pw,cc_py,cc_qa,cc_re,cc_ro,cc_rs,cc_ru,cc_rw,cc_sa,cc_sc,cc_se,cc_sg,cc_si,cc_sk,cc_sn,cc_so,cc_sv,cc_sz,cc_tc,cc_tg,cc_th,cc_tn,cc_tr,cc_tt,cc_tw,cc_tz,cc_ua,cc_ug,cc_us,cc_uy,cc_uz,cc_vc,cc_ve,cc_vi,cc_vn,cc_vu,cc_wf,cc_xk,cc_yt,cc_za,cc_zm,cc_zw,mt_Physical media,mt_Tape,mt_Vinyl,mt_bundle,mt_digital
0,9.99,0.0,9.99,0.0,9.707326,2,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1.3,0.0,1.3,0.0,8.293014,2,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,3.9,0.0,3.9,4.440892e-16,8.931315,2,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,12.39,0.0,12.39,1.776357e-15,9.247368,3,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,1.0,0.0,3.863418,4,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [7]:
target_variable = 'amount_paid_usd'
df_X = df.drop(columns=[target_variable])
df_y = df[target_variable]

In [8]:
sample = df_X.sample(1, random_state=15)
real_value = df_y.iloc[sample.index[0]]

# Prediction simulation

In [9]:
for model in models_list:
    print(f"{model['model_name']} estimates amount paid for this item by customer as:\n")
    prediction = model['model'].predict(sample)
    print(f"$ {prediction} +/-")


rf_regressor_base_model estimates amount paid for this item by customer as:



NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.