In [36]:
# Basic libraries
import pandas as pd
from itertools import combinations

# Project libraries
# set path to local modules and submodules
import sys, os
sys.path.append(os.path.abspath("src")) # add src folder to path
# import local modules and submodules
from data_ravers_utils.kaggle_loader import download_kaggle_dataset
import data_ravers_utils.file_handler as fl
import data_ravers_utils.eda_utils as eda
import data_ravers_utils.model_rf_regressor as rfr

# Settings
pd.set_option('display.max_columns', None) # display all columns
import warnings
warnings.filterwarnings('ignore') # ignore warnings


# Logging
import logging
# setup logging level
logging.getLogger().setLevel(logging.DEBUG)

# Target / Features split

Modeling strategy:
- Main Target: `amount_paid_usd` - to predict total payment.
  - Keep all rows.
  - Remove features that are directly predicting my target:
    - `overpayment`, `is_donation`.
  - Keep features:
    - `item_price`, `amount_over`, `discount`.
  - Revisit this feature selection after training model.

In [None]:
df_filename = 'bandcamp-sales-v2-encoded'
data = fl.read_df_pickle(df_filename)
df = data.copy()
df.head(5)

Unnamed: 0,amount_paid_usd,amount_over_usd,item_price_usd,overpayment_usd,discount_usd,is_donation,artist_encoded,media_type_details_encoded,discography_size,merch_type_encoded,is_bundle,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,is_weekend,is_weekday,cc_ae,cc_af,cc_ag,cc_ai,cc_al,cc_am,cc_an,cc_ar,cc_at,cc_au,cc_aw,cc_ax,cc_az,cc_ba,cc_bb,cc_bd,cc_be,cc_bg,cc_bh,cc_bi,cc_bm,cc_bn,cc_bo,cc_br,cc_bs,cc_bw,cc_by,cc_bz,cc_c2,cc_ca,cc_cf,cc_ch,cc_ci,cc_ck,cc_cl,cc_cm,cc_cn,cc_co,cc_cr,cc_cw,cc_cy,cc_cz,cc_de,cc_dk,cc_dm,cc_do,cc_dz,cc_ec,cc_ee,cc_eg,cc_es,cc_et,cc_fi,cc_fj,cc_fo,cc_fr,cc_ga,cc_gb,cc_gd,cc_ge,cc_gf,cc_gg,cc_gh,cc_gi,cc_gl,cc_gp,cc_gr,cc_gt,cc_gu,cc_gy,cc_hk,cc_hm,cc_hn,cc_hr,cc_hu,cc_hy,cc_id,cc_ie,cc_il,cc_im,cc_in,cc_is,cc_it,cc_je,cc_jm,cc_jo,cc_jp,cc_ke,cc_kg,cc_kh,cc_kn,cc_kr,cc_kw,cc_ky,cc_kz,cc_la,cc_lb,cc_lc,cc_li,cc_lk,cc_lr,cc_ls,cc_lt,cc_lu,cc_lv,cc_ly,cc_ma,cc_mc,cc_md,cc_me,cc_mk,cc_ml,cc_mm,cc_mn,cc_mo,cc_mq,cc_mt,cc_mu,cc_mv,cc_mw,cc_mx,cc_my,cc_mz,cc_na,cc_nc,cc_nf,cc_ng,cc_ni,cc_nl,cc_no,cc_np,cc_nz,cc_om,cc_pa,cc_pe,cc_pf,cc_pg,cc_ph,cc_pk,cc_pl,cc_pr,cc_ps,cc_pt,cc_pw,cc_py,cc_qa,cc_re,cc_ro,cc_rs,cc_ru,cc_rw,cc_sa,cc_sc,cc_se,cc_sg,cc_si,cc_sk,cc_sn,cc_so,cc_sv,cc_sz,cc_tc,cc_tg,cc_th,cc_tn,cc_tr,cc_tt,cc_tw,cc_tz,cc_ua,cc_ug,cc_us,cc_uy,cc_uz,cc_vc,cc_ve,cc_vi,cc_vn,cc_vu,cc_wf,cc_xk,cc_yt,cc_za,cc_zm,cc_zw,mt_Physical media,mt_Tape,mt_Vinyl,mt_bundle,mt_digital
0,9.99,0.0,9.99,2.288818e-07,0.0,1,9.707326,2,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True
1,1.3,0.0,1.3,0.0,0.0,0,8.293014,2,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True
2,3.9,0.0,3.9,0.0,4.440892e-16,0,8.931315,2,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True
3,12.39,0.0,12.39,0.0,1.776357e-15,0,9.247368,3,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,False
4,1.0,0.0,1.0,0.0,0.0,0,3.863418,4,0,2,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,True


In [18]:
features_datetime = ['hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos']
features_geo_encoded = ['cc_ae', 'cc_af', 'cc_ag', 'cc_ai', 'cc_al', 'cc_am', 'cc_an', 'cc_ar', 'cc_at', 'cc_au', 'cc_aw', 'cc_ax', 'cc_az', 'cc_ba', 'cc_bb', 'cc_bd', 'cc_be', 'cc_bg', 'cc_bh', 'cc_bi', 'cc_bm', 'cc_bn', 'cc_bo', 'cc_br', 'cc_bs', 'cc_bw', 'cc_by', 'cc_bz', 'cc_c2', 'cc_ca', 'cc_cf', 'cc_ch', 'cc_ci', 'cc_ck', 'cc_cl', 'cc_cm', 'cc_cn', 'cc_co', 'cc_cr', 'cc_cw', 'cc_cy', 'cc_cz', 'cc_de', 'cc_dk', 'cc_dm', 'cc_do', 'cc_dz', 'cc_ec', 'cc_ee', 'cc_eg', 'cc_es', 'cc_et', 'cc_fi', 'cc_fj', 'cc_fo', 'cc_fr', 'cc_ga', 'cc_gb', 'cc_gd', 'cc_ge', 'cc_gf', 'cc_gg', 'cc_gh', 'cc_gi', 'cc_gl', 'cc_gp', 'cc_gr', 'cc_gt', 'cc_gu', 'cc_gy', 'cc_hk', 'cc_hm', 'cc_hn', 'cc_hr', 'cc_hu', 'cc_hy', 'cc_id', 'cc_ie', 'cc_il', 'cc_im', 'cc_in', 'cc_is', 'cc_it', 'cc_je', 'cc_jm', 'cc_jo', 'cc_jp', 'cc_ke', 'cc_kg', 'cc_kh', 'cc_kn', 'cc_kr', 'cc_kw', 'cc_ky', 'cc_kz', 'cc_la', 'cc_lb', 'cc_lc', 'cc_li', 'cc_lk', 'cc_lr', 'cc_ls', 'cc_lt', 'cc_lu', 'cc_lv', 'cc_ly', 'cc_ma', 'cc_mc', 'cc_md', 'cc_me', 'cc_mk', 'cc_ml', 'cc_mm', 'cc_mn', 'cc_mo', 'cc_mq', 'cc_mt', 'cc_mu', 'cc_mv', 'cc_mw', 'cc_mx', 'cc_my', 'cc_mz', 'cc_na', 'cc_nc', 'cc_nf', 'cc_ng', 'cc_ni', 'cc_nl', 'cc_no', 'cc_np', 'cc_nz', 'cc_om', 'cc_pa', 'cc_pe', 'cc_pf', 'cc_pg', 'cc_ph', 'cc_pk', 'cc_pl', 'cc_pr', 'cc_ps', 'cc_pt', 'cc_pw', 'cc_py', 'cc_qa', 'cc_re', 'cc_ro', 'cc_rs', 'cc_ru', 'cc_rw', 'cc_sa', 'cc_sc', 'cc_se', 'cc_sg', 'cc_si', 'cc_sk', 'cc_sn', 'cc_so', 'cc_sv', 'cc_sz', 'cc_tc', 'cc_tg', 'cc_th', 'cc_tn', 'cc_tr', 'cc_tt', 'cc_tw', 'cc_tz', 'cc_ua', 'cc_ug', 'cc_us', 'cc_uy', 'cc_uz', 'cc_vc', 'cc_ve', 'cc_vi', 'cc_vn', 'cc_vu', 'cc_wf', 'cc_xk', 'cc_yt', 'cc_za', 'cc_zm', 'cc_zw']
features_payment = ['amount_paid_usd', 'amount_over_usd', 'item_price_usd', 'overpayment_usd', 'discount_usd', 'is_donation']
features_product_encoded = ['artist_encoded', 'discography_size', 'is_bundle', 'media_type_details_encoded', 'merch_type_encoded', 'mt_Physical media', 'mt_Tape', 'mt_Vinyl', 'mt_bundle', 'mt_digital']

In [19]:
target_variable = 'amount_paid_usd'
features_payment.remove(target_variable)

features_to_drop = ['overpayment_usd', 'is_donation']
features_payment = [col for col in features_datetime if col not in features_to_drop]
features_all = features_datetime + features_geo_encoded + features_payment + features_product_encoded

df.drop(columns=features_to_drop, inplace=True)


In [20]:
df[target_variable].describe()

count    1000000.000000
mean           8.931315
std           12.490078
min            0.200000
25%            2.000000
50%            6.000000
75%           11.630000
max         1286.260000
Name: amount_paid_usd, dtype: float64

### Save df tuned for this model

In [None]:
# save df tuned for this model
df_filename = 'bandcamp-sales-v3-rfr-X'
fl.save_df_pickle(df, df_filename)

INFO:root:Backup file is created: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/big_projects/midproject-bandcamp-insights/data/bandcamp-sales-v3-rfr-X.pkl


In [None]:
# create extra feature blends

# Define short group names for readability
short_names = {
    "features_geo_encoded": "geo",
    "features_product_encoded": "product",
    "features_payment": "payment",
    "features_datetime": "datetime"
}

# Store all blend names for feature_groups
feature_lists = list(short_names.keys())
blend_dict = {}

# Generate all blends of size 2 to 4
for r in range(2, len(feature_lists) + 1):
    for combo in combinations(feature_lists, r):
        # Create readable variable name
        blend_name = "blend_" + "_".join(short_names[var] for var in combo)
        # Create code line using actual variable names
        blend_code = " + ".join(combo)
        # Store and print
        blend_dict[blend_name] = blend_name
        print(f"{blend_name} = {blend_code}")

# Print as dictionary
print("\nfeature_groups = {")
for k, v in blend_dict.items():
    print(f'    "{k.lstrip("blend_")}": {v},')
print("}")


blend_geo_product = features_geo_encoded + features_product_encoded
blend_geo_payment = features_geo_encoded + features_payment
blend_geo_datetime = features_geo_encoded + features_datetime
blend_product_payment = features_product_encoded + features_payment
blend_product_datetime = features_product_encoded + features_datetime
blend_payment_datetime = features_payment + features_datetime
blend_geo_product_payment = features_geo_encoded + features_product_encoded + features_payment
blend_geo_product_datetime = features_geo_encoded + features_product_encoded + features_datetime
blend_geo_payment_datetime = features_geo_encoded + features_payment + features_datetime
blend_product_payment_datetime = features_product_encoded + features_payment + features_datetime
blend_geo_product_payment_datetime = features_geo_encoded + features_product_encoded + features_payment + features_datetime

feature_groups = {
    "geo_product": blend_geo_product,
    "geo_payment": blend_geo_payment,
    "geo_dat

In [22]:
blend_geo_product = features_geo_encoded + features_product_encoded
blend_geo_payment = features_geo_encoded + features_payment
blend_geo_datetime = features_geo_encoded + features_datetime
blend_product_payment = features_product_encoded + features_payment
blend_product_datetime = features_product_encoded + features_datetime
blend_payment_datetime = features_payment + features_datetime
blend_geo_product_payment = features_geo_encoded + features_product_encoded + features_payment
blend_geo_product_datetime = features_geo_encoded + features_product_encoded + features_datetime
blend_geo_payment_datetime = features_geo_encoded + features_payment + features_datetime
blend_product_payment_datetime = features_product_encoded + features_payment + features_datetime
blend_geo_product_payment_datetime = features_geo_encoded + features_product_encoded + features_payment + features_datetime

In [23]:
feature_groups = {
    "geo_product": blend_geo_product,
    "geo_payment": blend_geo_payment,
    "geo_datetime": blend_geo_datetime,
    "product_payment": blend_product_payment,
    "product_datetime": blend_product_datetime,
    "payment_datetime": blend_payment_datetime,
    "geo_product_payment": blend_geo_product_payment,
    "geo_product_datetime": blend_geo_product_datetime,
    "geo_payment_datetime": blend_geo_payment_datetime,
    "product_payment_datetime": blend_product_payment_datetime,
    "geo_product_payment_datetime": blend_geo_product_payment_datetime,
}

In [24]:
feature_groups["all"] = features_all

# Create a base model

This cell will run ~1 min:

In [25]:
base_model, dict_test_results = rfr.random_forest_regressor_control(df, features_all, target_variable, test_size=0.3, random_state=15, n_estimators=10)

for key, value in dict_test_results.items():
    print(f"{key}: {value}")

test_size: 0.3
random_state: 15
R2: 0.513371077370987
MAE: 3.1795683615445007
RMSE: 8.223994759698543
MSE: 67.6340898075491


In [35]:
# Save model to a file
save_model_pickle(model=base_model, filename='rf_regressor_base_model')

INFO:root:Model file pickle is updated: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/big_projects/midproject-bandcamp-insights/models/rf_regressor_base_model.pkl


# Hyperparameters test


### Recommended `n_estimators` values to try for Random Forest Regression

| Value      | When to Use / Why |
|------------|------------------|
| **10–50**  | Fast prototyping or small datasets. Use for quick iterations. |
| **100**    | **Default baseline**. Often good enough for many problems. |
| **200–300**| Better performance, especially for noisy or complex data. |
| **500**    | Often near the point of diminishing returns. Might help with variance reduction. |
| **1000+**  | Only if: <br>– you have **lots of data** <br>– and plenty of computation time. <br>Overkill for most cases. |

**Tips:**

- More trees = better generalization, but at a cost of slower training/prediction.
- Use `oob_score=True` to get an out-of-bag estimate without needing a separate validation set.
- Combine `n_estimators` with early stopping (e.g., using `warm_start=True`) for adaptive control.


In [11]:
# Define test n_estimators to experiment with
list_n_estimators = [10, 50]

# Define test sizes to experiment with
test_sizes = [0.1, 0.2, 0.3, 0.4]

# Define different random_state values for variability
random_states = [15, 42, 100]


In [33]:
feature_groups = {
    'all': features_all
}

This cell will run ~ :

In [34]:
logging.getLogger().setLevel(logging.DEBUG)

# Run a combo test for hyperparameters tuning
test_results_df = rfr.random_forest_regressor_combo_test(df, feature_groups, target_variable, test_sizes, random_states, list_n_estimators)
test_results_df

DEBUG:root:90.0% for training data: 900000.
DEBUG:root:10.0% for test data: 100000.
DEBUG:root:R2 = 0.5499
DEBUG:root:MAE = 3.062
DEBUG:root:RMSE = 7.3516
DEBUG:root:MSE =  54.0467
DEBUG:root:{'test_size': 0.1, 'random_state': 15, 'R2': 0.5498676219957338, 'MAE': 3.062043969173269, 'RMSE': 7.35164869708742, 'MSE': 54.04673856538716}
DEBUG:root:90.0% for training data: 900000.
DEBUG:root:10.0% for test data: 100000.
DEBUG:root:R2 = 0.5606
DEBUG:root:MAE = 2.963
DEBUG:root:RMSE = 7.2638
DEBUG:root:MSE =  52.7625
DEBUG:root:{'test_size': 0.1, 'random_state': 15, 'R2': 0.5605632534056529, 'MAE': 2.9630085179294525, 'RMSE': 7.263782051433669, 'MSE': 52.76252969072992}
DEBUG:root:90.0% for training data: 900000.
DEBUG:root:10.0% for test data: 100000.
DEBUG:root:R2 = 0.4535
DEBUG:root:MAE = 3.1448
DEBUG:root:RMSE = 9.3628
DEBUG:root:MSE =  87.6628
DEBUG:root:{'test_size': 0.1, 'random_state': 42, 'R2': 0.4535359113049807, 'MAE': 3.1448152922669577, 'RMSE': 9.362840261751069, 'MSE': 87.662777

KeyboardInterrupt: 