In [None]:
# Basic libraries
import pandas as pd
import logging

# Machine learning
from sklearn.preprocessing import LabelEncoder

# Project libraries
# set path to local modules and submodules
import sys, os
sys.path.append(os.path.abspath("src")) # add src folder to path
# import local modules and submodules
import data_ravers_utils.file_handler as fl
import data_ravers_utils.eda_utils as eda
import data_ravers_utils.encoding_categorical as enc

# Settings
pd.set_option('display.max_columns', None) # display all columns
import warnings
warnings.filterwarnings('ignore') # ignore warnings

# setup logging level
logging.getLogger().setLevel(logging.INFO)

# Restore backup dataset and feature groups

In [13]:
df_filename = 'bandcamp-sales-v1-cleaned'
data = fl.read_df_pickle(df_filename)
df = data.copy()
df.head(5)

Unnamed: 0,country_code,country,artist_name,amount_paid_usd,hour,dayofweek,month,year,weekday,weekend,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,media_type,media_type_details,discography_size,merch_type,is_bundle,amount_over_usd,item_price_usd,overpayment_usd,discount_usd,is_donation
0,gb,United Kingdom,Girl Band,9.99,22,2,9,2020,2,False,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,digital,album,0,other,0,0.0,9.99,2.288818e-07,0.0,1
1,fi,Finland,Jirah,1.3,22,2,9,2020,2,False,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,digital,album,0,other,0,0.0,1.3,0.0,0.0,0
2,fi,Finland,D-Ther,3.9,22,2,9,2020,2,False,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,digital,album,0,other,0,0.0,3.9,0.0,4.440892e-16,0
3,gb,United Kingdom,WHITE NOISE TV,12.39,22,2,9,2020,2,False,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,CD,no details,0,other,0,0.0,12.39,0.0,1.776357e-15,0
4,us,United States,LINGUA IGNOTA,1.0,22,2,9,2020,2,False,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,digital,single,0,other,0,0.0,1.0,0.0,0.0,0


In [8]:
features_all = df.columns.to_list()
print(features_all)

['country_code', 'country', 'artist_name', 'amount_paid_usd', 'hour', 'dayofweek', 'month', 'year', 'weekday', 'weekend', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos', 'media_type', 'media_type_details', 'discography_size', 'merch_type', 'is_bundle', 'amount_over_usd', 'item_price_usd', 'overpayment_usd', 'discount_usd', 'is_donation']


In [5]:
features_payment = ['amount_paid_usd', 'amount_over_usd', 'item_price_usd', 'overpayment_usd', 'discount_usd', 'is_donation']

In [6]:
features_datetime = ['hour', 'dayofweek', 'month', 'year', 'weekday', 'weekend', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos']

In [40]:
features_product = ['artist_name', 'media_type', 'media_type_details', 'discography_size', 'merch_type', 'is_bundle']

In [35]:
features_geo = ['country_code', 'country']

EDA and cleanup results summary:
- Numeric features:
  - All in `features_payment` list.
    - `is_donation` in a boolean categorical flag encoded in '0/1'.
  - All in features_datetime list.
    - **Cyclical features (sin / cos)** are more suitable for machine learning.
    - Regular date time features are more suitable for visualisation.
  - Some numeric features in `features_product` list:
    - `discography_size`
    - `is_bundle` - (1/0) flag, binary encoded boolean.
- Categoric features:
  - The most of features in `features_product` list.
  - All in `features_geo` list.


Copy of modeling strategy:
- Finalysing two target columns: 
  - Main Target: `amount_paid_usd` - to predict total payment.
  - Extra target: `is_donation` - to predict volunterly contribution.
- Changing columns:
  - `discount`: rename `Underpaymnet`, make sure 0 for no discount case.
  - `is_donation`: new column.
- For main target - only regressor model(s):
  - Keep all rows.
  - Remove features that are directly predicting my target:
    - `overpayment`, `is_donation`.
  - Keep features:
    - `item_price`, `amount_over`, `discount`.
  - Revisit this feature selection after training model.
- For extra target - only calssificator model(s):
  - New column `is_donation` with values 0 or 1.
  - What to consider as donation case true / false from `overpayment` and `amount over`?
    - Rule: `amount_over_usd` > 0 OR `overpayment` > 0.
  - Create subset for all rows.
  - Do not include features that directly set the donation flag in the train subdset:
    - `amount_over_usd`, `overpayment`, `amount_paid`.
  - Keep `item_price`, `discount`. Corellated, but not directly.
- Drop irrelevant columns:
  - foreign currency related, keep only `_usd` columns.


New extra action plan steps:
- Drop the date time homan readable columns before modeling, keep only cyclical features.
- Categorical features need encoding.

# Drop date time human-friednly features, keep cyclical

`Weekend` column has been created this way:
```python
df["weekend"] = df["utc_date"].dt.weekday >= 5
```

That makes it boolean categorical. It has no cyclical analogues. Need to have [0/1] encoding pair and to be droped.
Similar flag should be set to mark weekdays. 

In [18]:
df["is_weekend"] = df["weekend"]*1
df["is_weekday"] = (df["weekday"] <= 5) * 1
features_datetime.append("is_weekend")
features_datetime.append("is_weekday")

In [44]:
# TODO: move to eda_utils
features_to_drop = ['hour', 'dayofweek', 'month', 'year', 'weekday', 'weekend']
features_datetime = [col for col in features_datetime if col not in features_to_drop]
df.drop(columns=features_to_drop, inplace=True)

KeyError: "['hour', 'dayofweek', 'month', 'year', 'weekday', 'weekend'] not found in axis"

In [21]:
# Reorder columns
ordered_columns = features_payment + features_product + features_geo + features_datetime
df = df[ordered_columns]

In [22]:
df.head()

Unnamed: 0,amount_paid_usd,amount_over_usd,item_price_usd,overpayment_usd,discount_usd,is_donation,artist_name,media_type,media_type_details,discography_size,merch_type,is_bundle,country_code,country,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,is_weekend,is_weekday
0,9.99,0.0,9.99,2.288818e-07,0.0,1,Girl Band,digital,album,0,other,0,gb,United Kingdom,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
1,1.3,0.0,1.3,0.0,0.0,0,Jirah,digital,album,0,other,0,fi,Finland,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
2,3.9,0.0,3.9,0.0,4.440892e-16,0,D-Ther,digital,album,0,other,0,fi,Finland,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
3,12.39,0.0,12.39,0.0,1.776357e-15,0,WHITE NOISE TV,CD,no details,0,other,0,gb,United Kingdom,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
4,1.0,0.0,1.0,0.0,0.0,0,LINGUA IGNOTA,digital,single,0,other,0,us,United States,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1


### BACKUP CHECKPOINT

In [23]:
# save backup
df_filename = 'bandcamp-sales-v11-encoding'
fl.save_df_pickle(df, df_filename)

INFO:root:Backup file is created: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/big_projects/midproject-bandcamp-insights/data/bandcamp-sales-v11-encoding.pkl


In [45]:
# restore backup
df_filename = 'bandcamp-sales-v11-encoding'
data = fl.read_df_pickle(df_filename)
df = data.copy()
df.head(5)

Unnamed: 0,amount_paid_usd,amount_over_usd,item_price_usd,overpayment_usd,discount_usd,is_donation,artist_name,media_type,media_type_details,discography_size,merch_type,is_bundle,country_code,country,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,is_weekend,is_weekday
0,9.99,0.0,9.99,2.288818e-07,0.0,1,Girl Band,digital,album,0,other,0,gb,United Kingdom,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
1,1.3,0.0,1.3,0.0,0.0,0,Jirah,digital,album,0,other,0,fi,Finland,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
2,3.9,0.0,3.9,0.0,4.440892e-16,0,D-Ther,digital,album,0,other,0,fi,Finland,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
3,12.39,0.0,12.39,0.0,1.776357e-15,0,WHITE NOISE TV,CD,no details,0,other,0,gb,United Kingdom,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1
4,1.0,0.0,1.0,0.0,0.0,0,LINGUA IGNOTA,digital,single,0,other,0,us,United States,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1


# Encoding geo information

- Goal is to avoid high cardinality and duplicated information in the features.
- One-hot ecndoing will be used for RandomForest regressor.
- If XGBoost model will be trained later, it is recommended to use label encoding for it instead.

In [46]:
top_countries = df.groupby(features_geo).size().reset_index(name='count')
top_countries = top_countries.sort_values(by='count', ascending=False).head(10)
print(top_countries)

    country_code         country   count
172           us   United States  398999
58            gb  United Kingdom  149124
43            de         Germany   86589
10            au       Australia   55761
30            ca          Canada   42451
56            fr          France   39250
87            jp           Japan   32544
129           nl     Netherlands   17106
32            ch     Switzerland   16214
51            es           Spain   11784


In [47]:
# One-hot encode country_code
# drop_first=True avoids dummy variable trap in linear models.
df = pd.get_dummies(df, columns=["country_code"], prefix="cc", drop_first=True)


In [48]:
print(df.columns.to_list())

['amount_paid_usd', 'amount_over_usd', 'item_price_usd', 'overpayment_usd', 'discount_usd', 'is_donation', 'artist_name', 'media_type', 'media_type_details', 'discography_size', 'merch_type', 'is_bundle', 'country', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_weekday', 'cc_ae', 'cc_af', 'cc_ag', 'cc_ai', 'cc_al', 'cc_am', 'cc_an', 'cc_ar', 'cc_at', 'cc_au', 'cc_aw', 'cc_ax', 'cc_az', 'cc_ba', 'cc_bb', 'cc_bd', 'cc_be', 'cc_bg', 'cc_bh', 'cc_bi', 'cc_bm', 'cc_bn', 'cc_bo', 'cc_br', 'cc_bs', 'cc_bw', 'cc_by', 'cc_bz', 'cc_c2', 'cc_ca', 'cc_cf', 'cc_ch', 'cc_ci', 'cc_ck', 'cc_cl', 'cc_cm', 'cc_cn', 'cc_co', 'cc_cr', 'cc_cw', 'cc_cy', 'cc_cz', 'cc_de', 'cc_dk', 'cc_dm', 'cc_do', 'cc_dz', 'cc_ec', 'cc_ee', 'cc_eg', 'cc_es', 'cc_et', 'cc_fi', 'cc_fj', 'cc_fo', 'cc_fr', 'cc_ga', 'cc_gb', 'cc_gd', 'cc_ge', 'cc_gf', 'cc_gg', 'cc_gh', 'cc_gi', 'cc_gl', 'cc_gp', 'cc_gr', 'cc_gt', 'cc_gu', 'cc_gy', 'cc_hk', 'cc_hm', 'cc_hn', 'cc_hr', 'cc_hu', '

In [49]:
# 'country_code' is already droped when one-hot encoding
df.drop(columns=['country'], inplace=True)

In [8]:
features_geo_encoded = ['cc_ae', 'cc_af', 'cc_ag', 'cc_ai', 'cc_al', 'cc_am', 'cc_an', 'cc_ar', 'cc_at', 'cc_au', 'cc_aw', 'cc_ax', 'cc_az', 'cc_ba', 'cc_bb', 'cc_bd', 'cc_be', 'cc_bg', 'cc_bh', 'cc_bi', 'cc_bm', 'cc_bn', 'cc_bo', 'cc_br', 'cc_bs', 'cc_bw', 'cc_by', 'cc_bz', 'cc_c2', 'cc_ca', 'cc_cf', 'cc_ch', 'cc_ci', 'cc_ck', 'cc_cl', 'cc_cm', 'cc_cn', 'cc_co', 'cc_cr', 'cc_cw', 'cc_cy', 'cc_cz', 'cc_de', 'cc_dk', 'cc_dm', 'cc_do', 'cc_dz', 'cc_ec', 'cc_ee', 'cc_eg', 'cc_es', 'cc_et', 'cc_fi', 'cc_fj', 'cc_fo', 'cc_fr', 'cc_ga', 'cc_gb', 'cc_gd', 'cc_ge', 'cc_gf', 'cc_gg', 'cc_gh', 'cc_gi', 'cc_gl', 'cc_gp', 'cc_gr', 'cc_gt', 'cc_gu', 'cc_gy', 'cc_hk', 'cc_hm', 'cc_hn', 'cc_hr', 'cc_hu', 'cc_hy', 'cc_id', 'cc_ie', 'cc_il', 'cc_im', 'cc_in', 'cc_is', 'cc_it', 'cc_je', 'cc_jm', 'cc_jo', 'cc_jp', 'cc_ke', 'cc_kg', 'cc_kh', 'cc_kn', 'cc_kr', 'cc_kw', 'cc_ky', 'cc_kz', 'cc_la', 'cc_lb', 'cc_lc', 'cc_li', 'cc_lk', 'cc_lr', 'cc_ls', 'cc_lt', 'cc_lu', 'cc_lv', 'cc_ly', 'cc_ma', 'cc_mc', 'cc_md', 'cc_me', 'cc_mk', 'cc_ml', 'cc_mm', 'cc_mn', 'cc_mo', 'cc_mq', 'cc_mt', 'cc_mu', 'cc_mv', 'cc_mw', 'cc_mx', 'cc_my', 'cc_mz', 'cc_na', 'cc_nc', 'cc_nf', 'cc_ng', 'cc_ni', 'cc_nl', 'cc_no', 'cc_np', 'cc_nz', 'cc_om', 'cc_pa', 'cc_pe', 'cc_pf', 'cc_pg', 'cc_ph', 'cc_pk', 'cc_pl', 'cc_pr', 'cc_ps', 'cc_pt', 'cc_pw', 'cc_py', 'cc_qa', 'cc_re', 'cc_ro', 'cc_rs', 'cc_ru', 'cc_rw', 'cc_sa', 'cc_sc', 'cc_se', 'cc_sg', 'cc_si', 'cc_sk', 'cc_sn', 'cc_so', 'cc_sv', 'cc_sz', 'cc_tc', 'cc_tg', 'cc_th', 'cc_tn', 'cc_tr', 'cc_tt', 'cc_tw', 'cc_tz', 'cc_ua', 'cc_ug', 'cc_us', 'cc_uy', 'cc_uz', 'cc_vc', 'cc_ve', 'cc_vi', 'cc_vn', 'cc_vu', 'cc_wf', 'cc_xk', 'cc_yt', 'cc_za', 'cc_zm', 'cc_zw']

In [54]:
df[features_geo_encoded] = df[features_geo_encoded] * 1

### BACKUP CHECKPOINT

In [52]:
# save backup
df_filename = 'bandcamp-sales-v12-encoding'
fl.save_df_pickle(df, df_filename)

INFO:root:Backup file is created: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/big_projects/midproject-bandcamp-insights/data/bandcamp-sales-v12-encoding.pkl


In [19]:
# restore backup
df_filename = 'bandcamp-sales-v12-encoding'
data = fl.read_df_pickle(df_filename)
df = data.copy()
df.head(5)

Unnamed: 0,amount_paid_usd,amount_over_usd,item_price_usd,overpayment_usd,discount_usd,is_donation,artist_name,media_type,media_type_details,discography_size,merch_type,is_bundle,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,is_weekend,is_weekday,cc_ae,cc_af,cc_ag,cc_ai,cc_al,cc_am,cc_an,cc_ar,cc_at,cc_au,cc_aw,cc_ax,cc_az,cc_ba,cc_bb,cc_bd,cc_be,cc_bg,cc_bh,cc_bi,cc_bm,cc_bn,cc_bo,cc_br,cc_bs,cc_bw,cc_by,cc_bz,cc_c2,cc_ca,cc_cf,cc_ch,cc_ci,cc_ck,cc_cl,cc_cm,cc_cn,cc_co,cc_cr,cc_cw,cc_cy,cc_cz,cc_de,cc_dk,cc_dm,cc_do,cc_dz,cc_ec,cc_ee,cc_eg,cc_es,cc_et,cc_fi,cc_fj,cc_fo,cc_fr,cc_ga,cc_gb,cc_gd,cc_ge,cc_gf,cc_gg,cc_gh,cc_gi,cc_gl,cc_gp,cc_gr,cc_gt,cc_gu,cc_gy,cc_hk,cc_hm,cc_hn,cc_hr,cc_hu,cc_hy,cc_id,cc_ie,cc_il,cc_im,cc_in,cc_is,cc_it,cc_je,cc_jm,cc_jo,cc_jp,cc_ke,cc_kg,cc_kh,cc_kn,cc_kr,cc_kw,cc_ky,cc_kz,cc_la,cc_lb,cc_lc,cc_li,cc_lk,cc_lr,cc_ls,cc_lt,cc_lu,cc_lv,cc_ly,cc_ma,cc_mc,cc_md,cc_me,cc_mk,cc_ml,cc_mm,cc_mn,cc_mo,cc_mq,cc_mt,cc_mu,cc_mv,cc_mw,cc_mx,cc_my,cc_mz,cc_na,cc_nc,cc_nf,cc_ng,cc_ni,cc_nl,cc_no,cc_np,cc_nz,cc_om,cc_pa,cc_pe,cc_pf,cc_pg,cc_ph,cc_pk,cc_pl,cc_pr,cc_ps,cc_pt,cc_pw,cc_py,cc_qa,cc_re,cc_ro,cc_rs,cc_ru,cc_rw,cc_sa,cc_sc,cc_se,cc_sg,cc_si,cc_sk,cc_sn,cc_so,cc_sv,cc_sz,cc_tc,cc_tg,cc_th,cc_tn,cc_tr,cc_tt,cc_tw,cc_tz,cc_ua,cc_ug,cc_us,cc_uy,cc_uz,cc_vc,cc_ve,cc_vi,cc_vn,cc_vu,cc_wf,cc_xk,cc_yt,cc_za,cc_zm,cc_zw
0,9.99,0.0,9.99,2.288818e-07,0.0,1,Girl Band,digital,album,0,other,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.3,0.0,1.3,0.0,0.0,0,Jirah,digital,album,0,other,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3.9,0.0,3.9,0.0,4.440892e-16,0,D-Ther,digital,album,0,other,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,12.39,0.0,12.39,0.0,1.776357e-15,0,WHITE NOISE TV,CD,no details,0,other,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,1.0,0.0,0.0,0,LINGUA IGNOTA,digital,single,0,other,0,-0.5,0.866025,0.974928,-0.222521,-1.0,-1.83697e-16,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


# Encoding product features

In [10]:
print(features_product)

['artist_name', 'media_type', 'media_type_details', 'discography_size', 'merch_type', 'is_bundle']


In [24]:
from IPython.utils.capture import capture_output

# this code will be captured in memory and not displayed
with capture_output() as output:
    for col in features_product:
        frequency_table = eda.get_frequency_table(df, col)
        print(frequency_table)
        print()

In [25]:
print(output.stdout)

report_path = f'{fl.PROJECT_ROOT}/docs/features_product_frequency.md'

with open(report_path, "w") as f:
    f.write(output.stdout)

print(f"Output has been saved to file:\n{report_path}")


                                              artist_name  count
0                                         Various Artists  10931
1                                       Barbara Marciniak   4628
2                                                   SAULT   4526
3       Various Artists Working to Avert the Collapse ...   3479
4                        King Gizzard & The Lizard Wizard   2806
...                                                   ...    ...
159742                                  Yider x Jason Hou      1
159743                                           Hundreds      1
159744                                           OBLIVION      1
159745                                         Roman Ring      1
159746                   Shane McGowan and David Munnelly      1

[159747 rows x 2 columns]

       media_type   count
0         digital  764183
1           Vinyl   99367
2              CD   46480
3          bundle   42843
4  Physical media   24649
5            Tape   22478

         

## artist_name

- (159,747 unique values)
- Problem: Extremely high cardinality.
- Impact: One-hot encoding = explosion in feature space; Label encoding = arbitrary numerical order.
- Solution:
  - Target Encoding (aka Mean Encoding): Replace each artist with the mean of the target variable for that artist. Works well for tree-based models like Random Forest.
  - To reduce variance for rare categories, you can add Bayesian smoothing.

In [20]:
df['artist_encoded'] = enc.target_encode_cv_smooth(df, cat_col='artist_name', target_col='amount_paid_usd', k=10, n_splits=5, random_state=87)

In [21]:
df['artist_encoded'].sample(5)

16344     6.621543
876524    9.208797
888141    2.287762
775950    7.884721
122129    6.287543
Name: artist_encoded, dtype: float64

In [24]:
# Reorder columns to place 'artist_encoded' right after 'artist_name'
columns = df.columns.tolist()
artist_index = columns.index('artist_name')
columns.insert(artist_index + 1, columns.pop(columns.index('artist_encoded')))
df = df[columns]

## media_type
- (6 categories)
- digital, Vinyl, CD, bundle, Physical media, Tape
- Solution:
  - One-hot encoding is perfect here. Few values, all important.

In [None]:
df = pd.get_dummies(df, columns=["media_type"], prefix="mt", drop_first=True)

In [46]:
features_product.remove('media_type')

In [25]:
print(df.columns.to_list())

['amount_paid_usd', 'amount_over_usd', 'item_price_usd', 'overpayment_usd', 'discount_usd', 'is_donation', 'artist_name', 'artist_encoded', 'media_type_details', 'discography_size', 'merch_type', 'is_bundle', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_weekday', 'cc_ae', 'cc_af', 'cc_ag', 'cc_ai', 'cc_al', 'cc_am', 'cc_an', 'cc_ar', 'cc_at', 'cc_au', 'cc_aw', 'cc_ax', 'cc_az', 'cc_ba', 'cc_bb', 'cc_bd', 'cc_be', 'cc_bg', 'cc_bh', 'cc_bi', 'cc_bm', 'cc_bn', 'cc_bo', 'cc_br', 'cc_bs', 'cc_bw', 'cc_by', 'cc_bz', 'cc_c2', 'cc_ca', 'cc_cf', 'cc_ch', 'cc_ci', 'cc_ck', 'cc_cl', 'cc_cm', 'cc_cn', 'cc_co', 'cc_cr', 'cc_cw', 'cc_cy', 'cc_cz', 'cc_de', 'cc_dk', 'cc_dm', 'cc_do', 'cc_dz', 'cc_ec', 'cc_ee', 'cc_eg', 'cc_es', 'cc_et', 'cc_fi', 'cc_fj', 'cc_fo', 'cc_fr', 'cc_ga', 'cc_gb', 'cc_gd', 'cc_ge', 'cc_gf', 'cc_gg', 'cc_gh', 'cc_gi', 'cc_gl', 'cc_gp', 'cc_gr', 'cc_gt', 'cc_gu', 'cc_gy', 'cc_hk', 'cc_hm', 'cc_hn', 'cc_hr', 'cc_hu', 'cc_hy',

In [None]:
features_media_type_encoded = ['mt_Physical media', 'mt_Tape', 'mt_Vinyl', 'mt_bundle', 'mt_digital']

In [None]:
df[features_media_type_encoded] = df[features_media_type_encoded] * 1
df[features_media_type_encoded]

Unnamed: 0,mt_Physical media,mt_Tape,mt_Vinyl,mt_bundle,mt_digital
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,0
4,0,0,0,0,1
...,...,...,...,...,...
999995,0,0,1,0,0
999996,0,0,0,0,1
999997,0,0,0,0,1
999998,0,0,0,1,0


## media_type_details 
- (221 categories)
- This is messy due to inconsistent labeling (LP, lp, 12", 12, etc.)
- Decision:
  - Normalize similar values before encoding (e.g., lowercase, remove punctuation).
  - Group rare values into “other” or “misc”.
  - Then label encode or use frequency encoding.

In [29]:
frequent_list = df["media_type_details"].value_counts().head(13).index.to_list()

df["media_type_details_cleaned"] = df["media_type_details"].str.lower().str.replace(r'\W+', '', regex=True)
df["media_type_details_cleaned"] = df["media_type_details_cleaned"].apply(lambda x: x if x in frequent_list else "other")
le = LabelEncoder()
df["media_type_details_encoded"] = le.fit_transform(df["media_type_details_cleaned"])

In [31]:
df.drop(columns=["media_type_details_cleaned"], inplace=True)

In [38]:
# TODO: move to eda_utils
# Reorder columns to place 'media_type_details_encoded' right after 'media_type_details'
columns = df.columns.tolist()
artist_index = columns.index('media_type_details')
columns.insert(artist_index + 1, columns.pop(columns.index('media_type_details_encoded')))
df = df[columns]

In [32]:
eda.get_frequency_table(df, "media_type_details_encoded")

Unnamed: 0,media_type_details_encoded,count
0,2,481584
1,4,271015
2,3,149783
3,6,33264
4,0,31726
5,5,24649
6,1,7979


## merch_type
- (17 values)
- Mostly other, with ~20 interpretable values
- Solution:
  - Label encode.
  - Group rare ones.

In [34]:
# Get top 6 most frequent actual types (excluding 'other')
frequent_list = df["merch_type"].value_counts().iloc[1:7].index.to_list()  # exclude the first, which is already 'other'

# Replace rare types with 'other'
df["merch_type_encoded"] = df["merch_type"].apply(lambda x: x if x in frequent_list else "other")

# Label encode the result
le = LabelEncoder()
df["merch_type_encoded"] = le.fit_transform(df["merch_type_encoded"])


In [39]:
# sort columns to pair encoded column with initial
columns = df.columns.tolist()
artist_index = columns.index('merch_type')
columns.insert(artist_index + 1, columns.pop(columns.index('merch_type_encoded')))
df = df[columns]

In [36]:
eda.get_frequency_table(df, "merch_type_encoded")

Unnamed: 0,merch_type_encoded,count
0,2,968744
1,5,22519
2,6,2075
3,0,1843
4,4,1788
5,1,1566
6,3,1465


## Cleanup

In [42]:
print(features_product)
print(df.columns.to_list())

['artist_name', 'media_type', 'media_type_details', 'discography_size', 'merch_type', 'is_bundle']
['amount_paid_usd', 'amount_over_usd', 'item_price_usd', 'overpayment_usd', 'discount_usd', 'is_donation', 'artist_name', 'artist_encoded', 'media_type_details', 'media_type_details_encoded', 'discography_size', 'merch_type', 'merch_type_encoded', 'is_bundle', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_weekday', 'cc_ae', 'cc_af', 'cc_ag', 'cc_ai', 'cc_al', 'cc_am', 'cc_an', 'cc_ar', 'cc_at', 'cc_au', 'cc_aw', 'cc_ax', 'cc_az', 'cc_ba', 'cc_bb', 'cc_bd', 'cc_be', 'cc_bg', 'cc_bh', 'cc_bi', 'cc_bm', 'cc_bn', 'cc_bo', 'cc_br', 'cc_bs', 'cc_bw', 'cc_by', 'cc_bz', 'cc_c2', 'cc_ca', 'cc_cf', 'cc_ch', 'cc_ci', 'cc_ck', 'cc_cl', 'cc_cm', 'cc_cn', 'cc_co', 'cc_cr', 'cc_cw', 'cc_cy', 'cc_cz', 'cc_de', 'cc_dk', 'cc_dm', 'cc_do', 'cc_dz', 'cc_ec', 'cc_ee', 'cc_eg', 'cc_es', 'cc_et', 'cc_fi', 'cc_fj', 'cc_fo', 'cc_fr', 'cc_ga', 'cc_gb', 'cc_gd', '

In [43]:
features_product_encoded = ['artist_encoded', 'discography_size', 'is_bundle', 'media_type_details_encoded', 'merch_type_encoded', 'mt_Physical media', 'mt_Tape', 'mt_Vinyl', 'mt_bundle', 'mt_digital']

In [47]:
features_to_drop = [col for col in features_product if col not in features_product_encoded]
df.drop(columns=features_to_drop, inplace=True)

In [48]:
print(features_product_encoded)
print(df.columns.to_list())

['artist_encoded', 'discography_size', 'is_bundle', 'media_type_details_encoded', 'merch_type_encoded', 'mt_Physical media', 'mt_Tape', 'mt_Vinyl', 'mt_bundle', 'mt_digital']
['amount_paid_usd', 'amount_over_usd', 'item_price_usd', 'overpayment_usd', 'discount_usd', 'is_donation', 'artist_encoded', 'media_type_details_encoded', 'discography_size', 'merch_type_encoded', 'is_bundle', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_weekday', 'cc_ae', 'cc_af', 'cc_ag', 'cc_ai', 'cc_al', 'cc_am', 'cc_an', 'cc_ar', 'cc_at', 'cc_au', 'cc_aw', 'cc_ax', 'cc_az', 'cc_ba', 'cc_bb', 'cc_bd', 'cc_be', 'cc_bg', 'cc_bh', 'cc_bi', 'cc_bm', 'cc_bn', 'cc_bo', 'cc_br', 'cc_bs', 'cc_bw', 'cc_by', 'cc_bz', 'cc_c2', 'cc_ca', 'cc_cf', 'cc_ch', 'cc_ci', 'cc_ck', 'cc_cl', 'cc_cm', 'cc_cn', 'cc_co', 'cc_cr', 'cc_cw', 'cc_cy', 'cc_cz', 'cc_de', 'cc_dk', 'cc_dm', 'cc_do', 'cc_dz', 'cc_ec', 'cc_ee', 'cc_eg', 'cc_es', 'cc_et', 'cc_fi', 'cc_fj', 'cc_fo', 'cc_fr', 'cc

# Final backup

Run the cell bellow and copy output as code in the next notebook:

In [51]:
# TODO: store as disctionary in json and read from it, make a method in fl or eda_utils
print(f"features_datetime = {features_datetime}")
print(f"features_geo_encoded = {features_geo_encoded}")
print(f"features_payment = {features_payment}")
print(f"features_product_encoded = {features_product_encoded}")

features_datetime = ['hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos']
features_geo_encoded = ['cc_ae', 'cc_af', 'cc_ag', 'cc_ai', 'cc_al', 'cc_am', 'cc_an', 'cc_ar', 'cc_at', 'cc_au', 'cc_aw', 'cc_ax', 'cc_az', 'cc_ba', 'cc_bb', 'cc_bd', 'cc_be', 'cc_bg', 'cc_bh', 'cc_bi', 'cc_bm', 'cc_bn', 'cc_bo', 'cc_br', 'cc_bs', 'cc_bw', 'cc_by', 'cc_bz', 'cc_c2', 'cc_ca', 'cc_cf', 'cc_ch', 'cc_ci', 'cc_ck', 'cc_cl', 'cc_cm', 'cc_cn', 'cc_co', 'cc_cr', 'cc_cw', 'cc_cy', 'cc_cz', 'cc_de', 'cc_dk', 'cc_dm', 'cc_do', 'cc_dz', 'cc_ec', 'cc_ee', 'cc_eg', 'cc_es', 'cc_et', 'cc_fi', 'cc_fj', 'cc_fo', 'cc_fr', 'cc_ga', 'cc_gb', 'cc_gd', 'cc_ge', 'cc_gf', 'cc_gg', 'cc_gh', 'cc_gi', 'cc_gl', 'cc_gp', 'cc_gr', 'cc_gt', 'cc_gu', 'cc_gy', 'cc_hk', 'cc_hm', 'cc_hn', 'cc_hr', 'cc_hu', 'cc_hy', 'cc_id', 'cc_ie', 'cc_il', 'cc_im', 'cc_in', 'cc_is', 'cc_it', 'cc_je', 'cc_jm', 'cc_jo', 'cc_jp', 'cc_ke', 'cc_kg', 'cc_kh', 'cc_kn', 'cc_kr', 'cc_kw', 'cc_ky', 'cc_kz', 'cc_la', 'cc_lb', 'c

In [49]:
# save backup
df_filename = 'bandcamp-sales-v2-encoded'
fl.save_df_pickle(df, df_filename)

INFO:root:Backup file is created: /Users/bubblegum_doubledrops/Library/Mobile Documents/com~apple~CloudDocs/0prio - Important heavy backups/IronHack/big_projects/midproject-bandcamp-insights/data/bandcamp-sales-v2-encoded.pkl
