In [1]:
# Basic libraries
import pandas as pd

# Project libraries
# set path to local modules and submodules
import sys, os
sys.path.append(os.path.abspath("src")) # add src folder to path
# import local modules and submodules
from data_ravers_utils.kaggle_loader import download_kaggle_dataset
from data_ravers_utils.file_handler import *
import data_ravers_utils.eda_utils as eda
import data_ravers_utils.model_linear_regressor as lr

# Settings
pd.set_option('display.max_columns', None) # display all columns
import warnings
warnings.filterwarnings('ignore') # ignore warnings


# Logging
import logging
# setup logging level
logging.getLogger().setLevel(logging.DEBUG)

# Prepare raw data for training base model

In [2]:
df_filename = 'bandcamp-sales-v0-raw'
data = read_df_pickle(df_filename)
df = data.copy()
df.head(5)

Unnamed: 0,_id,art_url,item_type,utc_date,country_code,track_album_slug_text,country,slug_type,amount_paid_fmt,item_price,item_description,art_id,url,amount_paid,releases,artist_name,currency,album_title,amount_paid_usd,package_image_id,amount_over_fmt,item_slug,addl_count
0,1599688803.5175&//girlbanddublin.bandcamp.com/...,https://f4.bcbits.com/img/a0206405257_7.jpg,a,1599689000.0,gb,,United Kingdom,a,$9.99,9.99,Live at Vicar Street,206405300.0,//girlbanddublin.bandcamp.com/album/live-at-vi...,9.99,,Girl Band,USD,,9.99,,,,
1,1599688805.27838&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a2984241552_7.jpg,a,1599689000.0,fi,,Finland,a,£1,1.0,Neurogen,2984242000.0,//maharettarecords.bandcamp.com/album/neurogen,1.0,,Jirah,GBP,,1.3,,,,
2,1599688805.90646&//maharettarecords.bandcamp.c...,https://f4.bcbits.com/img/a3320494770_7.jpg,a,1599689000.0,fi,,Finland,a,£3,3.0,The Last Snare Bender,3320495000.0,//maharettarecords.bandcamp.com/album/the-last...,3.0,,D-Ther,GBP,,3.9,,,,
3,1599688806.94234&//alicesitski.bandcamp.com/al...,https://f4.bcbits.com/img/0020476345_37.jpg,p,1599689000.0,gb,,United Kingdom,a,€10.50,10.5,Limited Edition Compact Disc,,//alicesitski.bandcamp.com/album/white-noise-tv,10.5,,WHITE NOISE TV,EUR,WHITE NOISE TV,12.39,20476345.0,,,
4,1599688809.07942&//linguaignota.bandcamp.com/t...,https://f4.bcbits.com/img/a3428873396_7.jpg,t,1599689000.0,us,,United States,t,$1,1.0,O Ruthless Great Divine Director,3428873000.0,//linguaignota.bandcamp.com/track/o-ruthless-g...,1.0,,LINGUA IGNOTA,USD,,1.0,,,,


In [3]:
target_variable_main = 'amount_paid_usd'

In [4]:
df[target_variable_main].describe()

count    1000000.000000
mean           8.931315
std           12.490078
min            0.200000
25%            2.000000
50%            6.000000
75%           11.630000
max         1286.260000
Name: amount_paid_usd, dtype: float64

In [5]:
target_variable_extra = "amount_over_fmt"

# Can I use the data for prediction?

df["amount_over_fmt"].apply(type).value_counts()

amount_over_fmt
<class 'float'>    880867
<class 'str'>      119133
Name: count, dtype: int64

In [6]:
# getting all numerical columns
numeric_columns = eda.get_numerical_columns(df)
numeric_columns

['utc_date',
 'item_price',
 'art_id',
 'amount_paid',
 'releases',
 'amount_paid_usd',
 'package_image_id',
 'addl_count']

In [7]:
# Fill NaN values with 0
df.fillna(0, inplace=True)

# Heads-on approach to hyperparameters testing

## For the main target

In [9]:
logging.getLogger().setLevel(logging.INFO)

# Define different subsets of features
feature_subsets = {
    "numeric_features_raw": numeric_columns
}

# Define test sizes to experiment with
test_sizes = [0.1, 0.2, 0.3, 0.4]

# Define different random_state values for variability
random_states = [15, 42, 100]

test_results_df = lr.linear_regression_combo_test(df, feature_subsets, target_variable_main, test_sizes, random_states)

test_results_df

Unnamed: 0,test_size,random_state,R2,MAE,RMSE,MSE
LR_numeric_features_raw_ts0.1_rs15,0.1,15.0,1.0,0.0,0.0001,0.0
LR_numeric_features_raw_ts0.1_rs42,0.1,42.0,1.0,0.0,0.0001,0.0
LR_numeric_features_raw_ts0.1_rs100,0.1,100.0,1.0,0.0,0.0001,0.0
LR_numeric_features_raw_ts0.2_rs15,0.2,15.0,1.0,0.0,0.0,0.0
LR_numeric_features_raw_ts0.2_rs42,0.2,42.0,1.0,0.0,0.0,0.0
LR_numeric_features_raw_ts0.2_rs100,0.2,100.0,1.0,0.0,0.0,0.0
LR_numeric_features_raw_ts0.3_rs15,0.3,15.0,1.0,0.0,0.0,0.0
LR_numeric_features_raw_ts0.3_rs42,0.3,42.0,1.0,0.0,0.0,0.0
LR_numeric_features_raw_ts0.3_rs100,0.3,100.0,1.0,0.0,0.0,0.0
LR_numeric_features_raw_ts0.4_rs15,0.4,15.0,1.0,0.0,0.0,0.0


### The best model for the main target

In [13]:
base_linear_regression_model, dict_test_results = lr.linear_regression_control(df, numeric_columns, target_variable_main, test_size=0.1, random_state=15)

for key, value in dict_test_results.items():
    print(f"{key}: {value}")

test_size: 0.1
random_state: 15
R2: 0.9999999999582607
MAE: 1.7139541815768866e-05
RMSE: 7.079246501129246e-05
MSE: 5.011573102375068e-09


# Interpretation of results

- Direct predictors 
- Imbalanced dataset

Extra target is not prepaired for modeling because it contains string values.