In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import warnings
from sklearn.model_selection import train_test_split

# Suppress all warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Set to True if you don't have your own data.
USE_DEMO_DATA = True

# Paths to your own data; only read when USE_DEMO_DATA is False.
TRAIN_DATA_PATH = 'YOUR_TRAIN_DATA_PATH'
TEST_DATA_PATH = 'YOUR_TEST_DATA_PATH'

if not USE_DEMO_DATA:
    # Read the user-supplied train / test sets.
    train_df = pd.read_json(TRAIN_DATA_PATH)
    test_df = pd.read_json(TEST_DATA_PATH)
else:
    # Build demo data for two categories: the dataset at ../data/dataset.json
    # plus a copy of it whose category_id is overwritten with 2.
    df = pd.read_json('../data/dataset.json')
    cat_2_df = df.assign(category_id=2)

    # Both frames are split with exactly the same settings.
    split_kwargs = dict(shuffle=True, random_state=42, test_size=0.078)
    train_1, test_1 = train_test_split(df, **split_kwargs)
    train_2, test_2 = train_test_split(cat_2_df, **split_kwargs)

    # Stack the per-category splits and renumber the rows from 0.
    train_df = pd.concat([train_1, train_2], ignore_index=True)
    test_df = pd.concat([test_1, test_2], ignore_index=True)

train_df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,category_id,published_date,price,update_stat_date,refreshed_date,view_count,is_sticky,item_origin,extended_guarantee,nsb,...,steam_cs2_ban_type,steam_rust_kill_player,steam_rust_deaths,steam_total_gifts_rub,steam_total_refunds_rub,steam_total_ingame_rub,steam_total_games_rub,steam_total_purchased_rub,steam_dota2_last_match_date,sold_price
0,1,1695058353,360,1695227440,1695058353,9,0,fishing,-1,1,...,0,0,0,0,0,0,0,0,0,360
1,1,1689775707,115,0,1689775707,8,0,brute,0,1,...,0,0,0,0,0,0,0,0,0,115
2,1,1694338912,145,0,1695842027,71,0,autoreg,1,1,...,0,0,0,0,0,0,0,0,0,145
3,1,1694791149,649,0,1694791149,59,0,resale,0,0,...,0,0,0,0,0,0,0,0,0,649
4,1,1695738971,40,1695847124,1695738971,10,0,brute,0,1,...,0,0,0,0,0,0,0,0,0,40


In [2]:
# Distinct category ids present in the training data, in order of first appearance.
category_column = train_df['category_id']
unique_categories = category_column.unique()

print(unique_categories)

[1 2]


#### Splitting data into separate categories by category_id

In [3]:
# Map each category id to an independent copy of its training rows.
categories_dfs = {
    category: train_df.loc[train_df['category_id'] == category].copy()
    for category in tqdm(unique_categories)
}

categories_dfs.keys()

100%|██████████| 2/2 [00:00<00:00, 11.60it/s]


dict_keys([1, 2])

## Training models for each category

In [4]:
from single_cat_model import SingleCategoryModel

# Trained model per category id, filled in as each training run finishes.
categories_models = {}

# categories_dfs was built in the order of unique_categories, so iterating
# its items visits the categories in that same order.
for category, category_train_df in categories_dfs.items():
    print("Training model for category", category, end='\n\n')

    model = SingleCategoryModel(category_number=category)
    model.train(category_train_df)

    # Register the model only after training has completed.
    categories_models[category] = model

Training model for category 1

Initialized SingleCategoryModel for category 1.
TRAINING FOLD 1 of 3 total.

TRAINING FOLD 2 of 3 total.

TRAINING FOLD 3 of 3 total.

Training complete.
Training model for category 2

Initialized SingleCategoryModel for category 2.
TRAINING FOLD 1 of 3 total.

TRAINING FOLD 2 of 3 total.

TRAINING FOLD 3 of 3 total.

Training complete.


### Saving models to ONNX and CBM format

In [5]:
# Display the trained model objects, keyed by category id.
categories_models

{1: <single_cat_model.SingleCategoryModel at 0x16b078410>,
 2: <single_cat_model.SingleCategoryModel at 0x177cf5bd0>}

In [6]:
# Export every trained model to its own per-category ONNX file.
# NOTE(review): only an ONNX output path is passed here; whether export()
# also writes a CBM file is not visible from this notebook — confirm.
for category, trained_model in categories_models.items():
    trained_model.export(output_path_onnx=f'./models/onnx/category_{category}_model.onnx')

Model exported to ./models/onnx/category_1_model.onnx
Model exported to ./models/onnx/category_2_model.onnx


## Validating model for each category

### Loading available models and validating on test data

In [7]:
from single_cat_model import SingleCategoryModel
import pandas as pd
import numpy as np

# Categories you saved your models for.
categories = [1, 2]

# Split the test set into one frame per category.
test_categories_dfs = {
    category: test_df[test_df['category_id'] == category]
    for category in categories
}

# Validation metrics per category id.
metrics_dict = {}

for category in categories:
    # Start from an empty model and load the saved weights into it.
    # NOTE(review): the path points at the .onnx export; confirm that
    # load_model expects ONNX here rather than a CBM file.
    model = SingleCategoryModel(category_number=category)
    model_path = f'./models/onnx/category_{category}_model.onnx'
    model.load_model(model_path)

    # Validate on this category's test rows and save the Pearson plot.
    plot_path = f'./validation_plots/cat_{category}_pearson.png'
    metrics = model.validate(
        valid_df=test_categories_dfs[category],
        save_plot_path=plot_path,
    )

    # Keep the metrics for later inspection.
    metrics_dict[category] = metrics

    print('\n')

Initialized SingleCategoryModel for category 1.
Loaded the model from ./models/onnx/category_1_model.onnx

Regression Metrics:

Mean Absolute Error (MAE): 28.9945
Mean Squared Error (MSE): 204498.9485
Root Mean Squared Error (RMSE): 452.2156
R² Score: 0.8733
Pearson Correlation (Full Dataset): 0.9457
Pearson Correlation (Sample Size 100): 0.9978
Pearson Correlation (Sample Size 1000): 0.9971
Pearson Correlation (Sample Size 10000): 0.9472
Pearson correlation plot saved to ./validation_plots/cat_1_pearson.png


Initialized SingleCategoryModel for category 2.
Loaded the model from ./models/onnx/category_2_model.onnx

Regression Metrics:

Mean Absolute Error (MAE): 28.9945
Mean Squared Error (MSE): 204498.9485
Root Mean Squared Error (RMSE): 452.2156
R² Score: 0.8733
Pearson Correlation (Full Dataset): 0.9457
Pearson Correlation (Sample Size 100): 0.9997
Pearson Correlation (Sample Size 1000): 0.9754
Pearson Correlation (Sample Size 10000): 0.9457
Pearson correlation plot saved to ./valid

In [8]:
# Display the validation metrics collected for each category.
metrics_dict

{1: {'mae': 28.99447410787435,
  'mse': 204498.94854526053,
  'rmse': 452.2155996261745,
  'r2': 0.8732726429267379,
  'pearson_correlation_full': 0.9456589207734637,
  'sample_pearsons': {100: 0.9977960280686196,
   1000: 0.9970809992728634,
   10000: 0.94717873166855}},
 2: {'mae': 28.99447410787435,
  'mse': 204498.94854526053,
  'rmse': 452.2155996261745,
  'r2': 0.8732726429267379,
  'pearson_correlation_full': 0.9456589207734637,
  'sample_pearsons': {100: 0.9996690478731373,
   1000: 0.9753941201798361,
   10000: 0.9456995750516152}}}

## Finetuning example

In [10]:
# Category whose saved model is finetuned in this example.
FINETUNE_CATEGORY = 1

# Load that category's model explicitly instead of reusing whatever `model`
# the validation loop left bound (the model of the last category it visited).
model = SingleCategoryModel(category_number=FINETUNE_CATEGORY)
model.load_model(f'./models/onnx/category_{FINETUNE_CATEGORY}_model.onnx')

# Use the per-category test split, which is built for both demo and user
# data; `test_1` only exists when USE_DEMO_DATA is True.
finetune_df = test_categories_dfs[FINETUNE_CATEGORY]

model.finetune(finetune_df)

# NOTE(review): validation runs on the same rows used for finetuning, so
# these metrics are not a held-out estimate.
metrics = model.validate(finetune_df)

print(metrics)

An error occurred during finetuning: /Users/zomb-ml-platform-msk/go-agent-21.2.0/pipelines/BuildMaster/catboost.git/catboost/libs/model/model.cpp:1885: Summation of symmetric and non-symmetric models is not supported [for now]

Regression Metrics:

Mean Absolute Error (MAE): 28.9945
Mean Squared Error (MSE): 204498.9485
Root Mean Squared Error (RMSE): 452.2156
R² Score: 0.8733
Pearson Correlation (Full Dataset): 0.9457
Pearson Correlation (Sample Size 100): 0.9985
Pearson Correlation (Sample Size 1000): 0.9663
Pearson Correlation (Sample Size 10000): 0.9456
Pearson correlation plot saved to pearson_vs_samples.png
{'mae': 28.99447410787435, 'mse': 204498.94854526053, 'rmse': 452.2155996261745, 'r2': 0.8732726429267379, 'pearson_correlation_full': 0.9456589207734637, 'sample_pearsons': {100: 0.9985431313400233, 1000: 0.9662619919939525, 10000: 0.9456029638909101}}
