In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from itertools import combinations, chain

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from joblib import dump


In [19]:
df = pd.read_csv('../data/dataset_regression.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.isnull().sum().sum()

0

In [20]:
y = df["price_usd"]
X = df.drop(["price_usd"], axis=1)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

In [22]:
cbr = CatBoostRegressor(learning_rate=0.05, 
                        task_type="GPU",
                        devices='0:4',
                        logging_level='Verbose'
                        )

In [23]:
params = {'depth': np.arange(3, 10), 
          'l2_leaf_reg': np.arange(1, 20),
          'bootstrap_type': ['Bayesian', 'Bernoulli', 'Poisson'],
          'num_trees': np.arange(1, 20)
          }

In [24]:
catboost_regressor = cbr.randomized_search(params, X=X_train[0:1000], y=y_train[0:1000], n_iter=3)

0:	learn: 8451.8112615	test: 8238.4001554	best: 8238.4001554 (0)	total: 19.2ms	remaining: 115ms
1:	learn: 8152.7497202	test: 7932.9419915	best: 7932.9419915 (1)	total: 35.2ms	remaining: 88.1ms
2:	learn: 7867.6570134	test: 7636.6489824	best: 7636.6489824 (2)	total: 50.8ms	remaining: 67.7ms
3:	learn: 7596.7154165	test: 7346.9017361	best: 7346.9017361 (3)	total: 69.8ms	remaining: 52.4ms
4:	learn: 7348.9520559	test: 7088.3344391	best: 7088.3344391 (4)	total: 80.9ms	remaining: 32.4ms
5:	learn: 7102.3497182	test: 6817.9317450	best: 6817.9317450 (5)	total: 93.9ms	remaining: 15.7ms
6:	learn: 6871.3955497	test: 6578.2084856	best: 6578.2084856 (6)	total: 108ms	remaining: 0us
bestTest = 6578.208486
bestIteration = 6
0:	loss: 6578.2084856	best: 6578.2084856 (0)	total: 266ms	remaining: 532ms
0:	learn: 8429.4305003	test: 8228.4719359	best: 8228.4719359 (0)	total: 37.6ms	remaining: 150ms
1:	learn: 8121.4994478	test: 7903.6650941	best: 7903.6650941 (1)	total: 51.2ms	remaining: 76.7ms
2:	learn: 7809.82

In [25]:
cbr.fit(X_train, y_train)

0:	learn: 6184.6102377	total: 48.8ms	remaining: 293ms
1:	learn: 5963.5362570	total: 56.3ms	remaining: 141ms
2:	learn: 5755.0002030	total: 61.9ms	remaining: 82.6ms
3:	learn: 5559.4049814	total: 70.6ms	remaining: 53ms
4:	learn: 5378.6377673	total: 76.1ms	remaining: 30.4ms
5:	learn: 5204.8724747	total: 82.1ms	remaining: 13.7ms
6:	learn: 5041.9601950	total: 89.8ms	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1c37c31cca0>

In [26]:
cbr_predict = cbr.predict(X_test)
cbr_predict

array([7703.65632248, 6761.1138792 , 5695.45965958, ..., 5608.8406601 ,
       5551.60674286, 6911.96682167])

In [27]:
errors = {'MAE': mean_absolute_error(cbr_predict, y_test), 
          'RMSE': mean_squared_error(cbr_predict, y_test),
          'MSE': mean_squared_error(cbr_predict, y_test)**0.5,
          'MAPE': mean_absolute_percentage_error(cbr_predict, y_test),
          'R^2': cbr.score(X_test, y_test)}
errors

{'MAE': 3505.192553043869,
 'RMSE': 25634782.20675658,
 'MSE': 5063.080308148053,
 'MAPE': 0.5096251560451207,
 'R^2': 0.3828437219591503}