In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import (
    mean_absolute_error,
    r2_score
)

df = pd.read_csv('../data/dataset.csv')

In [2]:
df.head()

Unnamed: 0,ID кампании,ID баннера,Тип баннера,Тип устройства,Показы,Переходы,CTR
0,3405596,15262577,interactive,Компьютер,12596,27,0.00214
1,3405596,15257617,interactive,Смартфон,9812,30,0.00306
2,3405596,15257617,interactive,Планшет,224,2,0.00893
3,3405596,15262577,interactive,Компьютер,12511,25,0.002
4,3405596,15257617,interactive,Смартфон,9822,38,0.00387


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID кампании     1205 non-null   int64  
 1   ID баннера      1205 non-null   int64  
 2   Тип баннера     1205 non-null   object 
 3   Тип устройства  1205 non-null   object 
 4   Показы          1205 non-null   int64  
 5   Переходы        1205 non-null   int64  
 6   CTR             1205 non-null   float64
dtypes: float64(1), int64(4), object(2)
memory usage: 66.0+ KB


In [3]:
target_col = "Переходы"
leakage_cols = "CTR"

y = df[target_col]

X = df.drop(columns=[target_col, leakage_cols])

In [4]:
cat_cols = ['Тип баннера', 'Тип устройства']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [6]:
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.05,
    depth=6,
    cat_features=cat_cols,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    verbose=False,
)

In [8]:
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=200,
    use_best_model=True,
    verbose=200
)

0:	learn: 22.9484218	test: 24.6811755	best: 24.6811755 (0)	total: 2.13ms	remaining: 4.27s
200:	learn: 5.2189918	test: 5.9458985	best: 5.9443025 (196)	total: 138ms	remaining: 1.24s
400:	learn: 4.5240408	test: 5.6945528	best: 5.6945528 (400)	total: 254ms	remaining: 1.01s
600:	learn: 4.1169058	test: 5.5814414	best: 5.5802682 (596)	total: 362ms	remaining: 843ms
800:	learn: 3.8151803	test: 5.4982943	best: 5.4982943 (800)	total: 475ms	remaining: 711ms
1000:	learn: 3.5648589	test: 5.4767382	best: 5.4713767 (977)	total: 589ms	remaining: 588ms
1200:	learn: 3.3661319	test: 5.4684974	best: 5.4682331 (1189)	total: 703ms	remaining: 467ms
1400:	learn: 3.1984643	test: 5.4652768	best: 5.4622620 (1214)	total: 818ms	remaining: 350ms
Stopped by overfitting detector  (200 iterations wait)

bestTest = 5.46226198
bestIteration = 1214

Shrink model to first 1215 iterations.


<catboost.core.CatBoostRegressor at 0x7161af9656a0>

In [None]:
preds = model.predict(X_test)

In [None]:
mae = mean_absolute_error(y_test, preds)

In [None]:
r2 = r2_score(y_test, preds)

In [None]:
model.save_model("model.cbm")