In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import (
    mean_absolute_error,
    r2_score
)

df = pd.read_csv('../data/dataset.csv')

In [2]:
df.head()

Unnamed: 0,ID кампании,ID баннера,Тип баннера,Тип устройства,Показы,Переходы,CTR
0,3405596,15262577,interactive,Компьютер,12596,27,0.2144
1,3405596,15257617,interactive,Смартфон,9812,30,0.3057
2,3405596,15257617,interactive,Планшет,224,2,0.8929
3,3405596,15262577,interactive,Компьютер,12511,25,0.1998
4,3405596,15257617,interactive,Смартфон,9822,38,0.3869


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID кампании     1205 non-null   int64  
 1   ID баннера      1205 non-null   int64  
 2   Тип баннера     1205 non-null   object 
 3   Тип устройства  1205 non-null   object 
 4   Показы          1205 non-null   int64  
 5   Переходы        1205 non-null   int64  
 6   CTR             1205 non-null   float64
dtypes: float64(1), int64(4), object(2)
memory usage: 66.0+ KB


In [4]:
target_col = "CTR"
leakage_cols = "Переходы"

y = df[target_col]

X = df.drop(columns=[target_col, leakage_cols])

In [5]:
cat_cols = ['Тип баннера', 'Тип устройства']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [7]:
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.05,
    depth=6,
    cat_features=cat_cols,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    verbose=False,
)

In [8]:
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=200,
    use_best_model=True,
    verbose=200
)

0:	learn: 10.5401877	test: 1.6548614	best: 1.6548614 (0)	total: 48.1ms	remaining: 1m 36s
200:	learn: 4.0361835	test: 1.7529546	best: 1.6502884 (1)	total: 289ms	remaining: 2.59s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 1.650288417
bestIteration = 1

Shrink model to first 2 iterations.


<catboost.core.CatBoostRegressor at 0x7307f73556a0>

In [9]:
preds = model.predict(X_test)

In [10]:
mae = mean_absolute_error(y_test, preds)

In [11]:
r2 = r2_score(y_test, preds)

In [12]:
model.save_model("model.cbm")