In [1]:
import random
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from pathlib import Path

In [4]:
# Notebook-wide configuration.
# SEED feeds the Python/NumPy RNGs and the CatBoost `random_seed` arguments further down.
SEED = 14_300_631
# Fold count for cross-validation; not referenced by the cells visible in this notebook export.
N_FOLDS = 5

In [5]:
# Seed the global RNGs (stdlib first, then NumPy's legacy global generator)
# so that any stochastic step downstream is repeatable.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [6]:
# Load the preprocessed train/test frames produced by an earlier pipeline step.
# NOTE(review): unpickling executes arbitrary code - only load files you generated yourself.
train_path = '../data/preprocessed/train_final.pkl'
test_path = '../data/preprocessed/test_final.pkl'
train, test = map(pd.read_pickle, (train_path, test_path))

In [7]:
# Quick structural summary of the training frame: row/column counts, dtype mix, memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306270 entries, 0 to 306269
Columns: 470 entries, id to position_clean_99
dtypes: bool(17), category(14), datetime64[ns](1), float32(400), float64(15), int64(23)
memory usage: 571.8 MB


In [8]:
# Same summary for the test frame; per the output it has one column fewer than train.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131259 entries, 0 to 131258
Columns: 469 entries, id to position_clean_99
dtypes: bool(17), category(14), datetime64[ns](1), float32(400), float64(14), int64(23)
memory usage: 245.0 MB


In [9]:
# Strip identifier / date columns that must not be fed to the models.
# X_test is derived from the untouched `test` frame (its `id` column is still needed
# when the submission is assembled), so this line is safe to re-run as is.
X_test = test.drop(columns=['id', 'publish_date', 'publish_year'])

# `train` is rebound to its own reduced copy. Without errors='ignore' a second
# execution of this cell would raise KeyError because the columns are already gone;
# with it the cell is idempotent.
train = train.drop(columns=['id', 'publish_date'], errors='ignore')

In [10]:
# NOTE(review): `mean_squared_error` is not used in this cell. The import is kept so that
# any other cell relying on it keeps working, but it belongs in the top imports cell.
from sklearn.metrics import mean_squared_error


# Two-stage ("hurdle") model:
#   1. a classifier estimates P(salary > 0);
#   2. a regressor, fitted only on rows with salary > 0, estimates the target itself.
# The two outputs are multiplied on the target scale and then back-transformed.

X_train = train.drop(['publish_year', 'salary'], axis=1)
y_train = train['salary']

# Every pandas `category` column is handed to CatBoost as a categorical feature.
cat_features = X_train.select_dtypes('category').columns.values

# Hyper-parameters that are identical for both stages. Keeping them in one place
# prevents the classifier and regressor settings from silently drifting apart.
shared_params = dict(
    random_seed=SEED,
    task_type='GPU',
    l2_leaf_reg=7.5608065020565025,
    bootstrap_type='Bayesian',
    depth=10,
    grow_policy='Depthwise',
    one_hot_max_size=2,
    nan_mode='Max',
    border_count=64,
    bagging_temperature=1.5892818323044354,
    min_data_in_leaf=1,
)

# Row mask for a strictly positive target; computed once and reused by both stages.
has_salary = y_train > 0

# 1st model - zeros classifier (class 1 == "salary is non-zero")
y_clf_train = has_salary.astype('int')

clf_model = CatBoostClassifier(
    iterations=153,
    learning_rate=0.04757119641099874,
    **shared_params,
)
print('--------- Train zeros classifier ---------')
clf_model.fit(
    X_train,
    y_clf_train,
    cat_features=cat_features,
    verbose_eval=250,
)

# 2nd model - regressor, trained only on the rows with a positive target
X_reg_train = X_train[has_salary]
y_reg_train = y_train[has_salary]
reg_model = CatBoostRegressor(
    iterations=4200,
    learning_rate=0.0647310802468728,
    **shared_params,
)
print('--------- Train regressor ---------')
reg_model.fit(
    X_reg_train,
    y_reg_train,
    cat_features=cat_features,
    verbose_eval=250,
)

# Inference on the test set.
test_zero_probes = clf_model.predict_proba(X_test)[:, 1]  # P(salary > 0)
y_test_reg = reg_model.predict(X_test)

# Expected target on the training scale: P(salary > 0) * E[target | salary > 0].
# Presumably `salary` was stored as log1p(salary) during preprocessing (the inverse
# applied here is exp(x) - 1) - confirm against the preprocessing step.
# The product is floored at 0 so that a negative regressor output can never become
# a negative salary, and expm1 is used because it is the numerically accurate form
# of exp(x) - 1 for small x.
test_log_predictions = np.clip(y_test_reg * test_zero_probes, 0, None)
test_predictions = np.expm1(test_log_predictions)

--------- Train zeros classifier ---------
0:	learn: 0.5814150	total: 46ms	remaining: 6.98s
152:	learn: 0.0368327	total: 5.9s	remaining: 0us
--------- Train regressor ---------
0:	learn: 0.5844261	total: 74.1ms	remaining: 5m 11s
250:	learn: 0.3811674	total: 12.4s	remaining: 3m 14s
500:	learn: 0.3537467	total: 23.9s	remaining: 2m 56s
750:	learn: 0.3288636	total: 36.2s	remaining: 2m 46s
1000:	learn: 0.3060553	total: 48.6s	remaining: 2m 35s
1250:	learn: 0.2859334	total: 1m 1s	remaining: 2m 24s
1500:	learn: 0.2672285	total: 1m 14s	remaining: 2m 13s
1750:	learn: 0.2503100	total: 1m 27s	remaining: 2m 2s
2000:	learn: 0.2351839	total: 1m 40s	remaining: 1m 50s
2250:	learn: 0.2214685	total: 1m 53s	remaining: 1m 38s
2500:	learn: 0.2081780	total: 2m 7s	remaining: 1m 26s
2750:	learn: 0.1961402	total: 2m 20s	remaining: 1m 14s
3000:	learn: 0.1849097	total: 2m 34s	remaining: 1m 1s
3250:	learn: 0.1744188	total: 2m 47s	remaining: 48.9s
3500:	learn: 0.1648196	total: 3m 1s	remaining: 36.2s
3750:	learn: 0.

In [14]:
# Assemble the submission: the original test ids paired with the predicted salaries.
# `test_predictions` is positionally aligned with the rows of `test`.
submit = test[['id']].assign(salary=test_predictions)

In [15]:
# Sanity-check the prediction distribution (count, range, quartiles) before writing the file.
submit.describe()

Unnamed: 0,id,salary
count,131259.0,131259.0
mean,218912.071987,34581.253842
std,126301.758751,18448.246242
min,2.0,13.896539
25%,109499.0,22697.887759
50%,218986.0,29275.223192
75%,328305.5,40297.366372
max,437528.0,301135.728827


In [16]:
# Persist the submission. The output directory is created first so that a fresh
# checkout (where it may not exist yet) does not fail only after the
# multi-minute training run has finished.
submit_path = Path('../submits/catboost-optuna-final.csv')
submit_path.parent.mkdir(parents=True, exist_ok=True)
submit.to_csv(submit_path, index=False)

In [13]:
# Peek at the back-transformed test predictions (NumPy abbreviates the repr of long arrays).
# NOTE(review): this cell's execution count (13) is lower than that of the cells above it,
# i.e. it was run out of order - confirm the notebook still passes Restart & Run All.
test_predictions

array([22697.19481817, 50636.99456966, 31264.37598074, ...,
       39634.71558829, 33293.84812411, 24420.5652425 ])