In [1]:
# CatBoost

!pip install seaborn catboost -q

import numpy as np
import pandas as pd

In [2]:
# Location of the training table, kept in `path` so it is set in one place.
path = "train.csv"
train = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Column roles for the training table.
cat_features = ["model", "car_type", "fuel_type"]  # cast to str below, passed to CatBoost as categorical
targets = ["target_class", "target_reg"]
features2drop = ["car_id"]

# Model inputs: every column of `train` that is neither a target nor explicitly dropped.
excluded_columns = set(targets) | set(features2drop)
filtered_features = [column for column in train.columns if column not in excluded_columns]
# Whatever remains after removing the categorical columns is counted as numeric.
num_features = [column for column in filtered_features if column not in cat_features]

print("cat_features", cat_features)
print("num_features", len(num_features))
print("targets", targets)

# Store the categorical columns as strings.
train = train.astype({column: str for column in cat_features})

cat_features ['model', 'car_type', 'fuel_type']
num_features 11
targets ['target_class', 'target_reg']


In [4]:
from sklearn.model_selection import train_test_split

# Regression setup: the label is target_reg, the inputs are the filtered feature columns.
# filtered_features already excludes the targets, so the drop below is only a safety net.
y = train["target_reg"]
X = train[filtered_features].drop(columns=targets, errors="ignore")

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

# Baseline CatBoost regressor: only the categorical columns and the RMSE metric are set.
baseline_params = {"cat_features": cat_features, "eval_metric": "RMSE"}
model = CatBoostRegressor(**baseline_params)

# Fit on the training split, evaluate on the hold-out split, log every 500 iterations.
holdout = (X_test, y_test)
model.fit(X_train, y_train, eval_set=holdout, verbose=500, plot=False)

print(model.best_score_)

Learning rate set to 0.056174
0:	learn: 17.1114797	test: 17.8473672	best: 17.8473672 (0)	total: 175ms	remaining: 2m 54s
500:	learn: 7.3441758	test: 12.4335676	best: 12.1585875 (158)	total: 6.6s	remaining: 6.58s
999:	learn: 5.2987756	test: 12.6087656	best: 12.1585875 (158)	total: 13s	remaining: 0us

bestTest = 12.15858747
bestIteration = 158

Shrink model to first 159 iterations.
{'learn': {'RMSE': 5.29877563377732}, 'validation': {'RMSE': 12.158587472625873}}


In [6]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

# Model hyperparameter tuning
tuned_params = dict(
    depth=4,
    iterations=1000,
    learning_rate=0.09,
    cat_features=cat_features,
    colsample_bylevel=0.99,
    max_bin=190,
    l2_leaf_reg=5,
    subsample=0.5,
)
cbr = CatBoostRegressor(**tuned_params)

# Same training / hold-out splits and logging interval as the baseline run.
cbr.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=500, plot=False)

print(cbr.best_score_)

0:	learn: 16.8709193	test: 17.5903213	best: 17.5903213 (0)	total: 10.9ms	remaining: 10.9s
500:	learn: 8.6450532	test: 12.2367538	best: 11.9853255 (111)	total: 3.93s	remaining: 3.91s
999:	learn: 7.2909046	test: 12.3802641	best: 11.9853255 (111)	total: 7.82s	remaining: 0us

bestTest = 11.98532547
bestIteration = 111

Shrink model to first 112 iterations.
{'learn': {'RMSE': 7.2909045759399325}, 'validation': {'RMSE': 11.985325469871208}}


In [7]:
# Score the test set with the tuned CatBoost model and write the predictions to CSV.
test = pd.read_csv('test.csv')
# Apply the same string cast that was applied to the training categorical columns.
for c in cat_features:
    test[c] = test[c].astype(str)

# Select the feature columns the model was fitted on, in the same order.
x_test = test[filtered_features].drop(targets, axis=1, errors="ignore")

y_pred = cbr.predict(x_test)
CatBoostReg_result = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
# index=False: write only car_id and target_reg, without an extra unnamed row-index column.
CatBoostReg_result.to_csv('CatBoostReg_result.csv', index=False)

In [8]:
# LightGBM

!pip install lightgbm -q

import lightgbm as lgb

# Reload the training table for the LightGBM experiment.
df = pd.read_csv("train.csv")

cat_cols = ["car_type", "fuel_type", "model"]
drop_cols = ["car_id", "target_reg", "target_class"]

# The label is target_reg; the inputs are all columns not listed in drop_cols.
y = df["target_reg"]
X = df.drop(columns=drop_cols)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes.
# One encoder object is re-fitted per column, so afterwards `le` only holds
# the classes of the last column in cat_cols.
le = LabelEncoder()
X = X.assign(**{name: le.fit_transform(X[name]) for name in cat_cols})

In [10]:
# 80/20 hold-out split of the label-encoded frame, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [11]:
from lightgbm import Dataset

# Native LightGBM datasets for the training and hold-out splits.
# Both declare the same categorical columns and keep the raw data available.
dataset_options = {"categorical_feature": cat_cols, "free_raw_data": False}

train_data = Dataset(X_train, y_train, **dataset_options)

# reference=train_data: LightGBM's docs ask for a validation Dataset to be built
# against the training Dataset so both share the same feature binning.
val_data = Dataset(X_test, y_test, reference=train_data, **dataset_options)

In [12]:
# Import only the estimator. `from lightgbm import train` would rebind the
# notebook-level name `train` (the training DataFrame loaded earlier) to a
# LightGBM function; the function stays reachable as `lgb.train` if needed.
from lightgbm import LGBMRegressor


# Baseline LightGBM regressor: default hyperparameters, RMSE as the evaluation metric.
reg = LGBMRegressor(metric = "RMSE")

# Fit on the training split and evaluate on the hold-out split.
reg.fit(
    X_train,
    y_train,
    eval_set=[
        (X_test, y_test),
    ],
)
print(reg.best_score_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 1869, number of used features: 13
[LightGBM] [Info] Start training from score 44.797913
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('rmse', 12.286649812737657)])})


In [13]:
# Model hyperparameter tuning
# NOTE(review): cat_feature uses positional column indices; presumably the first
# three columns of X_train are the categorical ones. Confirm against the column order.
tuned_lgbm_params = {
    "n_estimators": 40,
    "learning_rate": 0.09,
    "cat_feature": [0, 1, 2],
    "num_leaves": 8,
    "metric": "RMSE",
}
reg = LGBMRegressor(**tuned_lgbm_params)

# Fit on the training split and evaluate on the hold-out split.
holdout_sets = [(X_test, y_test)]
reg.fit(X_train, y_train, eval_set=holdout_sets)

print(reg.best_score_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1636
[LightGBM] [Info] Number of data points in the train set: 1869, number of used features: 13
[LightGBM] [Info] Start training from score 44.797913
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('rmse', 11.980153054703948)])})


In [14]:
# Score the test set with the tuned LightGBM model and write the predictions to CSV.
test = pd.read_csv("test.csv")

drop_cols = ["car_id", "target_class"]
x_test = test.drop(drop_cols, axis=1)

# Encode the test categoricals with the mapping learned from the TRAINING data.
# Re-fitting the encoder on the test columns would derive the codes from the test
# categories alone, which can differ from the codes the model was trained on.
for col in cat_cols:
    # `df` is the raw LightGBM training frame; fitting on it reproduces the
    # sorted class list used when the training columns were encoded.
    train_encoder = LabelEncoder().fit(df[col])
    # A value's position in classes_ is its LabelEncoder code. Values not seen in
    # training get code -1 (LightGBM documents negative categorical values as missing).
    x_test[col] = pd.Categorical(
        x_test[col], categories=train_encoder.classes_
    ).codes.astype("int64")

y_pred = reg.predict(x_test)
LGBMReg_result = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
# index=False: write only car_id and target_reg, without an extra unnamed row-index column.
LGBMReg_result.to_csv('LGBMReg_result.csv', index=False)

In [15]:
# XGBoost
!pip install xgboost -q
import xgboost as xgb
import warnings; warnings.filterwarnings("ignore")

In [16]:
# Reload the training table for the XGBoost experiment.
trainDF = pd.read_csv("train.csv")
cat_cols = ['car_type', 'fuel_type', 'model']
drop_cols = ['car_id', 'target_reg', 'target_class']

y = trainDF['target_reg']
# Drop the id/target columns and convert the categorical columns to pandas `category` dtype.
X = trainDF.drop(columns=drop_cols).astype({name: 'category' for name in cat_cols})

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [17]:
from sklearn.metrics import mean_poisson_deviance, mean_squared_error

# Baseline XGBoost regressor with native categorical support.
# early_stopping_rounds is set on the estimator: passing it to fit() has been
# deprecated since XGBoost 1.6 and is rejected by newer releases.
# NOTE(review): early stopping monitors mean_poisson_deviance while the score
# reported below is RMSE. Confirm that this is intended.
reg = xgb.XGBRegressor(tree_method="hist",
                       eval_metric=mean_poisson_deviance,
                       enable_categorical=True,
                       n_estimators=30,
                       n_jobs=-1,
                       early_stopping_rounds=10,
)
# Evaluate on the hold-out split each round; print the metrics every 3 rounds.
reg.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=3)

# Predict with the trees up to and including the best iteration; the hold-out
# RMSE is the cell's displayed value.
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
mean_squared_error(y_test, y_pred) ** 0.5

[0]	validation_0-rmse:15.73818	validation_0-mean_poisson_deviance:4.83291
[3]	validation_0-rmse:13.04271	validation_0-mean_poisson_deviance:3.25132
[6]	validation_0-rmse:12.64687	validation_0-mean_poisson_deviance:3.07632
[9]	validation_0-rmse:12.65160	validation_0-mean_poisson_deviance:3.07671
[12]	validation_0-rmse:12.67780	validation_0-mean_poisson_deviance:3.08867
[15]	validation_0-rmse:12.75195	validation_0-mean_poisson_deviance:3.13961
[18]	validation_0-rmse:12.88637	validation_0-mean_poisson_deviance:3.20034


12.621295288019933

In [18]:
# Model hyperparameter tuning

# early_stopping_rounds is set on the estimator: passing it to fit() has been
# deprecated since XGBoost 1.6 and is rejected by newer releases.
reg = xgb.XGBRegressor(tree_method="hist",
                       eval_metric=mean_poisson_deviance,
                       enable_categorical=True,
                       n_estimators=40,
                       n_jobs=-1,
                       min_child_weight=32,
                       max_bin=128,
                       reg_lambda=275,
                       early_stopping_rounds=10,)

# Evaluate on the hold-out split each round; print the metrics every 3 rounds.
reg.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=3)

# Predict with the trees up to and including the best iteration; the hold-out
# RMSE is the cell's displayed value.
y_pred = reg.predict(X_test, iteration_range=(0, reg.best_iteration + 1))
mean_squared_error(y_test, y_pred) ** 0.5

[0]	validation_0-rmse:16.97257	validation_0-mean_poisson_deviance:5.67316
[3]	validation_0-rmse:14.55563	validation_0-mean_poisson_deviance:4.06194
[6]	validation_0-rmse:13.44591	validation_0-mean_poisson_deviance:3.44482
[9]	validation_0-rmse:12.88900	validation_0-mean_poisson_deviance:3.16361
[12]	validation_0-rmse:12.50357	validation_0-mean_poisson_deviance:2.97651
[15]	validation_0-rmse:12.27803	validation_0-mean_poisson_deviance:2.87298
[18]	validation_0-rmse:12.14974	validation_0-mean_poisson_deviance:2.82255
[21]	validation_0-rmse:12.04237	validation_0-mean_poisson_deviance:2.77504
[24]	validation_0-rmse:12.01053	validation_0-mean_poisson_deviance:2.77109
[27]	validation_0-rmse:11.96352	validation_0-mean_poisson_deviance:2.75434
[30]	validation_0-rmse:11.95310	validation_0-mean_poisson_deviance:2.75635
[33]	validation_0-rmse:11.94365	validation_0-mean_poisson_deviance:2.75827
[36]	validation_0-rmse:11.91017	validation_0-mean_poisson_deviance:2.74676
[39]	validation_0-rmse:11.906

11.90601931806027

In [19]:
# Score the test set with the tuned XGBoost model and write the predictions to CSV.
test = pd.read_csv('test.csv')

drop_cols = ['car_id', 'target_class']
x_test = test.drop(drop_cols, axis=1)

# Reuse the categorical dtypes of the training frame X (same category sets, same order).
# A plain astype('category') would derive the categories from the test data alone,
# so the underlying integer codes could disagree with those seen during training.
# Values absent from the training categories become NaN.
for col in cat_cols:
    x_test[col] = x_test[col].astype(X[col].dtype)

y_pred = reg.predict(x_test)
XGBReg_result = pd.DataFrame({'car_id': test['car_id'], 'target_reg': y_pred})
# index=False: write only car_id and target_reg, without an extra unnamed row-index column.
XGBReg_result.to_csv('XGBReg_result.csv', index=False)