# Load data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_dir_path = '/content/drive/MyDrive/ML Project/data/'
train_featured = pd.read_csv(data_dir_path + 'train_X_y_features_v1.csv')
test_featured = pd.read_csv( data_dir_path + 'test_X_y_features_v1.csv')
print(train_featured.shape)
print(test_featured.shape)

(1202739, 43)
(300685, 43)


In [None]:
train_X = train_featured.drop(['title','description', 'item_id', 'user_id', 'item_seq_number', 'activation_date', 'image',
       'image_top_1', 'deal_probability'], axis=1)
test_X = test_featured.drop(['title','description', 'item_id', 'user_id', 'item_seq_number', 'activation_date', 'image',
       'image_top_1', 'deal_probability'], axis=1)
train_y = train_featured['deal_probability']
test_y = test_featured['deal_probability']

In [None]:
for col in train_X.select_dtypes(include=['object']).columns:
    # le = preprocessing.LabelEncoder()
    # le.fit(list(train_X[col].astype(str).values) + list(test_X[col].astype(str).values))
    # train_X[col] = le.transform(list(train_X[col].astype(str).values))
    train_X[col] = train_X[col].astype('category')
    test_X[col] = test_X[col].astype('category')

In [None]:
train_X.columns

Index(['region', 'city', 'parent_category_name', 'category_name', 'param_1',
       'param_2', 'param_3', 'price', 'user_type', 'region_city',
       'all_category', 'category_param_1', 'region_category_user',
       'city_category_user', 'category_price_mean', 'category_price_std',
       'category_price_skew', 'city_price_mean', 'city_price_max',
       'city_price_skew', 'title_length', 'description_length',
       'title_word_count', 'description_word_count', 'title_has_keyword',
       'description_has_keyword', 'title_digit_count',
       'description_digit_count', 'description_newline_count', 'price_log',
       'price_bin', 'price_to_category_mean', 'price_to_category_max',
       'description_missing'],
      dtype='object')

# First stage model: log_price ~ category and param

In [None]:
category_and_param_features = [
    'parent_category_name', 'category_name', 'param_1',
    'param_2', 'param_3', 'region_city', 'all_category', 'category_param_1',
    'region_category_user', 'city_category_user',
    'category_price_mean', 'category_price_std', 'category_price_skew',
    'price_log', 'price_bin', 'price_to_category_mean', 'price_to_category_max',
]

In [None]:
X_train_price_1 = train_X[category_and_param_features]
y_train_price_1 = train_X['price_log']
X_test_price_1 = test_X[category_and_param_features]

train_X_price_1, val_X_price_1, train_y_price_1, val_y_price_1 = train_test_split(
    X_train_price_1, y_train_price_1, test_size=0.2, random_state=42
)
train_X_price_1 = lgb.Dataset(train_X_price_1, label=train_y_price_1)
val_X_price_1 = lgb.Dataset(val_X_price_1, label=val_y_price_1)

model_price_catparam = lgb.train(
    {
        "objective": "regression",
        "metric": "rmse",
        "boosting": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 128,
        "max_depth": -1,
    },
    train_set=train_X_price_1,
    valid_sets=[train_X_price_1, val_X_price_1],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=40)],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13466
[LightGBM] [Info] Number of data points in the train set: 962191, number of used features: 17
[LightGBM] [Info] Start training from score 7.409326
Training until validation scores don't improve for 50 rounds
[40]	training's rmse: 0.411689	valid_1's rmse: 0.412308
[80]	training's rmse: 0.0551249	valid_1's rmse: 0.0552906
[120]	training's rmse: 0.0155003	valid_1's rmse: 0.0169068
[160]	training's rmse: 0.0124383	valid_1's rmse: 0.0155552
[200]	training's rmse: 0.0115397	valid_1's rmse: 0.0155041
[240]	training's rmse: 0.0111189	valid_1's rmse: 0.015443
[280]	training's rmse: 0.0108202	valid_1's rmse: 0.0154017
[320]	training's rmse: 0.0105574	valid_1's rmse: 0.0153738
[360]	training's rmse: 0.0103102	valid_1's rmse: 0.01537
[400]	training's rmse: 0.0101621	valid_1's rmse: 0.0153518
[440]	traini

# First stage model: log_price ~ city region and user type

In [None]:
user_region_city_features = [
    'region', 'city', 'user_type', 'region_city',
    'region_category_user', 'city_category_user',
    'city_price_mean', 'city_price_max', 'city_price_skew',
]

X_train_price_2 = train_X[user_region_city_features]
y_train_price_2 = train_X['price_log']
X_test_price_2 = test_X[user_region_city_features]

train_X_price_2, val_X_price_2, train_y_price_2, val_y_price_2 = train_test_split(
    X_train_price_2, y_train_price_2, test_size=0.2, random_state=42
)

train_X_price_2 = lgb.Dataset(train_X_price_2, label=train_y_price_2)
val_X_price_2 = lgb.Dataset(val_X_price_2, label=val_y_price_2)

model_price_cityuser = lgb.train(
    {
        "objective": "regression",
        "metric": "rmse",
        "boosting": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 128,
        "max_depth": -1,
    },
    train_set=train_X_price_2,
    valid_sets=[train_X_price_2, val_X_price_2],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=40)],
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010647 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12731
[LightGBM] [Info] Number of data points in the train set: 962191, number of used features: 9
[LightGBM] [Info] Start training from score 7.409326
Training until validation scores don't improve for 50 rounds
[40]	training's rmse: 2.29542	valid_1's rmse: 2.32152
[80]	training's rmse: 2.26252	valid_1's rmse: 2.30557
[120]	training's rmse: 2.25325	valid_1's rmse: 2.3064
Early stopping, best iteration is:
[85]	training's rmse: 2.2608	valid_1's rmse: 2.30549


# Second Stage: Stacking Both First Stage Trees

In [None]:
params_second_stage = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.02,
    'num_leaves': 512,
    'max_depth': 20,
    'bagging_seed': 42,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.8,
    'lambda_l1': 2.25,
    'lambda_l2': 0.015,
    'verbose': -1
}

In [None]:
stack_train_X = train_X.copy()
stack_test_X = test_X.copy()

stack_train_X['price_log_1'] = model_price_catparam.predict(X_train_price_1)
stack_test_X['price_log_1'] = model_price_catparam.predict(X_test_price_1)

stack_train_X['price_log_2'] = model_price_cityuser.predict(X_train_price_2)
stack_test_X['price_log_2'] = model_price_cityuser.predict(X_test_price_2)

In [None]:
# for some reason (package update?) I need to use categorical indices here
# if using the fit_lgb() function in previous notebooks, will get wrong type error
categorical_features = stack_train_X.select_dtypes(include=['category']).columns.tolist()
categorical_indices = [stack_train_X.columns.get_loc(col) for col in categorical_features]

train_X_stack, val_X_stack, train_y_stack, val_y_stack = train_test_split(
    stack_train_X, train_y, test_size=0.2, random_state=42
)

train_X_stack = lgb.Dataset(train_X_stack, label=train_y_stack,
                            categorical_feature = categorical_indices)
val_X_stack = lgb.Dataset(val_X_stack, label=val_y_stack,
                          categorical_feature = categorical_indices,
                          reference=train_X_stack)

stack_model = lgb.train(
    params_second_stage,
    train_set=train_X_stack,
    valid_sets=[train_X_stack, val_X_stack],
    num_boost_round=2000,
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=40)],
)

Training until validation scores don't improve for 50 rounds
[40]	training's rmse: 0.233204	valid_1's rmse: 0.237504
[80]	training's rmse: 0.224558	valid_1's rmse: 0.231588
[120]	training's rmse: 0.220617	valid_1's rmse: 0.23011
[160]	training's rmse: 0.218323	valid_1's rmse: 0.22967
[200]	training's rmse: 0.216844	valid_1's rmse: 0.229524
[240]	training's rmse: 0.216044	valid_1's rmse: 0.229492
[280]	training's rmse: 0.21553	valid_1's rmse: 0.229481
[320]	training's rmse: 0.215015	valid_1's rmse: 0.22948
[360]	training's rmse: 0.214633	valid_1's rmse: 0.22949
Early stopping, best iteration is:
[337]	training's rmse: 0.214792	valid_1's rmse: 0.229467


In [None]:
# stack_train_X.to_csv(data_dir_path + 'stack_train_X.csv', index=False)
# stack_test_X.to_csv(data_dir_path + 'stack_test_X.csv', index=False)

In [None]:
preds = stack_model.predict(stack_test_X)
rmse = mean_squared_error(test_y, preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.22790632846796596


In [None]:
feature_importances = pd.DataFrame({
    'feature': stack_model.feature_name(),
    'importance': stack_model.feature_importance()
}).sort_values(by='importance', ascending=False)

print(feature_importances)

                      feature  importance
13         city_category_user       18860
12       region_category_user       16339
21         description_length       11699
7                       price       10308
31     price_to_category_mean       10271
1                        city        9031
20               title_length        8839
23     description_word_count        7076
32      price_to_category_max        6867
34                price_log_1        5279
11           category_param_1        4804
4                     param_1        4663
27    description_digit_count        4458
5                     param_2        3748
9                 region_city        3126
6                     param_3        2997
19            city_price_skew        2237
28  description_newline_count        2213
29                  price_log        2110
22           title_word_count        1983
0                      region        1846
18             city_price_max        1699
35                price_log_2     

# Second Stage: Stacking Only Param-Category Tree

In [None]:
categorical_features = stack_train_X.select_dtypes(include=['category']).columns.tolist()

train_X_stack, val_X_stack, train_y_stack, val_y_stack = train_test_split(
    stack_train_X.drop(columns=["price_log_2"]), train_y, test_size=0.2, random_state=42
)

train_X_stack = lgb.Dataset(train_X_stack, label=train_y_stack,
                            categorical_feature = categorical_features)
val_X_stack = lgb.Dataset(val_X_stack, label=val_y_stack,
                          categorical_feature = categorical_features,
                          reference=train_X_stack)

stack_model_1 = lgb.train(
    params_second_stage,
    train_set=train_X_stack,
    valid_sets=[train_X_stack, val_X_stack],
    num_boost_round=2000,
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=40)],
)

Training until validation scores don't improve for 50 rounds
[40]	training's rmse: 0.233153	valid_1's rmse: 0.237448
[80]	training's rmse: 0.224472	valid_1's rmse: 0.231602
[120]	training's rmse: 0.220567	valid_1's rmse: 0.230075
[160]	training's rmse: 0.218257	valid_1's rmse: 0.229614
[200]	training's rmse: 0.216817	valid_1's rmse: 0.229468
[240]	training's rmse: 0.216016	valid_1's rmse: 0.229435
[280]	training's rmse: 0.215354	valid_1's rmse: 0.229397
[320]	training's rmse: 0.21482	valid_1's rmse: 0.229384
[360]	training's rmse: 0.214443	valid_1's rmse: 0.229396
Early stopping, best iteration is:
[331]	training's rmse: 0.214672	valid_1's rmse: 0.229376


In [None]:
preds = stack_model_1.predict(stack_test_X.drop(columns=["price_log_2"]))
rmse = mean_squared_error(test_y, preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.22783146399739404


In [None]:
feature_importances = pd.DataFrame({
    'feature': stack_model_1.feature_name(),
    'importance': stack_model_1.feature_importance()
}).sort_values(by='importance', ascending=False)

print(feature_importances)

                      feature  importance
13         city_category_user       18340
12       region_category_user       17089
21         description_length       12216
31     price_to_category_mean       10313
1                        city        9488
7                       price        9388
20               title_length        9066
32      price_to_category_max        7416
23     description_word_count        6332
34                price_log_1        5457
11           category_param_1        5121
4                     param_1        4995
27    description_digit_count        4111
5                     param_2        3510
9                 region_city        3380
6                     param_3        2988
19            city_price_skew        2615
28  description_newline_count        2349
29                  price_log        2253
0                      region        1948
22           title_word_count        1902
18             city_price_max        1703
17            city_price_mean     

# Compare with baseline

In [None]:
baseline_train_X = train_X.copy()
baseline_test_X = test_X.copy()

train_X_baseline, val_X_baseline, train_y_baseline, val_y_baseline = train_test_split(
    baseline_train_X, train_y, test_size=0.2, random_state=42
)

train_X_baseline = lgb.Dataset(train_X_baseline, label=train_y_baseline,
                            categorical_feature = categorical_features)
val_X_baseline = lgb.Dataset(val_X_baseline, label=val_y_baseline,
                          categorical_feature = categorical_features,
                          reference=train_X_baseline)

baseline_model = lgb.train(
    params_second_stage,
    train_set=train_X_baseline,
    valid_sets=[train_X_baseline, val_X_baseline],
    num_boost_round=2000,
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=40)],
)

Training until validation scores don't improve for 50 rounds
[40]	training's rmse: 0.233185	valid_1's rmse: 0.237478
[80]	training's rmse: 0.224541	valid_1's rmse: 0.231585
[120]	training's rmse: 0.220664	valid_1's rmse: 0.230057
[160]	training's rmse: 0.218385	valid_1's rmse: 0.22963
[200]	training's rmse: 0.216965	valid_1's rmse: 0.229485
[240]	training's rmse: 0.216159	valid_1's rmse: 0.229456
[280]	training's rmse: 0.215554	valid_1's rmse: 0.229425
[320]	training's rmse: 0.215048	valid_1's rmse: 0.229418
[360]	training's rmse: 0.214657	valid_1's rmse: 0.229428
Early stopping, best iteration is:
[326]	training's rmse: 0.214918	valid_1's rmse: 0.229406


In [None]:
preds = baseline_model.predict(baseline_test_X)
rmse = mean_squared_error(test_y, preds, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.22785243334704727


In [None]:
feature_importances = pd.DataFrame({
    'feature': baseline_model.feature_name(),
    'importance': baseline_model.feature_importance()
}).sort_values(by='importance', ascending=False)

print(feature_importances)

                      feature  importance
13         city_category_user       17405
12       region_category_user       17105
7                       price       12285
21         description_length       12200
31     price_to_category_mean       10804
1                        city        9359
20               title_length        8684
32      price_to_category_max        7301
23     description_word_count        6888
4                     param_1        5426
11           category_param_1        4423
27    description_digit_count        4398
5                     param_2        3683
9                 region_city        3628
6                     param_3        2818
29                  price_log        2678
19            city_price_skew        2448
28  description_newline_count        2267
22           title_word_count        2213
0                      region        2035
18             city_price_max        1734
17            city_price_mean        1598
26          title_digit_count     