In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Mount Google Drive at /content/drive (Colab-only import, so it lives in its own cell).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Both CSVs hold every engineered feature (plus the target); columns that are
# not useful to the models are dropped later, before training.
data_dir_path = '/content/drive/MyDrive/ML Project/data/'
train_X_featured, test_X_featured = map(
    pd.read_csv,
    (
        data_dir_path + 'train_X_y_features_v1.csv',
        data_dir_path + 'test_X_y_features_v1.csv',
    ),
)


In [None]:
def fit_lgb(
        train_X, train_y, test_X, test_y, categorical_features,
        params=None,
        num_boost_round=1000,
        early_stopping_rounds=50,
        eval_period=20,
    ):
    """Train a LightGBM regressor with early stopping and print the test RMSE.

    20% of the training rows are held out (fixed seed 42) as a validation set
    for the early-stopping callback; the fitted model is then scored on the
    separate test set.

    Args:
        train_X: pandas DataFrame of training features. Must contain every
            column named in ``categorical_features``.
        train_y: training target, aligned row-for-row with ``train_X``.
        test_X: pandas DataFrame of test features with the same columns as
            ``train_X``.
        test_y: test target, aligned row-for-row with ``test_X``.
        categorical_features: names of the columns to treat as categorical.
        params: LightGBM parameter dict. When ``None`` a default RMSE
            regression configuration is used.
        num_boost_round: maximum number of boosting iterations.
        early_stopping_rounds: patience passed to ``lgb.early_stopping``.
        eval_period: log the evaluation metric every this many iterations.

    Returns:
        The model object returned by ``lgb.train``.

    Note:
        The input frames are NOT modified; categorical casting is done on
        copies so the function can be re-run safely from a notebook cell.
    """
    if params is None:
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'learning_rate': 0.02,
            'num_leaves': 512,
            'max_depth': 20,
            'bagging_seed': 42,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'feature_fraction': 0.8,
            'lambda_l1': 2.25,
            'lambda_l2': 0.015,
            'verbose': -1
        }

    categorical_features = list(categorical_features)

    # Cast categorical columns on new frames: `astype` returns a new DataFrame,
    # so the caller's train_X / test_X keep their original dtypes.
    train_X = train_X.astype({col: 'category' for col in categorical_features})
    # Reuse the training category sets for the test frame so both share one
    # encoding; test values never seen in training become NaN.
    test_X = test_X.astype(
        {col: train_X[col].dtype for col in categorical_features}
    )

    # validation set for early stopping
    train_X_split, val_X_split, train_y_split, val_y_split = train_test_split(
        train_X, train_y, test_size=0.2, random_state=42, shuffle=True
    )
    lgb_train = lgb.Dataset(
        train_X_split, label=train_y_split, categorical_feature=categorical_features
    )
    lgb_val = lgb.Dataset(
        val_X_split, label=val_y_split, categorical_feature=categorical_features,
        reference=lgb_train
    )

    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        lgb.log_evaluation(period=eval_period)
    ]

    # Both sets are evaluated so the log shows training and validation RMSE
    # side by side.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=[lgb_train, lgb_val],
        callbacks=callbacks
    )

    preds = model.predict(test_X, num_iteration=model.best_iteration)
    # `squared=False` is deprecated/removed in recent scikit-learn releases;
    # taking the square root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))
    print(f"RMSE: {rmse}")

    return model

In [None]:
# List every column of the loaded training frame (engineered features, raw fields and target).
train_X_featured.columns

Index(['region', 'city', 'parent_category_name', 'category_name', 'param_1',
       'param_2', 'param_3', 'price', 'user_type', 'title', 'description',
       'region_city', 'all_category', 'category_param_1',
       'region_category_user', 'city_category_user', 'category_price_mean',
       'category_price_std', 'category_price_skew', 'city_price_mean',
       'city_price_max', 'city_price_skew', 'title_length',
       'description_length', 'title_word_count', 'description_word_count',
       'title_has_keyword', 'description_has_keyword', 'title_digit_count',
       'description_digit_count', 'description_newline_count', 'price_log',
       'price_bin', 'price_to_category_mean', 'price_to_category_max',
       'description_missing', 'item_id', 'user_id', 'item_seq_number',
       'activation_date', 'image', 'image_top_1', 'deal_probability'],
      dtype='object')

In [None]:
# Columns kept out of the model input: raw text, identifiers, the activation
# date, image-related fields and the target itself.
non_feature_cols = [
    'title', 'description', 'item_id', 'user_id', 'item_seq_number',
    'activation_date', 'image', 'image_top_1', 'deal_probability',
]
train_X_features_intomodel = train_X_featured.drop(columns=non_feature_cols)
test_X_features_intomodel = test_X_featured.drop(columns=non_feature_cols)
train_X_features_intomodel

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,user_type,region_city,...,title_has_keyword,description_has_keyword,title_digit_count,description_digit_count,description_newline_count,price_log,price_bin,price_to_category_mean,price_to_category_max,description_missing
0,Омская область,Омск,Бытовая электроника,Настольные компьютеры,,,,50.0,Private,Омская область_Омск,...,True,True,0,0,0,3.931826,0,0.003888,3.333333e-05,0
1,Башкортостан,Уфа,Хобби и отдых,Спорт и отдых,Зимние виды спорта,,,500.0,Private,Башкортостан_Уфа,...,True,True,4,4,0,6.216606,1,0.044637,7.142296e-06,0
2,Нижегородская область,Дзержинск,Для дома и дачи,Растения,,,,80.0,Private,Нижегородская область_Дзержинск,...,False,False,0,15,11,4.394449,0,0.034354,7.996002e-05,0
3,Челябинская область,Миасс,Личные вещи,Детская одежда и обувь,Для девочек,Верхняя одежда,98-104 см (2-4 года),1500.0,Private,Челябинская область_Миасс,...,False,True,2,2,0,7.313887,2,0.190373,4.990013e-06,0
4,Башкортостан,Уфа,Для дома и дачи,Бытовая техника,Для кухни,Плиты,,18990.0,Shop,Башкортостан_Уфа,...,False,False,0,47,90,9.851720,4,2.354832,3.796861e-03,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202734,Ростовская область,Ростов-на-Дону,Услуги,Предложение услуг,"Ремонт, строительство",Сантехника,,,Private,Ростовская область_Ростов-на-Дону,...,False,False,0,0,0,,-1,,,0
1202735,Тульская область,Тула,Недвижимость,Квартиры,Сдам,На длительный срок,2,13500.0,Company,Тульская область_Тула,...,False,False,6,6,0,9.510519,4,0.006762,8.365307e-05,0
1202736,Белгородская область,Алексеевка,Услуги,Предложение услуг,Другое,,,700.0,Private,Белгородская область_Алексеевка,...,False,False,0,4,0,6.552508,1,0.045174,7.000000e-06,0
1202737,Челябинская область,Челябинск,Личные вещи,Товары для детей и игрушки,Детские коляски,,,9000.0,Private,Челябинская область_Челябинск,...,False,True,0,2,0,9.105091,3,2.100935,2.571282e-04,0


In [None]:
# final tabular features to be put into models
# Final tabular feature columns that go into the models, after the
# text / id / date / image / target columns have been dropped.
train_X_features_intomodel.columns

Index(['region', 'city', 'parent_category_name', 'category_name', 'param_1',
       'param_2', 'param_3', 'price', 'user_type', 'region_city',
       'all_category', 'category_param_1', 'region_category_user',
       'city_category_user', 'category_price_mean', 'category_price_std',
       'category_price_skew', 'city_price_mean', 'city_price_max',
       'city_price_skew', 'title_length', 'description_length',
       'title_word_count', 'description_word_count', 'title_has_keyword',
       'description_has_keyword', 'title_digit_count',
       'description_digit_count', 'description_newline_count', 'price_log',
       'price_bin', 'price_to_category_mean', 'price_to_category_max',
       'description_missing'],
      dtype='object')

In [None]:
# The regression target is stored alongside the features in both CSVs.
target_col = 'deal_probability'
train_y, test_y = train_X_featured[target_col], test_X_featured[target_col]
train_y

Unnamed: 0,deal_probability
0,0.76786
1,0.00000
2,0.12311
3,0.80323
4,0.00000
...,...
1202734,0.20000
1202735,0.03703
1202736,0.20000
1202737,0.80323


In [None]:
# Columns handed to LightGBM as categorical features.
categorical_features = [
    # location
    'region',
    'city',
    # category hierarchy and listing parameters
    'parent_category_name',
    'category_name',
    'param_1',
    'param_2',
    'param_3',
    # seller type
    'user_type',
    # combined (crossed) categorical features
    'region_city',
    'all_category',
    'category_param_1',
    'region_category_user',
    'city_category_user',
    # boolean keyword flags
    'title_has_keyword',
    'description_has_keyword',
    # discretised price and missing-description indicator
    'price_bin',
    'description_missing',
]


In [None]:
# Sanity check: how many columns will be treated as categorical.
len(categorical_features)

17

In [None]:
# Train with the default hyper-parameters; the call logs train/validation RMSE
# while fitting and prints the final test RMSE.
model = fit_lgb(
    train_X=train_X_features_intomodel,
    train_y=train_y,
    test_X=test_X_features_intomodel,
    test_y=test_y,
    categorical_features=categorical_features,
)


Training until validation scores don't improve for 50 rounds
[20]	training's rmse: 0.242532	valid_1's rmse: 0.245367
[40]	training's rmse: 0.233158	valid_1's rmse: 0.237491
[60]	training's rmse: 0.227834	valid_1's rmse: 0.233595
[80]	training's rmse: 0.224503	valid_1's rmse: 0.2316
[100]	training's rmse: 0.22222	valid_1's rmse: 0.230548
[120]	training's rmse: 0.220553	valid_1's rmse: 0.230024
[140]	training's rmse: 0.219252	valid_1's rmse: 0.229705
[160]	training's rmse: 0.218298	valid_1's rmse: 0.229566
[180]	training's rmse: 0.21744	valid_1's rmse: 0.229453
[200]	training's rmse: 0.216836	valid_1's rmse: 0.229417
[220]	training's rmse: 0.216417	valid_1's rmse: 0.229405
[240]	training's rmse: 0.216035	valid_1's rmse: 0.229391
[260]	training's rmse: 0.2157	valid_1's rmse: 0.229367
[280]	training's rmse: 0.215416	valid_1's rmse: 0.229356
[300]	training's rmse: 0.215201	valid_1's rmse: 0.22936
[320]	training's rmse: 0.214943	valid_1's rmse: 0.229353
[340]	training's rmse: 0.21473	valid_1

In [None]:
# Pair each feature name with the importance reported by the model, then rank
# the features from most to least important.
importance_columns = {
    'feature': model.feature_name(),
    'importance': model.feature_importance(),
}
feature_importances = pd.DataFrame(importance_columns).sort_values(
    by='importance', ascending=False
)

print(feature_importances)

                      feature  importance
13         city_category_user       17419
12       region_category_user       17186
21         description_length       12089
7                       price       11817
31     price_to_category_mean       10862
1                        city        9171
20               title_length        8804
32      price_to_category_max        7111
23     description_word_count        7006
4                     param_1        5421
11           category_param_1        4529
27    description_digit_count        4316
5                     param_2        3575
9                 region_city        3546
6                     param_3        2789
29                  price_log        2528
19            city_price_skew        2441
28  description_newline_count        2370
22           title_word_count        2170
0                      region        2033
18             city_price_max        1760
17            city_price_mean        1586
26          title_digit_count     