Trains a LightGBM regressor on the processed training data and writes its test-set predictions to `lgbm_pred.csv`.

In [1]:
import xgboost
from xgboost import XGBRegressor, XGBClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.inspection import permutation_importance

import seaborn as sns
import shap

import lightgbm as lgb

from utils import get_na_cols, params2, categorical_cols, one_hot_cols, target_encoding_cols, ordinal_cols, useless_cols, numeric_cols

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Processed train / test tables that the model is fitted on and scored against.
processed_test_path = 'output/processed_test.csv'
process_train_path = 'output/processed_train.csv'

# Raw test file; read separately from the processed one.
test_df_path = '../Dataset/test.csv/test.csv'

In [3]:
# Load the processed training table and separate the target from the features.
# NOTE(review): the preview of the processed test CSV shows an `Unnamed: 0`
# column, which suggests the processed files were saved with their index. If
# this file has one too, it ends up in X as a feature -- confirm.
target_col = 'price_doc'
processed_df = pd.read_csv(process_train_path)
y = processed_df[target_col]
X = processed_df.drop(columns=[target_col])

In [4]:
# Configure the LightGBM regressor from `params2` (imported from utils); the
# learning rate is stored there under the key 'eta'.
lgbm_settings = {
    'n_estimators': params2['n_estimators'],
    'learning_rate': params2['eta'],
}
model = lgb.LGBMRegressor(**lgbm_settings)

# Fit on every row of X / y; no hold-out split is made in this cell.
model.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39955
[LightGBM] [Info] Number of data points in the train set: 15266, number of used features: 285
[LightGBM] [Info] Start training from score 15.689006


In [5]:
# Processed test table, previewed at the end of the cell.
processed_test_df = pd.read_csv(processed_test_path)
# Raw test file, loaded alongside the processed one.
test_df = pd.read_csv(test_df_path)
processed_test_df.head(n=5)

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,...,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,year,month,product_type_Investment,product_type_OwnerOccupier,sub_area_te
0,39.0,20.7,2.0,9.0,0.0,1998,1,8.9,2.0,26155140.0,...,10,1,0,14,1,2015,6.0,1,0,15.587766
1,79.2,32.660037,8.0,17.0,0.0,0,3,1.0,0.0,25536300.0,...,11,0,1,12,1,2015,6.0,0,1,15.60223
2,40.5,25.1,3.0,5.0,1.0,1960,2,4.8,1.0,9946335.0,...,21,0,10,71,11,2015,6.0,1,0,15.478808
3,62.8,36.0,17.0,17.0,0.0,2016,2,62.8,2.0,21494090.0,...,10,0,0,2,0,2015,6.0,0,1,15.461166
4,40.0,40.0,17.0,17.0,0.0,0,1,1.0,0.0,25536300.0,...,12,0,1,11,1,2015,6.0,0,1,15.60223


In [7]:
# Score the processed test set and write the predictions next to their ids.
# Select the test columns by name, in the order the model was fitted on (the
# columns of X), so a reordered or extra column in the processed test CSV
# cannot silently shift features.
test_features = processed_test_df[X.columns]
pred = model.predict(test_features)

# NOTE(review): predictions are written on the same scale as the training
# target. The training log reports a start score of about 15.69, which looks
# like a log-transformed price; if so, np.expm1(pred) is needed before these
# values can be read as prices -- confirm against the preprocessing step.

# The ids come from the raw test file, so both files must have the same rows.
assert len(pred) == len(test_df), (
    f"row mismatch: {len(pred)} predictions for {len(test_df)} test ids"
)

# Build the frame first and write it afterwards: DataFrame.to_csv returns None
# when given a path, so chaining it onto the constructor left prediction_df
# bound to None.
prediction_df = pd.DataFrame({
    'id': test_df['id'],
    'price_doc': pred
})
prediction_df.to_csv('lgbm_pred.csv', index=False)