In [22]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [39]:
# Read the training table and discard its "Id" column in one chained step.
train = pd.read_csv("train.csv").drop(columns=["Id"])

# Target is "SalePrice"; every remaining column is used as a predictor.
y = train["SalePrice"]
X = train.drop(columns=["SalePrice"])

# Columns stored with object dtype are the ones handled as categorical below.
cate_cols = X.select_dtypes(include=["object"]).columns
cate_features = list(cate_cols)

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Last expression of the cell: display the detected categorical column names.
cate_features


['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [44]:
# Cast every column named in cate_features to pandas "category" dtype, using
# one astype call per frame instead of a column-by-column loop.
category_dtypes = {col: "category" for col in cate_features}
X_train = X_train.astype(category_dtypes)
X_val = X_val.astype(category_dtypes)

# Wrap both splits as LightGBM Datasets; the validation Dataset is created
# with the training Dataset as its reference.
train_data = lgb.Dataset(data=X_train, label=y_train)
validation_data = lgb.Dataset(data=X_val, label=y_val, reference=train_data)



In [63]:
# Training configuration passed to lgb.train: a regression objective,
# evaluated with the "l1" metric on each validation set.
params = dict(
    objective="regression",
    metric="l1",
)


In [66]:
# Fit the booster for at most 500 rounds, evaluating on both the training
# and the validation Dataset after each round.
bst = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, validation_data],
    valid_names=["train", "val"],
    callbacks=[
        lgb.log_evaluation(10),   # print the metrics every 10 rounds
        lgb.early_stopping(50),   # stop after 50 rounds without improvement
    ],
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3151
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 76
[LightGBM] [Info] Start training from score 181441.541952
Training until validation scores don't improve for 50 rounds
[10]	train's l1: 24711.1	val's l1: 29191.5
[20]	train's l1: 13969.6	val's l1: 19976.2
[30]	train's l1: 10582.9	val's l1: 17604.4
[40]	train's l1: 8976.55	val's l1: 16950.2
[50]	train's l1: 7843.44	val's l1: 16563.9
[60]	train's l1: 7022.42	val's l1: 16494.2
[70]	train's l1: 6377.22	val's l1: 16400.6
[80]	train's l1: 5837.3	val's l1: 16328.1
[90]	train's l1: 5349.74	val's l1: 16309.9
[100]	train's l1: 4923.84	val's l1: 16269.4
[110]	train's l1: 4532.65	val's l1: 16225.4
[120]	train's l1: 4172.46	val's l1: 16259
[130]	train's

In [72]:
# Read the test table, then move "Id" out of the feature frame into its own
# Series so it can be reattached to the predictions later.
test = pd.read_csv("test.csv")
test_ids = test.pop("Id")

In [73]:
# Cast the categorical columns of the test frame to "category" dtype.
# Names from cate_features that are absent from the test frame are skipped.
test = test.astype(
    {col: "category" for col in cate_features if col in test.columns}
)

In [74]:
# Score the test frame. The iteration count is spelled out explicitly; it is
# the same value Booster.predict falls back to when num_iteration is omitted.
predictions = bst.predict(test, num_iteration=bst.best_iteration)

In [75]:
# Pair each test Id with its predicted SalePrice and write the two-column
# table to submission.csv without the index column.
submission = test_ids.to_frame(name="Id").assign(SalePrice=predictions)
submission.to_csv("submission.csv", index=False)