In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor

import xgboost
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import catboost

In [5]:
def show_results(preds, gt):
    """Print and return the root-mean-squared error between two arrays.

    Parameters
    ----------
    preds : array-like
        Model predictions.
    gt : array-like
        Ground-truth values, same length as ``preds``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(preds, gt)``.
    """
    mse = mean_squared_error(preds, gt)
    rmse = np.sqrt(mse)
    print('RMSE:', rmse)
    return rmse

In [6]:
# Load the training data from train.csv (path is relative to the working directory).
ds = pd.read_csv("train.csv")

In [28]:
# Maps a model name to the RMSE returned by show_results for that model.
best_scores = {}

In [20]:
def get_nan_stat_table(dataset):
    """Summarise missing values per column of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, sorted by missing count (descending), with
        columns ``'Total'`` (number of nulls) and ``'%'`` (share of rows that
        are null, in percent). Columns without any null are left out.
    """
    nan_counts = dataset.isnull().sum().sort_values(ascending=False)
    nan_share = nan_counts / len(dataset) * 100

    stats = pd.concat([nan_counts, nan_share], axis=1, keys=['Total', '%'])
    has_nan = stats['%'] > 0
    return stats.loc[has_nan]

# Missing-value summary for the categorical (object-dtype) columns.
# The columns are selected inline so this cell does not depend on a
# `cat_features` variable that is only assigned in a later cell.
get_nan_stat_table(ds.select_dtypes(include=["object"]))

Unnamed: 0,Total,%
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
GarageType,81,5.547945
GarageCond,81,5.547945
GarageQual,81,5.547945
GarageFinish,81,5.547945
BsmtFinType2,38,2.60274


In [30]:
def prepare_dataset(dataset):
    """Return a copy of ``dataset`` without ``Id`` and with missing values filled.

    * The ``Id`` column is dropped when present. A frame that has already
      been prepared (no ``Id``) is accepted, so calling the function again
      on its own output is safe.
    * Object-dtype (categorical) columns get the literal string ``'None'``
      in place of missing values.
    * Every other column gets its own mean in place of missing values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations above applied.
    """
    # errors="ignore" keeps the function re-runnable: `ds = prepare_dataset(ds)`
    # would otherwise raise KeyError the second time the cell is executed.
    ds_new = dataset.drop(columns=["Id"], errors="ignore")

    cat_features = ds_new.select_dtypes(include = ["object"]).columns
    num_features = ds_new.select_dtypes(exclude = ["object"]).columns

    ds_new[cat_features] = ds_new[cat_features].fillna('None')

    for feature in num_features:
        # The mean is computed from the column itself, before filling.
        ds_new[feature] = ds_new[feature].fillna(ds_new[feature].mean())

    return ds_new

In [22]:
# Fill missing values and drop the Id column.
ds = prepare_dataset(ds)

# Determine the categorical columns here: the `cat_features` computed inside
# prepare_dataset is local to that function, so without this line the loop
# below relies on a global left over from another cell (NameError on a
# fresh kernel).
cat_features = ds.select_dtypes(include=["object"]).columns

# Numeric-only copy of the data: every categorical column is replaced by
# integer codes from its own LabelEncoder.
ds_nocat = ds.copy()

for feature in cat_features:
    encoder = LabelEncoder()
    encoded_feature = encoder.fit_transform(ds[feature])
    ds_nocat[feature] = encoded_feature

In [23]:
# Features: the label-encoded frame without the target column.
x = ds_nocat.drop(columns=["SalePrice"]).to_numpy()
# Target: log(1 + SalePrice).
y = np.log1p(ds["SalePrice"].to_numpy())

# Hold out 10% of the rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=98987,
)

In [26]:
# Sanity check: feature matrix and target must have the same number of rows.
x.shape,y.shape

((1460, 79), (1460,))

In [27]:
# Grid for the random forest: only n_estimators and min_samples_leaf vary
# (2 x 3 = 6 candidates); the other entries are single fixed values.
parameters = {
    'criterion': ('squared_error',),
    'max_depth': (1000,),
    'max_features': (1/3,),
    'n_estimators': (100, 1000),
    'min_samples_leaf': (1, 2, 8),
}

# Seed the forest so that the selected parameters and the reported RMSE are
# reproducible across runs (same seed value as the train/test split).
rforest = RandomForestRegressor(random_state=98987)
rforest_gs = GridSearchCV(rforest, parameters, verbose=2)
# Trailing ';' suppresses the estimator repr in the cell output.
rforest_gs.fit(x_train, y_train);

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   1.1s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   0.9s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   0.9s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   1.2s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   1.2s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=1000; total time=  11.5s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf

In [29]:
# Show the winning parameter combination, then score it on the hold-out set.
print(rforest_gs.best_params_)
best_scores["RandomForest"] = show_results(
    preds=rforest_gs.predict(x_test),
    gt=y_test,
)

{'criterion': 'squared_error', 'max_depth': 1000, 'max_features': 0.3333333333333333, 'min_samples_leaf': 1, 'n_estimators': 1000}
RMSE: 0.15588079024545187


In [30]:
# Grid for XGBoost: 2 learning rates x 2 depths x 2 min_child_weight values
# x 2 ensemble sizes = 16 candidates; gamma is fixed at 0.
parameters = {
    "gamma": [0.0],
    "learning_rate": (0.001, 0.01),
    "max_depth": [2, 4],
    "min_child_weight": [1, 10],
    "n_estimators": [1000, 5000],
}

xgb = xgboost.XGBRegressor()
# 3-fold cross-validation for every candidate.
xgb_gs = GridSearchCV(estimator=xgb, param_grid=parameters, verbose=2, cv=3)
xgb_gs.fit(x_train, y_train);

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   2.0s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   1.5s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   1.5s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=  12.4s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   8.6s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   9.2s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=10, n_estimators=1000; total time=   1.5s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=10, n_estimators=1000; total time=   1.5s
[CV] END gamma=0.0, learning_rate=0.001, 

In [31]:
# Show the winning parameter combination, then score it on the hold-out set.
print(xgb_gs.best_params_)
best_scores["XGBoost"] = show_results(
    preds=xgb_gs.predict(x_test),
    gt=y_test,
)

{'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 5000}
RMSE: 0.12792625107811667


In [32]:
# Grid for LightGBM: 3 leaf counts x 3 learning rates x 2 ensemble sizes
# = 18 candidates; max_depth is fixed at -1.
parameters = {
    'num_leaves': (40, 20, 10),
    'learning_rate': (0.1, 0.01, 0.05),
    'max_depth': (-1,),
    'n_estimators': (1000, 10000),
}

lgbmr = LGBMRegressor()
lgbmr_gs = GridSearchCV(estimator=lgbmr, param_grid=parameters, verbose=2)
lgbmr_gs.fit(x_train, y_train);

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   4.8s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   3.9s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   3.8s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   4.6s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   5.4s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=20; total time=   3.3s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=20; total time=   2.7s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=20; total time=   2.2s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=20; total time=   2.4s
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=20; total ti

[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=10; total time=  13.1s
[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=10; total time=   6.7s
[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=10; total time=   6.7s
[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=10; total time=   6.8s
[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=10; total time=   7.8s


In [33]:
# Show the winning parameter combination, then score it on the hold-out set.
print(lgbmr_gs.best_params_)
best_scores["LightGBM"] = show_results(
    preds=lgbmr_gs.predict(x_test),
    gt=y_test,
)

{'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 10}
RMSE: 0.14171984107096894


In [None]:
# logloss: -(gt * log(pred) + (1 - gt) * log(1 - pred))

# e.g. gt = 1, pred = 0.95: -(1 * log(0.95)) ≈ 0.0513

# 1) wth is margin

# 2) 

In [20]:
# Reload the raw training data for CatBoost and discard the Id column.
ds = pd.read_csv("train.csv").drop(columns=["Id"])

# Object-dtype columns are the categorical features.
cat_features = ds.select_dtypes(include=["object"]).columns

# Target is log(1 + SalePrice); features are every remaining column.
y = np.log1p(ds['SalePrice'])
x = ds.drop(columns=['SalePrice'])

# Only the categorical columns are filled here; numeric NaNs stay as they are.
x[cat_features] = x[cat_features].fillna('None')

In [21]:
# Same 90/10 split and seed as used for the label-encoded data.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=98987,
)

In [22]:
# Wrap both splits in CatBoost Pools, declaring which columns are categorical.
train_pool = catboost.Pool(
    data=x_train,
    label=y_train,
    cat_features=cat_features.tolist(),
)
test_pool = catboost.Pool(
    data=x_test,
    label=y_test,
    cat_features=cat_features.tolist(),
)

In [25]:
# Candidate hyper-parameters: 2 x 3 x 3 = 18 combinations.
# NOTE(review): learning_rate 0.9 is much larger than 0.07 — confirm it is
# intended and not a typo for 0.09.
p_grid = {
    'learning_rate': [0.07, 0.9],
    'depth': [1, 2, 3],
    'l2_leaf_reg': [0.7, 1, 1.3],
}

catboost_cls = CatBoostRegressor(loss_function='RMSE', verbose=0)

# search_by_train_test_split=False: each candidate is scored by
# cross-validation on train_pool (the log shows three folds per candidate).
grid_search_results = catboost_cls.grid_search(
    p_grid,
    train_pool,
    shuffle=False,
    verbose=1,
    search_by_train_test_split=False,
)

Training on fold [0/3]

bestTest = 0.1424761025
bestIteration = 918

Training on fold [1/3]

bestTest = 0.1225894164
bestIteration = 999

Training on fold [2/3]

bestTest = 0.1422752513
bestIteration = 392

0:	loss: 0.1363115	best: 0.1363115 (0)	total: 4.09s	remaining: 1m 9s
Training on fold [0/3]

bestTest = 0.1452043185
bestIteration = 890

Training on fold [1/3]

bestTest = 0.1288349456
bestIteration = 420

Training on fold [2/3]

bestTest = 0.1584301759
bestIteration = 804

1:	loss: 0.1455148	best: 0.1363115 (0)	total: 10.2s	remaining: 1m 21s
Training on fold [0/3]

bestTest = 0.1422212522
bestIteration = 955

Training on fold [1/3]

bestTest = 0.1221175133
bestIteration = 999

Training on fold [2/3]

bestTest = 0.1427051403
bestIteration = 468

2:	loss: 0.1360531	best: 0.1360531 (2)	total: 16.6s	remaining: 1m 23s
Training on fold [0/3]

bestTest = 0.1485506428
bestIteration = 979

Training on fold [1/3]

bestTest = 0.1267461794
bestIteration = 469

Training on fold [2/3]

bestTest

In [26]:
# Score CatBoost on the hold-out pool and record the RMSE with the other
# models' scores (show_results also prints the value).
best_scores["CatBoost"] = show_results(catboost_cls.predict(test_pool), y_test)

RMSE: 0.1276739987586883


0.1276739987586883

# Kaggle 

In [35]:
# Load the Kaggle test set and apply the same preprocessing the CatBoost
# training features received: drop Id and fill only the categorical columns
# with 'None'. Numeric NaNs are left in place, as they were at training time;
# mean-imputing them here (as prepare_dataset does) would feed the model
# inputs prepared differently from the data it was fitted on.
test_ds = pd.read_csv('test.csv').drop(columns=["Id"])

cat_features = test_ds.select_dtypes(['object']).columns.tolist()
test_ds[cat_features] = test_ds[cat_features].fillna('None')

y_pred = catboost_cls.predict(test_ds)

# The model was trained on log1p(SalePrice); map predictions back to prices.
y_pred = np.expm1(y_pred)

In [36]:
# Quick look at the predicted sale prices (already converted back with expm1).
y_pred

array([120995.43352071, 156772.53467243, 180902.51969985, ...,
       168663.0707533 , 122296.12663951, 230285.94474493])

In [37]:
# Use the sample submission file as a template for the submission frame.
my_submission = pd.read_csv('sample_submission.csv')

In [38]:
# Template contents before the SalePrice column is overwritten.
my_submission

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [39]:
# Replace the template prices with the model predictions. The assignment is
# positional — presumably test.csv and sample_submission.csv list the Ids in
# the same order; verify before submitting.
my_submission['SalePrice'] = y_pred

In [40]:
# Submission frame after SalePrice has been replaced with the predictions.
my_submission

Unnamed: 0,Id,SalePrice
0,1461,120995.433521
1,1462,156772.534672
2,1463,180902.519700
3,1464,195579.875315
4,1465,198189.797487
...,...,...
1454,2915,87224.945809
1455,2916,85925.232111
1456,2917,168663.070753
1457,2918,122296.126640


In [41]:
# Write the submission file; index=False leaves the row index out of the CSV.
my_submission.to_csv('my_submission.csv',index=False)