In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import itertools
from sklearn.model_selection import cross_val_score, KFold
from tqdm import tqdm

# preprocessing

In [124]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
common_df = pd.concat([train_df, test_df])

In [125]:
null_df = common_df.isnull()
def fill_nan(feachure1, feachure2):
    val1, val2 = ('NA', None) if common_df[feachure1].dtype == "object" else (0, np.nan) 
    common_df[feachure1] = np.where(null_df[feachure1], np.where(common_df[feachure2] ==0, val1, val2), common_df[feachure1])
fill_nan('FireplaceQu', 'Fireplaces')
fill_nan('MiscFeature', 'MiscVal')
fill_nan('PoolQC', 'PoolArea')

common_df.fillna({'GarageCars':0}, inplace=True)
for feachure in ['GarageArea', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt']:
    fill_nan(feachure, 'GarageCars')

common_df.fillna({'TotalBsmtSF': 0}, inplace=True)
for feature in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF']:
    fill_nan(feature, 'TotalBsmtSF')

In [126]:
def cat_cont_coloumns(df):
    cat_numerous = ['MSSubClass'] # 'MSSubClass', MoSold?
    cat_coloumns = [cname for cname in df.columns if df[cname].dtype == "object"] + cat_numerous
    cont_coloumns = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64'] and not cname in cat_numerous]
    return cat_coloumns, cont_coloumns

cat_coloumns, cont_coloumns = cat_cont_coloumns(common_df)

In [127]:
null_df = common_df.isnull()
null_val = null_df.sum()
features_to_fill = np.array(null_val[null_val > 0].sort_values().keys())
features_to_fill = np.delete(features_to_fill, -3)
features_to_fill

array(['Electrical', 'KitchenQual', 'SaleType', 'BsmtFinType2',
       'GarageYrBlt', 'GarageQual', 'GarageFinish', 'Exterior2nd',
       'Exterior1st', 'MiscFeature', 'GarageCond', 'BsmtQual',
       'Utilities', 'BsmtFullBath', 'BsmtHalfBath', 'Functional',
       'BsmtExposure', 'PoolQC', 'BsmtCond', 'MSZoning', 'MasVnrArea',
       'MasVnrType', 'LotFrontage', 'Fence', 'Alley'], dtype=object)

In [128]:
def simple_fill_nan(df):
    values = {}
    for a in cat_coloumns:
        values[a] = -1

    for a in cont_coloumns:
        values[a] = df[a].median()
        
    return df.fillna(value=values, inplace=True)

In [129]:
simple_fill_nan(common_df)
common_df['SalePrice'] = np.log1p(common_df['SalePrice'])

In [130]:
import category_encoders as ce
def target_encoding(df):
    encoder = ce.JamesSteinEncoder(return_df=True)
    copy = df.copy(deep=True)
    copy[cat_coloumns] = encoder.fit_transform(copy[cat_coloumns], copy['SalePrice'])
    return copy

In [131]:
def split_x_y(df, y_f, drop_fs):
        y = df[y_f].to_numpy()
        encoded = target_encoding(df)
        X = encoded.drop([y_f] + drop_fs, axis=1)
        return X.to_numpy(float), y

In [132]:
from sklearn.linear_model import LinearRegression, LogisticRegression
def regression_fill_nan(features):
    for i in range(10):
        for feature in features:
            X, y = split_x_y(common_df, feature, ['Id', 'SalePrice'])
            model = LogisticRegression() if feature in cat_coloumns else LinearRegression()
            mask = null_df[feature].to_numpy()
            model.fit(X[mask == False], y[mask == False])
            init_y = common_df[feature].to_numpy()
            preds = model.predict(X[mask])
            init_y[np.where(mask)] = preds
            common_df[feature] = init_y

In [None]:
regression_fill_nan(features_to_fill)

In [136]:
encoded_common_df = target_encoding(common_df)
encoded_train_df = encoded_common_df.iloc[:1460, :]
encoded_test_df = encoded_common_df.iloc[1460:, :]

In [137]:
        
train, test = train_test_split(encoded_train_df, train_size=0.8, random_state=98987)
X_train, y_train, X_test, y_test = *split_x_y(train, 'SalePrice', ['Id']), *split_x_y(test, 'SalePrice', ['Id'])
X_shuffle, y_shuffle = shuffle(X_train, y_train, random_state=42)



In [107]:
def show(model):
    plt.figure().set_size_inches(10, 5)
    plt.subplot(1, 2, 1)
    _ = plt.plot(model.predict(X_train) - y_train)
    plt.subplot(1, 2, 2)
    _ = plt.plot(model.predict(X_test) - y_test)

# xgboost

In [154]:
n_estimators = [200, 300, 400]
max_depth = [10, 30, None]
learning_rate = [0.5, 0.1, 0.05]
hp = list(itertools.product(n_estimators, max_depth, learning_rate))

In [155]:
best_hp = None
best_score = None
for n, d, l in tqdm(hp):
       model_xgb = xgb.XGBRegressor(learning_rate=l, max_depth=d, n_estimators=n)
       acc_scores = cross_val_score(model_xgb, X_shuffle, y_shuffle,
                         scoring='neg_mean_squared_error',
                         cv=KFold())
       score = acc_scores.mean()
       if best_score is None or best_score < score:
              best_hp = (n, d, l)
              best_score = score

100%|██████████| 27/27 [01:00<00:00,  2.25s/it]


In [156]:
print(best_hp, np.sqrt(-best_score))

(400, None, 0.1) 0.1269399987032027


In [50]:
model_xgb = xgb.XGBRegressor(learning_rate=l, max_depth=d, n_estimators=n)
model_xgb.fit(X_shuffle, y_shuffle)
np.sqrt(mean_squared_error(y_test, model_xgb.predict(X_test)))

0.139483177101669

In [114]:
y0 = np.mean(y_train)
def MAE(pred, y):
    return np.mean(np.abs(pred - y))
def R2(pred, y):
    return 1 - np.sum((pred - y)**2) / np.sum((y0 - y)**2)

In [116]:
R2(model_xgb.predict(X_train), y_train), R2(model_xgb.predict(X_test), y_test)

(0.9978157610873639, 0.8847395521748376)

In [138]:
n_estimators = [200, 300, 400]
max_depth = [10, 30, None]
learning_rate = [0.5, 0.1, 0.05]
hp = list(itertools.product(n_estimators, max_depth, learning_rate))

In [139]:
best_hp = None
best_score = None
for n, d, l in tqdm(hp):
       model_xgb = xgb.XGBRegressor(learning_rate=l, max_depth=d, n_estimators=n)
       acc_scores = cross_val_score(model_xgb, X_shuffle, y_shuffle,
                         scoring='neg_mean_squared_error',
                         cv=KFold())
       score = acc_scores.mean()
       if best_score is None or best_score < score:
              best_hp = (n, d, l)
              best_score = score

100%|██████████| 27/27 [00:50<00:00,  1.87s/it]


In [140]:
print(best_hp, np.sqrt(-best_score))

(400, None, 0.05) 0.12692192896017246


# catboost

In [55]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor()
cat_model.fit(X_train, y_train, verbose=200)
np.sqrt(mean_squared_error(y_test, cat_model.predict(X_test)))

Learning rate set to 0.04196
0:	learn: 0.3876604	total: 5.5ms	remaining: 5.49s
200:	learn: 0.0869451	total: 650ms	remaining: 2.58s
400:	learn: 0.0633410	total: 1.41s	remaining: 2.1s
600:	learn: 0.0489216	total: 2.22s	remaining: 1.48s
800:	learn: 0.0388365	total: 2.83s	remaining: 702ms
999:	learn: 0.0311821	total: 3.55s	remaining: 0us


0.13525637150836145

In [130]:
boosting_type = ['Plain']
grow_policy = ['SymmetricTree', 'Depthwise']
iterations = [1000]
learning_rate = [0.05]
depth = [6]
hp = list(itertools.product(boosting_type, grow_policy, iterations, learning_rate, depth))

In [None]:
best_hp = None
best_score = None
for t, p, it, l, d in tqdm(hp):
       model_cat = CatBoostRegressor(boosting_type=t, grow_policy=p, iterations=it, learning_rate=l, depth=d)
       acc_scores = cross_val_score(model_cat, X_train, y_train,
                         scoring='neg_mean_squared_error',
                         cv=KFold(), fit_params={'verbose': 1000})
       score = np.sqrt(-acc_scores.mean())
       if best_score is None or best_score > score:
              best_hp = (t, p, it, l, d)
              best_score = score

In [135]:
print(best_hp, best_score)

('Plain', 'SymmetricTree', 1000, 0.05, 6) 0.1180101655093795


In [136]:
t, p, it, l, d = best_hp
cat_model = CatBoostRegressor(boosting_type=t, grow_policy=p, iterations=it, learning_rate=l, depth=d)
cat_model.fit(X_train, y_train, verbose=200)
np.sqrt(mean_squared_error(y_test, cat_model.predict(X_test)))

0:	learn: 0.3855221	total: 20.2ms	remaining: 20.2s
200:	learn: 0.0814176	total: 499ms	remaining: 1.99s
400:	learn: 0.0576689	total: 997ms	remaining: 1.49s
600:	learn: 0.0421539	total: 1.46s	remaining: 970ms
800:	learn: 0.0322691	total: 2s	remaining: 498ms
999:	learn: 0.0246755	total: 2.51s	remaining: 0us


(0.9961735913689298, 0.8854716447139432, 0.13570237565084797)

In [56]:
train1, test1 = train_test_split(common_df.iloc[:1460], train_size=0.8, random_state=98987)
x1, y1, x2, y2 = *split_x_y(train1), *split_x_y(test1)

In [59]:
cat_ind = np.array([common_df.columns.get_loc(i) for i in cat_coloumns]) - 1
cat_model = CatBoostRegressor(cat_features=cat_ind, boosting_type='Plain', learning_rate=0.05, depth=6)
cat_model.fit(x1, y1, verbose=200)
np.sqrt(mean_squared_error(y2, cat_model.predict(x2)))

0:	learn: 0.3856044	total: 148ms	remaining: 2m 28s
200:	learn: 0.0902794	total: 18.6s	remaining: 1m 14s
400:	learn: 0.0701437	total: 37.7s	remaining: 56.4s
600:	learn: 0.0561539	total: 58.8s	remaining: 39s
800:	learn: 0.0477540	total: 1m 24s	remaining: 21.1s
999:	learn: 0.0405829	total: 1m 47s	remaining: 0us


0.1455245226782871

In [60]:
cat_model = CatBoostRegressor(boosting_type='Ordered', grow_policy='SymmetricTree', iterations=1000, learning_rate=None, depth=6)
cat_model.fit(X_train, y_train, verbose=200)
np.sqrt(mean_squared_error(cat_model.predict(X_test), y_test))

Learning rate set to 0.04196
0:	learn: 0.3877030	total: 26.5ms	remaining: 26.4s
200:	learn: 0.0986407	total: 2.34s	remaining: 9.3s
400:	learn: 0.0825530	total: 4.97s	remaining: 7.42s
600:	learn: 0.0730927	total: 7.75s	remaining: 5.15s
800:	learn: 0.0666231	total: 10.3s	remaining: 2.56s
999:	learn: 0.0617593	total: 12.8s	remaining: 0us


0.13188083610174706

# lightgbm

In [61]:
from lightgbm import LGBMRegressor

In [75]:
boosting_type = ['gbdt', 'dart', 'goss']
best_hp = None
best_score = None
for b in boosting_type:
    model = LGBMRegressor(boosting_type=b, n_estimators=400, max_depth=10, num_leaves=512)
    acc_scores = cross_val_score(model, X_train, y_train,
                         scoring='neg_mean_squared_error',
                         cv=KFold())
    score = np.sqrt(-acc_scores.mean())
    if best_score is None or best_score > score:
        best_hp = b
        best_score = score

In [77]:
best_hp, best_score

('gbdt', 0.13133688472966787)

In [80]:
lightgbm_model = LGBMRegressor(boosting_type=best_hp, n_estimators=400, max_depth=6, num_leaves=512)
lightgbm_model.fit(X_train, y_train)
np.sqrt(mean_squared_error(y_test, lightgbm_model.predict(X_test)))

0.15130826214667545

In [None]:
R2(cat_model.predict(X_train), y_train), R2(cat_model.predict(X_test), y_test), np.sqrt(mean_squared_error(y_test, cat_model.predict(X_test)))

# sklearn

In [201]:
rfr = RandomForestRegressor(340, max_depth=40)
rfr.fit(X_train, y_train)

In [202]:
R2(rfr.predict(X_train), y_train)

0.9823486739897223

In [67]:
MAE(rfr.predict(X_train), y_train), R(rfr.predict(X_train), y_train)

(0.03482047418765675, 0.8880763807318854)

In [69]:
MAE(rfr.predict(X_test), y_test), R(rfr.predict(X_test), y_test)

(0.10298545597688918, 0.6626250959319033)

In [47]:
from sklearn.linear_model import LinearRegression
m = LogisticRegression()
m.fit(np.array([[1.0, 2.0], [3.0, 4.0]]), np.array(["jhbj", "gfdcvbn"], dtype=object))
m.predict(np.array([[5.0, 6.0]]))

array(['gfdcvbn'], dtype=object)