In [22]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from data_processing import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from data_transformar import *
import lightgbm as lgb
from sklearn.decomposition import PCA

warnings.simplefilter("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set(style="darkgrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

In [24]:
most_missing = get_missing_values_info(data)

most_missing[most_missing.iloc[:, 1] > 25]

Unnamed: 0,Total,%
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274


In [25]:
data=data.drop(columns=['PoolQC', 'MiscFeature','Alley', 'Fence', 'FireplaceQu'], axis=1)

In [26]:
y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

print(X.shape)
X.head()

(1460, 74)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [27]:
#magical seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)

transformer = DataTransformer(True, StandardScaler())

In [28]:
n_cols = X_train.select_dtypes([np.number]).columns
o_cols = X_train.select_dtypes([object]).columns

In [29]:
transformer.imputer_fit(X_train[n_cols])

X_train[n_cols] = transformer.imputer_transform(X_train[n_cols])
X_test[n_cols] = transformer.imputer_transform(X_test[n_cols])

transformer.scaler_fit(X_train[n_cols])

X_train[n_cols] = transformer.scaler_transform(X_train[n_cols])
X_test[n_cols] = transformer.scaler_transform(X_test[n_cols])

In [30]:
transformer.fit_encoder(X_train)

X_train = transformer.encode(X_train)
X_test = transformer.encode(X_test)

(1168, 36) (1168, 242)
(292, 36) (292, 242)


In [31]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

    return np.sqrt(mean_squared_error(np.log1p(y), np.log1p(y_pred)))

#rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [46]:
def evaluate(clf, use_train = False):
    to_test_X = X_train if use_train else X_test
    to_test_y = y_train if use_train else y_test 
    
    predictions = clf.predict(to_test_X)
    print("Mean Absolute log Error : " + str(rmsle(predictions, to_test_y)))

def evaluate_data(clf, to_test_X, to_test_y):
    predictions = clf.predict(to_test_X)
    print("Mean Absolute log Error : " + str(rmsle(predictions, to_test_y)))

In [47]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
pca.fit(X_train)

X_train_scaled = pca.transform(X_train)
X_test_scaled = pca.transform(X_test)

print(pca.components_.shape)

(73, 278)


In [49]:
X_train_num = X_train.select_dtypes([np.number])
X_test_num = X_test.select_dtypes([np.number])

In [34]:
xgb_model = xgb.XGBRegressor(n_estimators = 14000)

xgb_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)

evaluate(xgb_model, True)
evaluate(xgb_model)

Mean Absolute log Error : 0.05959184411498758
Mean Absolute log Error : 0.15335849098721704


In [50]:
xgb_model = xgb.XGBRegressor(n_estimators = 14000)

xgb_model.fit(X_train_num, y_train, early_stopping_rounds=5, eval_set=[(X_test_num, y_test)], verbose=False)

evaluate_data(xgb_model, X_test_num, y_test)

Mean Absolute log Error : 0.15335849098721704


In [35]:
xgb_model = xgb.XGBRegressor(
                colsample_bytree=1.,
                eta=0.01,
                max_depth=4,
                min_child_weight=1.5,
                n_estimators=14400,                                                                  
                alpha=0.,
                reg_lambda=0.4,
                subsample=0.2)

xgb_model.fit(X_train, y_train,  early_stopping_rounds=25, eval_set=[(X_test, y_test)], verbose=False)

evaluate(xgb_model, True)
evaluate(xgb_model)

Mean Absolute log Error : 0.09014225648030438
Mean Absolute log Error : 0.14124338952339777


In [36]:
rf_model = RandomForestRegressor()

rf_model.fit(X_train, y_train)

evaluate(rf_model, True)
evaluate(rf_model)

Mean Absolute log Error : 0.0530730028814864
Mean Absolute log Error : 0.15817312435880065


In [37]:
lgb_model = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)


lgb_model.fit(X_train, y_train)

evaluate(lgb_model, True)
evaluate(lgb_model)

Mean Absolute log Error : 0.06882442514511751
Mean Absolute log Error : 0.13838590761389413


In [38]:
from catboost import CatBoostRegressor, Pool

In [39]:
org_params = {
    'iterations': 2000,
    'learning_rate': 0.08,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'loss_function': 'RMSE',
    'od_type': 'Iter',
    'od_wait': 1000,
    'one_hot_max_size': 20,
    'l2_leaf_reg': 100,
    'depth': 3,
    'rsm': 0.6,
    'random_strength': 2,
    'bagging_temperature': 10
}

train_pool = Pool(X_train, y_train)

dev_pool = Pool(X_test, y_test)

cat_model = CatBoostRegressor(**org_params)

cat_model.fit(train_pool, eval_set=dev_pool, plot=False)

evaluate(cat_model, True)
evaluate(cat_model)

Mean Absolute log Error : 0.08856242718418833
Mean Absolute log Error : 0.1424408622777382


In [40]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor

estimators = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    #('cat', cat_model),
    ('lgb', lgb_model),
    ('mlp', MLPRegressor(random_state=1, max_iter=300))
]

stack_model = StackingRegressor(estimators=estimators, final_estimator=xgb.XGBRegressor(
                colsample_bytree=1.,
                eta=0.01,
                max_depth=4,
                min_child_weight=1.5,
                n_estimators=14400,                                                                  
                alpha=0.,
                reg_lambda=0.4,
                subsample=0.2))

stack_model.fit(X_train, y_train)



StackingRegressor(estimators=[('xgb',
                               XGBRegressor(alpha=0.0, base_score=0.5,
                                            booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1.0,
                                            enable_categorical=False, eta=0.01,
                                            gamma=0, gpu_id=-1,
                                            importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.00999999978,
                                            max_delta_step=0, max_depth=4,
                                            min_child_weight=1.5, missing=nan,
                                            monotone...
                                               interaction_constraint

In [41]:
evaluate(stack_model, True)
evaluate(stack_model)

Mean Absolute log Error : 0.08826313757042208
Mean Absolute log Error : 0.15660270703001852


In [44]:
def blended_predictions(X):
    return ((0.2 * stack_model.predict(X)) + \
            (0.2 * lgb_model.predict(X)) + \
            (0.2 * xgb_model.predict(X)) + \
            (0.2 * cat_model.predict(X)) + \
            (0.2 * rf_model.predict(X)))

In [45]:
blended_score = rmsle(blended_predictions(X_test), y_test)
blended_score

0.1392073393290371