In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from data_processing import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from data_transformar import *
import lightgbm as lgb
from sklearn.decomposition import PCA

warnings.simplefilter("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set(style="darkgrid")

In [4]:
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

In [5]:
most_missing = get_missing_values_info(data)

most_missing[most_missing.iloc[:, 1] > 25]

Unnamed: 0,Total,%
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274


In [6]:
data=data.drop(columns=['PoolQC', 'MiscFeature','Alley', 'Fence', 'FireplaceQu'], axis=1)

In [7]:
y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

print(X.shape)
X.head()

(1460, 74)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [8]:
transformer = DataTransformer(True, StandardScaler())

X = transformer.encode(X)
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [9]:
#magical seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)

In [10]:
transformer.imputer_fit(X_train)

X_train = transformer.imputer_transform(X_train)
X_test = transformer.imputer_transform(X_test)

transformer.scaler_fit(X_train)

X_train = transformer.scaler_transform(X_train)
X_test = transformer.scaler_transform(X_test)


In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

    return np.sqrt(mean_squared_error(np.log1p(y), np.log1p(y_pred)))

#rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [12]:
def evaluate(clf, use_train = False):
    to_test_X = X_train if use_train else X_test
    to_test_y = y_train if use_train else y_test 
    
    predictions = clf.predict(to_test_X)
    print("Mean Absolute log Error : " + str(rmsle(predictions, to_test_y)))

In [13]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
pca.fit(X_train)

X_train_scaled = pca.transform(X_train)
X_test_scaled = pca.transform(X_test)

print(pca.components_.shape)

(164, 270)


In [14]:
xgb_model = xgb.XGBRegressor(n_estimators = 14000)

xgb_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)

evaluate(xgb_model, True)
evaluate(xgb_model)

Mean Absolute log Error : 0.05630455984400593
Mean Absolute log Error : 0.15270693133856422


In [15]:
xgb_model = xgb.XGBRegressor(
                colsample_bytree=1.,
                eta=0.01,
                max_depth=4,
                min_child_weight=1.5,
                n_estimators=14400,                                                                  
                alpha=0.,
                reg_lambda=0.4,
                subsample=0.2)

xgb_model.fit(X_train, y_train,  early_stopping_rounds=25, eval_set=[(X_test, y_test)], verbose=False)

evaluate(xgb_model, True)
evaluate(xgb_model)

Mean Absolute log Error : 0.09014225648030438
Mean Absolute log Error : 0.14124338952339777


In [16]:
rf_model = RandomForestRegressor()

rf_model.fit(X_train, y_train)

evaluate(rf_model, True)
evaluate(rf_model)

Mean Absolute log Error : 0.052431660373142036
Mean Absolute log Error : 0.15772160370302088


In [17]:
lgb_model = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)


lgb_model.fit(X_train, y_train)

evaluate(lgb_model, True)
evaluate(lgb_model)

Mean Absolute log Error : 0.06848372802260959
Mean Absolute log Error : 0.1385378757344686


In [21]:
from catboost import CatBoostRegressor, Pool

In [22]:
org_params = {
    'iterations': 2000,
    'learning_rate': 0.08,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'loss_function': 'RMSE',
    'od_type': 'Iter',
    'od_wait': 1000,
    'one_hot_max_size': 20,
    'l2_leaf_reg': 100,
    'depth': 3,
    'rsm': 0.6,
    'random_strength': 2,
    'bagging_temperature': 10
}

train_pool = Pool(X_train, y_train)

dev_pool = Pool(X_test, y_test)

cat_model = CatBoostRegressor(**org_params)

cat_model.fit(train_pool, eval_set=dev_pool, plot=False)

evaluate(cat_model, True)
evaluate(cat_model)

Mean Absolute log Error : 0.08797558491529363
Mean Absolute log Error : 0.1421819212354911


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor

estimators = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    #('cat', cat_model),
    ('lgb', lgb_model),
    ('mlp', MLPRegressor(random_state=1, max_iter=300))
]

stack_model = StackingRegressor(estimators=estimators, final_estimator=xgb.XGBRegressor(
                colsample_bytree=1.,
                eta=0.01,
                max_depth=4,
                min_child_weight=1.5,
                n_estimators=14400,                                                                  
                alpha=0.,
                reg_lambda=0.4,
                subsample=0.2))

stack_model.fit(X_train, y_train)

In [None]:
evaluate(stack_model, True)
evaluate(stack_model)

Mean Absolute log Error : 0.11379243099597144
Mean Absolute log Error : 0.15466133812870175
