In [31]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from data_processing import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from data_transformar import *
import lightgbm as lgb

warnings.simplefilter("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set(style="darkgrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

In [19]:
most_missing = get_missing_values_info(data)

most_missing[most_missing.iloc[:, 1] > 25]

Unnamed: 0,Total,%
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274


In [20]:
data=data.drop(columns=['PoolQC', 'MiscFeature','Alley', 'Fence', 'FireplaceQu'], axis=1)

In [21]:
y = data["SalePrice"]
X = data.drop(columns=["SalePrice"])

print(X.shape)
X.head()

(1460, 74)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [22]:
transformer = DataTransformer(True, StandardScaler())

X = transformer.encode(X)
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [23]:
#magical seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)

In [24]:
transformer.imputer_fit(X_train)

X_train = transformer.imputer_transform(X_train)
X_test = transformer.imputer_transform(X_test)

transformer.scaler_fit(X_train)

X_train = transformer.scaler_transform(X_train)
X_test = transformer.scaler_transform(X_test)


In [25]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

def rmsle(y, y_pred):
    #return np.sqrt(mean_squared_error(y, y_pred))

    return np.sqrt(mean_squared_error(np.log1p(y), np.log1p(y_pred)))

#rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [26]:
def evaluate(clf): 
    predictions = clf.predict(X_test)
    print("Mean Absolute log Error : " + str(rmsle(predictions, y_test)))

In [557]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
pca.fit(X_train)

X_train_scaled = pca.transform(X_train)
X_test_scaled = pca.transform(X_test)

print(pca.components_.shape)

(176, 288)


In [27]:
model = xgb.XGBRegressor(n_estimators = 14000)

model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)

evaluate(model)

Mean Absolute log Error : 0.1600449537131745


In [28]:
model = xgb.XGBRegressor(
                colsample_bytree=1.,
                eta=0.01,
                max_depth=4,
                min_child_weight=1.5,
                n_estimators=14400,                                                                  
                alpha=0.,
                reg_lambda=0.4,
                subsample=0.2)

model.fit(X_train, y_train,  early_stopping_rounds=25, eval_set=[(X_test, y_test)], verbose=False)

evaluate(model)

Mean Absolute log Error : 0.15288768234400057


In [29]:
model = RandomForestRegressor()

model.fit(X_train, y_train)
evaluate(model)

Mean Absolute log Error : 0.16261177610298136


In [32]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)


model_lgb.fit(X_train, y_train)
evaluate(model_lgb)

Mean Absolute log Error : 0.14435421676119137


In [39]:
from catboost import CatBoostRegressor, Pool

In [51]:
model = CatBoostRegressor(iterations=1000, 
                          depth=10, 
                          learning_rate=0.25, 
                          loss_function='RMSE')

model.fit(X_train, y_train)
evaluate(model)


0:	learn: 67878.0555098	total: 35.6ms	remaining: 35.6s
1:	learn: 58969.0840194	total: 71.2ms	remaining: 35.5s
2:	learn: 50772.5326083	total: 106ms	remaining: 35.3s
3:	learn: 45373.1067033	total: 139ms	remaining: 34.6s
4:	learn: 40311.9598041	total: 185ms	remaining: 36.8s
5:	learn: 36174.1716827	total: 220ms	remaining: 36.5s
6:	learn: 32849.3488549	total: 258ms	remaining: 36.6s
7:	learn: 29800.1014748	total: 302ms	remaining: 37.5s
8:	learn: 27669.4833686	total: 341ms	remaining: 37.6s
9:	learn: 25895.5975093	total: 374ms	remaining: 37s
10:	learn: 24301.9850510	total: 407ms	remaining: 36.6s
11:	learn: 23083.7637001	total: 442ms	remaining: 36.4s
12:	learn: 22028.0488679	total: 477ms	remaining: 36.3s
13:	learn: 21110.6060374	total: 525ms	remaining: 37s
14:	learn: 20364.9045427	total: 570ms	remaining: 37.4s
15:	learn: 19595.1552863	total: 613ms	remaining: 37.7s
16:	learn: 18925.8577912	total: 659ms	remaining: 38.1s
17:	learn: 18288.8544668	total: 704ms	remaining: 38.4s
18:	learn: 17617.69611