In [50]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from data_processing import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from data_transformar import *
import lightgbm as lgb
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor, Pool

warnings.simplefilter("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set(style="darkgrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
from sklearn.metrics import mean_squared_error

cheat = pd.read_csv("./result-with-best.csv")

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cheat_score(model, val):
    print("RMSLE sub: " + str(rmsle(model.predict(val), np.log1p(cheat["SalePrice"]))))

def to_categorical(X):
    for c in X.columns:
        col_type = X[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            X[c] = X[c].astype('category')

def evaluate(model, X, y):
    preds = model.predict(X)
    print("RMSLE: " + str(rmsle(preds, y)))

In [63]:
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

data = remove_outliers(data)

y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)
X_train_orig = X_train.copy()

transformer = DataTransformer(StandardScaler())

X_train = transformer.prepare(X_train)
X_test = transformer.prepare(X_test)

transformer.fit(X_train)

X_train = transformer.transform(X_train, False)
X_test = transformer.transform(X_test, False)

Int64Index([249, 313, 335, 523, 706, 1298], dtype='int64')


In [64]:
cat_features = np.where(X_train.loc[:, X_train.columns.values].dtypes == "object")[0]
X_train.iloc[cat_features]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalSF,Total_sqr_footage,Total_porch_sf
1410,60.0,RL,79.0,12420.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,6,2009,WD,Normal,2784.0,2506.0,45.0
723,50.0,RL,60.0,8172.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,5,2008,WD,Normal,2411.0,1470.0,156.0
1371,80.0,RL,80.0,9600.0,Pave,,Reg,Lvl,Inside,Gtl,...,MnPrv,,0.0,10,2008,WD,Normal,2653.0,2492.0,206.0
1440,70.0,RL,79.0,11526.0,Pave,,IR1,Bnk,Inside,Mod,...,,,0.0,9,2008,WD,Normal,2759.0,2171.0,431.0
383,45.0,RH,60.0,9000.0,Pave,,Reg,Lvl,Corner,Gtl,...,,,0.0,10,2009,WD,Normal,1568.0,784.0,91.0
360,85.0,RL,41.18882,7540.0,Pave,,IR1,Lvl,CulDSac,Gtl,...,MnPrv,,0.0,6,2007,WD,Normal,1800.0,1685.0,192.0
219,120.0,RL,43.0,3010.0,Pave,,Reg,Lvl,Inside,Gtl,...,,,0.0,3,2006,New,Partial,2496.0,1264.0,108.0
513,20.0,RL,71.0,9187.0,Pave,,Reg,Bnk,Corner,Gtl,...,,,0.0,6,2007,WD,Normal,2164.0,1416.0,278.0
1170,80.0,RL,76.0,9880.0,Pave,,Reg,Lvl,Inside,Gtl,...,GdPrv,,0.0,7,2008,WD,Normal,2214.0,1640.0,203.0
1227,20.0,RL,72.0,8872.0,Pave,,Reg,Lvl,Corner,Gtl,...,,,0.0,12,2008,WD,Normal,1824.0,1507.0,240.0


In [65]:
train_pool = Pool(X_train, y_train, cat_features=cat_features)
dev_pool = Pool(X_test, y_test, cat_features=cat_features)

cat_model = CatBoostRegressor(task_type="GPU",  devices='0:1')
cat_model.fit(train_pool, eval_set=dev_pool, verbose=0, plot=False)

evaluate(cat_model, X_train, y_train)
evaluate(cat_model, X_test, y_test)

RMSLE: 0.08742266027601715
RMSLE: 0.11719718637605857


In [66]:
data = pd.read_csv("./train.csv")
data = data.drop(columns=["Id"])

#data = remove_outliers(data)

y = np.log1p(data["SalePrice"])
X = data.drop(columns=["SalePrice"])

transformer = DataTransformer(StandardScaler())
X = transformer.prepare(X)

transformer.fit(X)

X = transformer.transform(X, False)

cat_features = np.where(X.loc[:, X.columns.values].dtypes == "object")[0]

In [75]:
train_pool = Pool(X, y, cat_features=cat_features)
#400 - 0.12927241949496218
cat_model = CatBoostRegressor(iterations=300, task_type="GPU",  devices='0:1')
cat_model.fit(train_pool, verbose=0, plot=False)

evaluate(cat_model, X, y)

RMSLE: 0.07286450550900356


In [76]:
validation = pd.read_csv("./test.csv")
val_ids = validation["Id"]
validation = validation.drop(columns=["Id"])

validation = transformer.prepare(validation)
validation = transformer.transform(validation, False)
#to_categorical(validation)

sub_predictions = cat_model.predict(validation)
print("RMSLE submission: " + str(rmsle(sub_predictions, np.log1p(cheat["SalePrice"]))))
#evaluate(lbg_model, validation,  np.log1p(cheat["SalePrice"]))

RMSLE submission: 0.12658064177361
