In [358]:
import numpy as np
import pandas as pd
import gc
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression, SGDRegressor, Ridge
from sklearn.svm import SVR
from pandas.api.types import CategoricalDtype

from IPython.display import display  # Allows the use of display() for DataFrames

import warnings

warnings.filterwarnings('ignore')

# Скачиваем датасэты

In [359]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [360]:
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [361]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [362]:
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


# EDA

##  Проверяем полноту данных (фитчи) и чистим датасет

In [363]:
train_df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [364]:
train_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [365]:
def clean_df(df):
    df.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1,
            inplace=True)  ## Удаляем фитчи где больщая часть информации отсутсвует
    df['MSZoning'] = df['MSZoning'].replace({'C (all)': 'C'})
    df['Exterior1st'] = df['Exterior1st'].replace({'Wd Sdng': 'Wd_Sdng'})
    df['Exterior2nd'] = df['Exterior2nd'].replace({'Wd Sdng': 'Wd_Sdng'})
    df['GarageYrBlt'] = df['GarageYrBlt'].where(df.GarageYrBlt <= 2010,
                                                df.YearBuilt)  # у испорченных значений заменяем год постройки гаража на год     постройки дома
    df.rename(columns={
        '1stFlrSF': 'FirstFlrSF',
        '2ndFlrSF': 'SecondFlrSF',
        '3SsnPorch': 'Threeseasonporch'}, inplace=True, )
    return df

In [366]:
# The nominative (unordered) categorical features
features_nom = ['MSSubClass', 'MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
                'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                'MasVnrType',
                'Foundation', 'Heating', 'CentralAir', 'GarageType', 'SaleType', 'SaleCondition']

# Pandas calls the categories "levels"
five_feature = ['Po', 'Fa', 'Ta', 'Gd', 'Ex']
ten_feature = list(range(10))

ordered_levels = {
    'OverallQual': ten_feature,
    'OverallCond': ten_feature,
    'ExterQual': five_feature,
    'ExterCond': five_feature,
    'BsmtQual': five_feature,
    'BsmtCond': five_feature,
    'HeatingQC': five_feature,
    'KitchenQual': five_feature,
    'FireplaceQu': five_feature,
    'GarageQual': five_feature,
    'GarageCond': five_feature,

    'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    'BsmtExposure': ['No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'GarageFinish': ['Unf', 'RFn', 'Fin'],
    'PavedDrive': ['N', 'P', 'Y'],
    'Utilities': ['NoSeWa', 'ELO', 'NoSewr', 'AllPub'],
    'CentralAir': ['N', 'Y'],
    'Electrical': ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],

}

# Add a None level for missing values
ordered_levels = {key: ['None'] + value for key, value in ordered_levels.items()}


# Defining encoding function
def encode(df):
    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype('category')
        # Add a None category for missing values
        if 'None' not in df[name].cat.categories:
            df[name].cat.add_categories('None', inplace=True)

    # Ordinal categories
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels, ordered=True))
    return df

In [367]:
def impute(df):
    for name in df.select_dtypes('number'):
        df[name] = df[name].fillna(0)
    for name in df.select_dtypes('category'):
        df[name] = df[name].fillna('None')
    return df

In [368]:
def all_data(df):
    clean_df(df)
    encode(df)
    impute(df)
    return df

In [369]:
train_df = all_data(train_df)
test_df = all_data(test_df)


In [370]:
display(train_df)
print(' ')
print('-' * 20)
print(' ')
display(test_df)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,Threeseasonporch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,...,112,0,0,0,0,4,2010,WD,Normal,142125


 
--------------------
 


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,Threeseasonporch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,120,0,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,...,36,0,0,0,0,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,IR1,Lvl,AllPub,Inside,...,34,0,0,0,0,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,IR1,Lvl,AllPub,Inside,...,36,0,0,0,0,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,IR1,HLS,AllPub,Inside,...,82,0,0,144,0,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,Reg,Lvl,AllPub,Inside,...,24,0,0,0,0,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,Reg,Lvl,AllPub,Inside,...,32,0,0,0,0,700,7,2006,WD,Normal


In [371]:
train_df_dum = pd.get_dummies(train_df)

In [372]:
test_df_dum = pd.get_dummies(test_df)

In [373]:
test_df_dum.columns.nunique()

344

In [374]:
final_df = pd.concat([train_df, test_df])

In [375]:
final_df_dum = pd.get_dummies(final_df)

In [376]:
fin_train = final_df_dum.iloc[:len(train_df.index)]
fin_test = final_df_dum.iloc[len(test_df.index) + 1:]

In [377]:
fin_train['Bath_total'] = fin_train['FullBath'] + 0.5 * fin_train['HalfBath'] + fin_train['BsmtFullBath'] + 0.5 *fin_train['BsmtHalfBath']
fin_test['Bath_total'] = fin_test['FullBath'] + 0.5 * fin_test['HalfBath'] + fin_test['BsmtFullBath'] + 0.5 * fin_test[
    'BsmtHalfBath']

In [378]:
fin_train['BsmtFinSF'] = fin_train['BsmtFinSF1'] + fin_train['BsmtFinSF2'] - fin_train['BsmtUnfSF']
fin_test['BsmtFinSF'] = fin_test['BsmtFinSF1'] + fin_test['BsmtFinSF2'] - fin_test['BsmtUnfSF']

In [379]:
fin_train['Porch'] = fin_train['OpenPorchSF'] + fin_train['EnclosedPorch'] + fin_train['Threeseasonporch'] + fin_train[
    'ScreenPorch']
fin_test['Porch'] = fin_test['OpenPorchSF'] + fin_test['EnclosedPorch'] + fin_test['Threeseasonporch'] + fin_test[
    'ScreenPorch']

In [380]:
fin_train['FinSF'] = fin_train['FirstFlrSF'] + fin_train['SecondFlrSF'] - fin_train['LowQualFinSF']
fin_test['FinSF'] = fin_test['FirstFlrSF'] + fin_test['SecondFlrSF'] - fin_test['LowQualFinSF']

In [381]:
columns_to_drop = ['PoolArea', 'MiscVal',
                   'WoodDeckSF',
                   'Fireplaces', 'MasVnrArea',
                   'FullBath', 'HalfBath', 'BsmtFullBath',
                   'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2',
                   'BsmtUnfSF', 'OpenPorchSF', 'EnclosedPorch',
                   'Threeseasonporch', 'ScreenPorch',
                   'FirstFlrSF', 'SecondFlrSF', 'LowQualFinSF']

fin_train = fin_train.drop(columns_to_drop, axis=1)
fin_test = fin_test.drop(columns_to_drop, axis=1)

In [382]:
from datetime import datetime
year_col = ['YrSold','YearBuilt','YearRemodAdd', 'GarageYrBlt']

for col in year_col:
    fin_train[col]  =  round(fin_train[col]/5)*5
    fin_test[col]  =  round(fin_test[col]/5)*5

In [383]:
X = fin_train.drop(["SalePrice", ], axis=1)
y = np.log1p(train_df["SalePrice"].values)

test_pred = fin_test.drop(['SalePrice'], axis=1)

In [384]:
Scaler = StandardScaler()
fin_train_scale = Scaler.fit_transform(X)
fin_test_scale = Scaler.transform(test_pred)

In [385]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [386]:
xgb = XGBRegressor(objective='reg:linear',
                   eval_metric='rmse',
                   eta=0.001,
                   max_depth=10,
                   subsample=0.6,
                   colsample_bytree=0.6,
                   alpha=0.001,
                   random_state=17,
                   silent=True,
                   n_estimators=15000,
                   n_jobs=-1,
                   seed=17
                   )

In [387]:
xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=30)

Parameters: { "silent" } are not used.

[0]	validation_0-rmse:11.51114
[30]	validation_0-rmse:11.17145
[60]	validation_0-rmse:10.84183
[90]	validation_0-rmse:10.52187
[120]	validation_0-rmse:10.21128
[150]	validation_0-rmse:9.90998
[180]	validation_0-rmse:9.61752
[210]	validation_0-rmse:9.33376
[240]	validation_0-rmse:9.05849
[270]	validation_0-rmse:8.79131
[300]	validation_0-rmse:8.53202
[330]	validation_0-rmse:8.28047
[360]	validation_0-rmse:8.03622
[390]	validation_0-rmse:7.79920
[420]	validation_0-rmse:7.56937
[450]	validation_0-rmse:7.34622
[480]	validation_0-rmse:7.12975
[510]	validation_0-rmse:6.91955
[540]	validation_0-rmse:6.71561
[570]	validation_0-rmse:6.51780
[600]	validation_0-rmse:6.32574
[630]	validation_0-rmse:6.13943
[660]	validation_0-rmse:5.95869
[690]	validation_0-rmse:5.78338
[720]	validation_0-rmse:5.61325
[750]	validation_0-rmse:5.44826
[780]	validation_0-rmse:5.28811
[810]	validation_0-rmse:5.13282
[840]	validation_0-rmse:4.98202
[870]	validation_0-rmse:4.83579


In [343]:
lgb = LGBMRegressor(objective='regression', num_leaves=100, learning_rate=0.001, bagging_fraction=0.6,
                    feature_fraction=0.6, bagging_frequency=6, bagging_seed=42, verbosity=-1, random_state=17,
                    n_jobs=-1, metric='rmse', n_estimators=15000, seed=42)

In [344]:
lgb.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_test, y_test)], early_stopping_rounds=100,
        verbose=150)

[150]	valid_0's rmse: 0.369639
[300]	valid_0's rmse: 0.332686
[450]	valid_0's rmse: 0.301476
[600]	valid_0's rmse: 0.275055
[750]	valid_0's rmse: 0.252825
[900]	valid_0's rmse: 0.234368
[1050]	valid_0's rmse: 0.218897
[1200]	valid_0's rmse: 0.205961
[1350]	valid_0's rmse: 0.195186
[1500]	valid_0's rmse: 0.186091
[1650]	valid_0's rmse: 0.178433
[1800]	valid_0's rmse: 0.172145
[1950]	valid_0's rmse: 0.166898
[2100]	valid_0's rmse: 0.162457
[2250]	valid_0's rmse: 0.15883
[2400]	valid_0's rmse: 0.155707
[2550]	valid_0's rmse: 0.153096
[2700]	valid_0's rmse: 0.150682
[2850]	valid_0's rmse: 0.148713
[3000]	valid_0's rmse: 0.14708
[3150]	valid_0's rmse: 0.145629
[3300]	valid_0's rmse: 0.144412
[3450]	valid_0's rmse: 0.143419
[3600]	valid_0's rmse: 0.142588
[3750]	valid_0's rmse: 0.141886
[3900]	valid_0's rmse: 0.141325
[4050]	valid_0's rmse: 0.140851
[4200]	valid_0's rmse: 0.140461
[4350]	valid_0's rmse: 0.140106
[4500]	valid_0's rmse: 0.13984
[4650]	valid_0's rmse: 0.139621
[4800]	valid_0's 

In [345]:
cat = CatBoostRegressor(iterations=5000,
                        learning_rate=0.003,
                        depth=9,
                        eval_metric='RMSE',
                        random_seed=42,
                        bagging_temperature=0.3,
                        od_type='Iter',
                        metric_period=50,
                        od_wait=20)

In [346]:
cat.fit(X_train, y_train, verbose=50, eval_set=(X_test, y_test), early_stopping_rounds=100,
        use_best_model=True)



0:	learn: 0.3916359	test: 0.4122661	best: 0.4122661 (0)	total: 33.8ms	remaining: 2m 48s
50:	learn: 0.3577931	test: 0.3786693	best: 0.3786693 (50)	total: 970ms	remaining: 1m 34s
100:	learn: 0.3281194	test: 0.3492186	best: 0.3492186 (100)	total: 1.92s	remaining: 1m 32s
150:	learn: 0.3022510	test: 0.3238869	best: 0.3238869 (150)	total: 2.9s	remaining: 1m 33s
200:	learn: 0.2794279	test: 0.3016339	best: 0.3016339 (200)	total: 3.88s	remaining: 1m 32s
250:	learn: 0.2594732	test: 0.2821175	best: 0.2821175 (250)	total: 4.8s	remaining: 1m 30s
300:	learn: 0.2418744	test: 0.2652476	best: 0.2652476 (300)	total: 5.68s	remaining: 1m 28s
350:	learn: 0.2262440	test: 0.2504894	best: 0.2504894 (350)	total: 6.57s	remaining: 1m 26s
400:	learn: 0.2126443	test: 0.2376979	best: 0.2376979 (400)	total: 7.4s	remaining: 1m 24s
450:	learn: 0.2005607	test: 0.2267525	best: 0.2267525 (450)	total: 8.27s	remaining: 1m 23s
500:	learn: 0.1899424	test: 0.2173385	best: 0.2173385 (500)	total: 9.08s	remaining: 1m 21s
550:	le

<catboost.core.CatBoostRegressor at 0x2662c46be80>

In [400]:
pred_test_XGB = np.expm1(xgb.predict(test_pred))

In [401]:
pred_test_LGBM = np.expm1(lgb.predict(test_pred.drop(['Id'],axis=1)))

In [402]:
pred_test_cat = np.expm1(cat.predict(test_pred))

In [403]:
pred_test_XGB

array([117999.81 , 152770.11 , 181946.95 , ..., 138138.8  , 117852.484,
       209081.56 ], dtype=float32)

In [390]:
pd.DataFrame({"Id": test_pred.iloc[:,0].tolist(), "SalePrice":pred_test_XGB}).to_csv("submission1.csv", index=False)


In [391]:
pd.read_csv('submission1.csv')

Unnamed: 0,Id,SalePrice
0,1461,117999.810
1,1462,152770.110
2,1463,181946.950
3,1464,192948.750
4,1465,185295.780
...,...,...
1454,2915,84378.970
1455,2916,85477.810
1456,2917,138138.800
1457,2918,117852.484


In [411]:
sub = pd.read_csv('sample_submission.csv')

In [412]:
sub_lgb = pd.DataFrame()
sub_lgb["SalePrice"] = pred_test_LGBM

sub_xgb = pd.DataFrame()
sub_xgb["SalePrice"] = pred_test_XGB

sub_cat = pd.DataFrame()
sub_cat["SalePrice"] = pred_test_cat

In [413]:
sub["SalePrice"] = (sub_xgb["SalePrice"] * 0.5 + sub_lgb["SalePrice"] * 0.25 + sub_cat["SalePrice"] * 0.25)

In [409]:
sub

Unnamed: 0,Id,SalePrice
0,1461,123656.022333
1,1462,156443.853219
2,1463,183886.397025
3,1464,193472.604919
4,1465,187069.297212
...,...,...
1454,2915,85609.122414
1455,2916,87905.379693
1456,2917,143316.219940
1457,2918,119816.811543


In [414]:
print(sub.head())
sub.to_csv('sub_final.csv', index=False)

     Id      SalePrice
0  1461  122334.804773
1  1462  155642.868431
2  1463  183539.588490
3  1464  193486.891010
4  1465  186766.359933


In [205]:
sub2 = pd.read_csv("sub1_xgb.csv")

In [210]:
sub2

Unnamed: 0,Id,SalePrice
0,1461,191089.69
1,1462,179308.98
2,1463,183799.61
3,1464,190811.80
4,1465,198641.05
...,...,...
1454,2915,230064.40
1455,2916,172825.97
1456,2917,191297.38
1457,2918,171318.05
