In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost
import math
from scipy.stats import pearsonr
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from feature_engine.encoding import MeanEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import time

In [7]:
#usual xgboost

In [12]:
def drop_null(train, test, min_percent=0.7):
    for feature in train.columns:
        percent = train[feature].isnull().sum() / train.shape[0]
        if (percent > min_percent):
            train = train.drop(columns = feature)
            test = test.drop(columns = feature)

In [13]:
train_data = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')

train_data = train_data.drop(columns=["Id"])
val_ids = test_data["Id"]
test_data = test_data.drop(columns=["Id"])

In [None]:
drop_null(train_data, test_data, 0.95)
for feature in train_data.columns[:-1]:
    if train_data[feature].dtype == 'object':
        train_data[feature] = LabelEncoder().fit_transform(train_data[feature])
        test_data[feature] = LabelEncoder().fit_transform(test_data[feature])
print(train_data)

In [15]:
X = train_data.drop(columns='SalePrice').values
y = train_data['SalePrice'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=98987)

In [18]:
xgbr = xgboost.XGBRegressor(learning_rate=0.01, max_depth=4, n_estimators=1000)
start = time.time()
xgbr.fit(X_train,y_train)
print("time: ", time.time() - start)
print("score: ", xgbr.score(X_test,y_test))
print("RMSE: %.2f" % math.sqrt(np.mean((xgbr.predict(X_test) - y_test) ** 2)))

time:  1.4064102172851562
score:  0.9165953534789814
RMSE: 23614.41


In [None]:
#xgboost with mte

In [28]:
train_data = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')

train_data = train_data.drop(columns=["Id"])
val_ids = test_data["Id"]
test_data = test_data.drop(columns=["Id"])

In [29]:
drop_null(train_data, test_data, 0.7)
cat_features = []
iter = 0
for feature in train_data.columns[:-1]:
    if (train_data[feature].dtype == 'object'): cat_features.append(feature)
    iter +=1
print(len(cat_features), cat_features)

43 ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [30]:
tmp_df = train_data[cat_features]
# tmp_df = SimpleImputer(strategy='most_frequent').fit_transform(tmp_df)
me = MeanEncoder(missing_values='ignore')
# new_X = me.fit_transform(X,y)
me.fit(tmp_df, y)
new_tmp = me.transform(tmp_df)
new_tmp.columns  = cat_features
train_data[cat_features] = new_tmp
print(new_tmp)

           MSZoning         Street  Alley       LotShape    LandContour  \
0     191004.994787  181130.538514    NaN  164754.818378  180183.746758   
1     191004.994787  181130.538514    NaN  164754.818378  180183.746758   
2     191004.994787  181130.538514    NaN  206101.665289  180183.746758   
3     191004.994787  181130.538514    NaN  206101.665289  180183.746758   
4     191004.994787  181130.538514    NaN  206101.665289  180183.746758   
...             ...            ...    ...            ...            ...   
1455  191004.994787  181130.538514    NaN  164754.818378  180183.746758   
1456  191004.994787  181130.538514    NaN  164754.818378  180183.746758   
1457  191004.994787  181130.538514    NaN  164754.818378  180183.746758   
1458  191004.994787  181130.538514    NaN  164754.818378  180183.746758   
1459  191004.994787  181130.538514    NaN  164754.818378  180183.746758   

         Utilities      LotConfig      LandSlope   Neighborhood  \
0     180950.95682  176938.04752



In [31]:
X = train_data.drop(columns='SalePrice').values
y = train_data['SalePrice'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=98987)

In [32]:
xgb_mte = xgboost.XGBRegressor(learning_rate=0.01, max_depth=4, n_estimators=1000)
start = time.time()
xgb_mte.fit(X_train,y_train)
print("time: ", time.time() - start)
print("score: ", xgb_mte.score(X_test,y_test))
print("RMSE: %.2f" % math.sqrt(np.mean((xgb_mte.predict(X_test) - y_test) ** 2)))

time:  1.292036533355713
score:  0.924777656132916
RMSE: 22426.18
