In [1]:
# Load all necessary libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200)

from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

In [12]:
# Read train and test data
hp_train = pd.read_csv('train.csv', index_col='Id')
hp_test = pd.read_csv('test.csv', index_col='Id')

In [3]:
# Get x and y from train data
X = hp_train.drop('SalePrice', axis=1)
y = hp_train['SalePrice']

In [5]:
# Initialize function for data preparetion
def prep_data(data):
    # Separete numeric and categorical columns
    num_col = data.dtypes[data.dtypes != 'object'].index.to_list()
    cat_col = data.dtypes[data.dtypes == 'object'].index.to_list()
    # Get all columns with missing values
    num_col_mis_val = data[num_col].dtypes[data[num_col].isna().sum()>0].index.to_list()
    cat_col_mis_val = data[cat_col].dtypes[data[cat_col].isna().sum()>0].index.to_list()
    # Fill all numerical values with mean
    data[num_col_mis_val] = SimpleImputer(strategy='mean').fit_transform(data[num_col_mis_val])
    # Fill all categorical values with "Missing"
    data[cat_col_mis_val] = data[cat_col_mis_val].fillna('Missing')
    # Scale all numerical columns
    data[num_col] = RobustScaler().fit_transform(data[num_col])
    # Encode all categorical columns
    data[cat_col] = data[cat_col].apply(lambda x: pd.factorize(x)[0])
    return data

In [6]:
# Prepare train data
X = prep_data(X)
# Scale target values
y = np.log1p(y)

In [7]:
# Split x and y into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [43]:
'''parameters = {
    "n_estimators": [500, 750, 1000, 1500, 2000], 
    "learning_rate": [0.01, 0.02, 0.05], 
    "max_depth": [6, 8], 
    "subsample": [0.3, 0.5, 0.7]
}

grid = GridSearchCV(XGBRegressor(objective='reg:squarederror'), parameters)
grid.fit(X_train, y_train)

print(grid.best_params_)'''

{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 2000, 'subsample': 0.7}


In [8]:
# Define and fit the XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror', n_jobs=8, learning_rate=0.01, max_depth=6, n_estimators=1500,
subsample=0.7, early_stopping_rounds=10).fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

In [9]:
# Print the results of traning
print("-----")
print("* Training set")
y_pred = xgb_model.predict(X_train)
print(f"R2: {r2_score(y_train, y_pred):.2%}")
print(f"RMSE: {mean_squared_log_error(y_train, y_pred, squared=False):.5f}")

print("-----")
print("* Validation set")
y_pred = xgb_model.predict(X_test)
print(f"R2: {r2_score(y_test, y_pred):.2%}")
print(f"RMSE: {mean_squared_log_error(y_test, y_pred, squared=False):.5f}")

-----
* Training set
R2: 99.02%
RMSE: 0.00308
-----
* Validation set
R2: 89.42%
RMSE: 0.00959


In [13]:
# Prepare test data
X_val = prep_data(hp_test)

In [16]:
# Make predictions
predictions = xgb_model.predict(X_val)

In [17]:
predictions

array([11.792274, 12.010835, 12.121516, ..., 12.082198, 11.654733,
       12.340455], dtype=float32)

In [18]:
# Inverse predictions
predictions = np.expm1(predictions)

In [19]:
predictions

array([132225.88, 164526.78, 183782.97, ..., 176697.14, 115234.45,
       228765.03], dtype=float32)

In [20]:
# Read sample_submission.csv
submission = pd.read_csv('sample_submission.csv')

In [22]:
# Set sale price to predicted values
submission.SalePrice = predictions

In [25]:
# Save submission into a .csv file
submission.to_csv('submission.csv', header=True, index=False)