In [1]:
import pandas
import numpy

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV

from xgboost import XGBRegressor

In [2]:
# Read the competition's training and test splits in one pass over their paths.
train_data, test_data = map(
    pandas.read_csv,
    (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    ),
)

In [3]:
# Split the regression target off the features. pop() removes "SalePrice" from
# train_data in place, so train_data holds only feature columns afterwards and
# re-running this cell without reloading the CSV fails because the column is gone.
train_labels = train_data.pop("SalePrice")

In [4]:
# Preview the first rows of the feature frame (the target column was removed above).
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [5]:
def _fill_missing(frame):
    """Return a copy of ``frame`` with missing values filled according to column dtype.

    Numeric columns get the number 0 and every other column gets the string "0".
    A blanket ``fillna("0")`` would put a string into numeric columns that contain
    NaN, turning them into object columns that the later
    ``select_dtypes(exclude=["number"])`` step would treat as categorical.
    """
    numeric_columns = frame.select_dtypes(include=["number"]).columns
    fill_values = {
        column: 0 if column in numeric_columns else "0"
        for column in frame.columns
    }
    return frame.fillna(value=fill_values)


# Rebind instead of mutating in place so the cell reads as a pure transformation.
train_data = _fill_missing(train_data)
test_data = _fill_missing(test_data)

In [6]:
# Integer-encode every non-numeric column that is non-numeric in BOTH frames, using
# one vocabulary per column built from the train and test values together so that a
# label seen in only one of the two frames still receives a code.
encoder = LabelEncoder()

train_categorical = train_data.select_dtypes(exclude=["number"]).columns
test_categorical = test_data.select_dtypes(exclude=["number"]).columns

for column in train_categorical.intersection(test_categorical):
    # Cast to str so fitting and transforming see one uniform type even when a
    # column mixes numbers with the "0" fill sentinel.
    train_values = train_data[column].astype(str)
    test_values = test_data[column].astype(str)

    # Only the learned classes are needed here, so fit() is used; the original
    # fit_transform() computed an encoding that was immediately thrown away.
    encoder.fit(pandas.concat([train_values, test_values]))

    train_data[column] = encoder.transform(train_values)
    test_data[column] = encoder.transform(test_values)

# Everything is numeric (or numeric-looking) now; use a single float dtype for the model.
train_data = train_data.astype(float)
test_data = test_data.astype(float)

In [7]:
# Learn the scaling statistics from the training features only, then apply that same
# fitted scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(train_data)

train_data, test_data = scaler.transform(train_data), scaler.transform(test_data)

In [8]:
# Base estimator handed to the grid search below; all other settings are library defaults.
# NOTE(review): "gpu_hist" presumably requires a GPU-enabled runtime, and newer XGBoost
# releases (2.0+) reportedly deprecate it in favour of tree_method="hist" with
# device="cuda" — confirm against the installed XGBoost version.
model = XGBRegressor(tree_method="gpu_hist")

In [9]:
# Candidate values for the exhaustive search: 3 options for each of 5 parameters,
# i.e. 3**5 = 243 combinations in total.
xgb_parameters = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    alpha=[5, 10, 15],
    colsample_bytree=[0.1, 0.3, 0.5],
)

In [10]:
# Exhaustive search over xgb_parameters with 5-fold cross-validation, ranking
# candidates by negative mean squared error.
xgbr_search = GridSearchCV(
    estimator=model,
    param_grid=xgb_parameters,
    cv=5,
    scoring='neg_mean_squared_error',
)
xgbr_search.fit(train_data, train_labels)

In [11]:
# Keep a handle on the estimator the grid search selected as best.
best_model = xgbr_search.best_estimator_

In [12]:
# Predict the target for the test features with the selected model.
prediction = best_model.predict(test_data)

In [13]:
# Start from the sample submission file and set its "SalePrice" column to the
# model's predictions.
# NOTE(review): this assumes the sample submission rows are in the same order as
# test.csv — confirm.
result = pandas.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv"
).assign(SalePrice=prediction)

In [14]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
result.to_csv("/kaggle/working/result.csv", index=False)