In [143]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [144]:
import numpy as np
import pandas as pd
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score
from torch.nn.functional import softmax
from transformers import BertTokenizer, BertModel
import torch
import re
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [145]:
df_train = pd.read_csv('/content/drive/MyDrive/train_housing.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test_housing.csv')

Here we have every column of the dataset with the explanation.

- MSSubClass: Identifies the type of dwelling involved in the sale. : From 20 to 190
- MSZoning: Identifies the general zoning classification of the sale.
- LotFrontage: Linear feet of street connected to property
- LotArea: Lot size in square feet
- Street: Type of road access to property
- Alley: Type of alley access to property
- LotShape: General shape of property
- LandContour: Flatness of the property
- Utilities: Type of utilities available
- LotConfig: Lot configuration
- LandSlope: Slope of property
- Neighborhood: Physical locations within Ames city limits
- Condition1: Proximity to various conditions
- Condition2: Proximity to various conditions (if more than one is present)
- BldgType: Type of dwelling
- HouseStyle: Style of dwelling
- OverallQual: Rates the overall material and finish of the house
- OverallCond: Rates the overall condition of the house
- YearBuilt: Original construction date
- YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
- RoofStyle: Type of roof
- RoofMatl: Roof material
- Exterior1st: Exterior covering on house
- Exterior2nd: Exterior covering on house (if more than one material)
- MasVnrType: Masonry veneer type
- MasVnrArea: Masonry veneer area in square feet
- ExterQual: Evaluates the quality of the material on the exterior
- ExterCond: Evaluates the present condition of the material on the exterior
- Foundation: Type of foundation
- BsmtQual: Evaluates the height of the basement
- BsmtCond: Evaluates the general condition of the basement
- BsmtExposure: Refers to walkout or garden level walls
- BsmtFinType1: Rating of basement finished area
- BsmtFinSF1: Type 1 finished square feet
- BsmtFinType2: Rating of basement finished area (if multiple types)
- BsmtFinSF2: Type 2 finished square feet
- BsmtUnfSF: Unfinished square feet of basement area
- TotalBsmtSF: Total square feet of basement area
- Heating: Type of heating
- HeatingQC: Heating quality and condition
- CentralAir: Central air conditioning
- Electrical: Electrical system
- 1stFlrSF: First Floor square feet
- 2ndFlrSF: Second floor square feet
- LowQualFinSF: Low quality finished square feet (all floors)
- GrLivArea: Above grade (ground) living area square feet
- BsmtFullBath: Basement full bathrooms
- BsmtHalfBath: Basement half bathrooms
- FullBath: Full bathrooms above grade
- HalfBath: Half baths above grade
- Bedroom: Bedrooms above grade (does NOT include basement bedrooms)
- Kitchen: Kitchens above grade
- KitchenQual: Kitchen quality
- TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
- Functional: Home functionality (Assume typical unless deductions are warranted)
- Fireplaces: Number of fireplaces
- FireplaceQu: Fireplace quality
- GarageType: Garage location
- GarageYrBlt: Year garage was built
- GarageFinish: Interior finish of the garage
- GarageCars: Size of garage in car capacity
- GarageArea: Size of garage in square feet
- GarageQual: Garage quality
- GarageCond: Garage condition
- PavedDrive: Paved driveway
- WoodDeckSF: Wood deck area in square feet
- OpenPorchSF: Open porch area in square feet
- EnclosedPorch: Enclosed porch area in square feet
- 3SsnPorch: Three season porch area in square feet
- ScreenPorch: Screen porch area in square feet
- PoolArea: Pool area in square feet
- PoolQC: Pool quality
- Fence: Fence quality
- MiscFeature: Miscellaneous feature not covered in other categories
- MiscVal: $Value of miscellaneous feature
- MoSold: Month Sold (MM)
- YrSold: Year Sold (YYYY)
- SaleType: Type of sale
- SaleCondition: Condition of sale


Let's start by taking a look at the Dataset :

In [130]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


Let's run some quick statistics of our numerical data :

In [5]:
df_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
df_test.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,439.203704,...,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,421.321334,42.74688,22.376841,4955.517327,1.436812,1.11374,30.390071,21.130467,177.6259,455.268042,...,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [8]:
df_train.nunique()

Unnamed: 0,0
Id,1460
MSSubClass,15
MSZoning,5
LotFrontage,110
LotArea,1073
...,...
MoSold,12
YrSold,5
SaleType,9
SaleCondition,6


In [9]:
df_test.nunique()

Unnamed: 0,0
Id,1459
MSSubClass,16
MSZoning,5
LotFrontage,115
LotArea,1106
...,...
MiscVal,26
MoSold,12
YrSold,5
SaleType,9


In [11]:
df_train_without_label = df_train.drop('SalePrice', axis=1)
train_nunique = df_train_without_label.nunique()
test_nunique = df_test.nunique()

diff_nunique = (train_nunique != test_nunique)

diff_columns = diff_nunique[diff_nunique].index
print("Columns with different unique values between df_train and df_test:")
print(diff_columns)

for col in diff_columns:
    print(f"\nColumn '{col}':")
    print(f" - Unique values in df_train: {train_nunique[col]}")
    print(f" - Unique values in df_test: {test_nunique[col]}")

Columns with different unique values between df_train and df_test:
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Utilities', 'Condition2',
       'HouseStyle', 'YearBuilt', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces',
       'GarageCars', 'GarageArea', 'GarageQual', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'MiscFeature', 'MiscVal'],
      dtype='object')

Column 'Id':
 - Unique values in df_train: 1460
 - Unique values in df_test: 1459

Column 'MSSubClass':
 - Unique values in df_train: 15
 - Unique values in df_test: 16

Column 'LotFrontage':
 - Unique values in df_train: 110
 - Unique values in df_test: 115

Column 'LotArea':
 - Unique values in df_train: 1073
 - Unique values in df_test:

In [None]:
for col in df_train.describe(include = 'object').columns:
    print(col)
    print(f"There is " +str(len(df_train[col].unique())) + " distinct entities " +str(col) +".")
    print(df_train[col].unique())
    print('-'*50)

MSZoning
There is 5 distinct entities MSZoning.
['RL' 'RM' 'C (all)' 'FV' 'RH']
--------------------------------------------------
Street
There is 2 distinct entities Street.
['Pave' 'Grvl']
--------------------------------------------------
Alley
There is 3 distinct entities Alley.
[nan 'Grvl' 'Pave']
--------------------------------------------------
LotShape
There is 4 distinct entities LotShape.
['Reg' 'IR1' 'IR2' 'IR3']
--------------------------------------------------
LandContour
There is 4 distinct entities LandContour.
['Lvl' 'Bnk' 'Low' 'HLS']
--------------------------------------------------
Utilities
There is 2 distinct entities Utilities.
['AllPub' 'NoSeWa']
--------------------------------------------------
LotConfig
There is 5 distinct entities LotConfig.
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
--------------------------------------------------
LandSlope
There is 3 distinct entities LandSlope.
['Gtl' 'Mod' 'Sev']
--------------------------------------------------
Neig

Let us now start the pre-processing of our data. We will apply a One-Hot-Encoder to our object colums (columns with categorical informations), except for our columns 'YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold', because we can transform them in a better way.

In [146]:
df_train = pd.read_csv('/content/drive/MyDrive/train_housing.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test_housing.csv')

df_train.drop('Id', axis=1)
df_test.drop('Id', axis=1)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [147]:
exclude_cols = ['YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold', 'SalePrice']

numeric_cols_train = df_train.select_dtypes(include=['number']).columns
cols_to_scale_train = [col for col in numeric_cols_train if col not in exclude_cols]

numeric_cols_test = df_test.select_dtypes(include=['number']).columns
cols_to_scale_test = [col for col in numeric_cols_test if col not in exclude_cols]

scaler = MinMaxScaler()
df_train[cols_to_scale_train] = scaler.fit_transform(df_train[cols_to_scale_train])
df_test[cols_to_scale_test] = scaler.transform(df_test[cols_to_scale_test])

df_train['SalePrice_Log'] = np.log(df_train['SalePrice'])

if 'SalePrice' in df_test.columns:
    df_test['SalePrice_Log'] = np.log(df_test['SalePrice'])

for df in [df_train, df_test]:
    df['MoSold_sin'] = np.sin(2 * np.pi * df['MoSold'] / 12)
    df['MoSold_cos'] = np.cos(2 * np.pi * df['MoSold'] / 12)

for df in [df_train, df_test]:
    df['Age_Built'] = df['YrSold'] - df['YearBuilt']
    df['Age_RemodAdd'] = df['YrSold'] - df['YearRemodAdd']
    df['Age_Sold'] = df['YrSold'] - df['YrSold'].min()

df_train = df_train.drop(['YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold'], axis=1)
df_test = df_test.drop(['YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold'], axis=1)

object_cols = df_train.select_dtypes(include=['object']).columns

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(df_train[object_cols])

encoded_train = encoder.transform(df_train[object_cols])
encoded_test = encoder.transform(df_test[object_cols])

encoded_train_df = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(object_cols))
encoded_test_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(object_cols))

df_train = pd.concat([df_train.drop(object_cols, axis=1), encoded_train_df], axis=1)
df_test = pd.concat([df_test.drop(object_cols, axis=1), encoded_test_df], axis=1)

df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

print("Train DataFrame:", df_train.head())
print("Test DataFrame:", df_test.head())


Train DataFrame:          Id  MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  \
0  0.000000    0.235294     0.150685  0.033420     0.666667        0.500   
1  0.000685    0.000000     0.202055  0.038795     0.555556        0.875   
2  0.001371    0.235294     0.160959  0.046507     0.666667        0.500   
3  0.002056    0.294118     0.133562  0.038561     0.666667        0.500   
4  0.002742    0.235294     0.215753  0.060576     0.777778        0.500   

   MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  ...  SaleType_ConLw  \
0     0.12250    0.125089         0.0   0.064212  ...             0.0   
1     0.00000    0.173281         0.0   0.121575  ...             0.0   
2     0.10125    0.086109         0.0   0.185788  ...             0.0   
3     0.00000    0.038271         0.0   0.231164  ...             0.0   
4     0.21875    0.116052         0.0   0.209760  ...             0.0   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_Abnorml  \
0           0.0   

In [148]:
X = df_train.drop(['SalePrice', 'SalePrice_Log'], axis=1)
y = df_train['SalePrice_Log']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [100, 500, 1000],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'alpha': [0, 0.1, 0.2],
    'lambda': [0, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist,
                                   n_iter=10, cv=3, scoring='neg_root_mean_squared_error',
                                   verbose=2, random_state=42)

random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)

best_model = random_search.best_estimator_
y_pred_log = best_model.predict(X_valid)

rmse_log = np.sqrt(mean_squared_error(y_valid, y_pred_log))

r2_log = r2_score(y_valid, y_pred_log)

print(f"Log Scale Root Mean Squared Error: {rmse_log}")
print(f"R^2 Score (log scale): {r2_log}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END alpha=0.1, colsample_bytree=1.0, gamma=0.1, lambda=0.2, learning_rate=0.05, max_depth=9, n_estimators=1000, subsample=0.8; total time=   4.5s
[CV] END alpha=0.1, colsample_bytree=1.0, gamma=0.1, lambda=0.2, learning_rate=0.05, max_depth=9, n_estimators=1000, subsample=0.8; total time=   4.8s
[CV] END alpha=0.1, colsample_bytree=1.0, gamma=0.1, lambda=0.2, learning_rate=0.05, max_depth=9, n_estimators=1000, subsample=0.8; total time=   5.9s
[CV] END alpha=0, colsample_bytree=0.6, gamma=0.1, lambda=0.2, learning_rate=0.2, max_depth=9, n_estimators=500, subsample=1.0; total time=   2.9s
[CV] END alpha=0, colsample_bytree=0.6, gamma=0.1, lambda=0.2, learning_rate=0.2, max_depth=9, n_estimators=500, subsample=1.0; total time=   1.3s
[CV] END alpha=0, colsample_bytree=0.6, gamma=0.1, lambda=0.2, learning_rate=0.2, max_depth=9, n_estimators=500, subsample=1.0; total time=   0.9s
[CV] END alpha=0.1, colsample_bytree=0.8, gam

In [149]:
X = df_train.drop(['SalePrice', 'SalePrice_Log'], axis=1)
y = df_train['SalePrice_Log']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

params = {
    'learning_rate': 0.01,
    'n_estimators': 3460,
    'max_depth': 3,
    'min_child_weight': 0,
    'gamma': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'scale_pos_weight': 1,
    'seed': 27,
    'reg_alpha': 0.00006
}

evals = [(dtrain, 'train'), (dvalid, 'valid')]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    evals=evals,
    early_stopping_rounds=500,
    verbose_eval=True
)

y_pred_log = model.predict(dvalid, iteration_range=(0, model.best_iteration))
rmse = np.sqrt(mean_squared_error(y_valid, y_pred_log))
r2 = r2_score(y_valid, y_pred_log)

print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")

[0]	train-rmse:0.38777	valid-rmse:0.43049
[1]	train-rmse:0.38536	valid-rmse:0.42799
[2]	train-rmse:0.38276	valid-rmse:0.42536
[3]	train-rmse:0.38017	valid-rmse:0.42268
[4]	train-rmse:0.37765	valid-rmse:0.42005
[5]	train-rmse:0.37514	valid-rmse:0.41741
[6]	train-rmse:0.37259	valid-rmse:0.41472
[7]	train-rmse:0.37012	valid-rmse:0.41214
[8]	train-rmse:0.36781	valid-rmse:0.40981
[9]	train-rmse:0.36535	valid-rmse:0.40727
[10]	train-rmse:0.36288	valid-rmse:0.40473
[11]	train-rmse:0.36069	valid-rmse:0.40251
[12]	train-rmse:0.35840	valid-rmse:0.40011
[13]	train-rmse:0.35617	valid-rmse:0.39778
[14]	train-rmse:0.35395	valid-rmse:0.39546
[15]	train-rmse:0.35168	valid-rmse:0.39304
[16]	train-rmse:0.34936	valid-rmse:0.39059
[17]	train-rmse:0.34712	valid-rmse:0.38833
[18]	train-rmse:0.34500	valid-rmse:0.38616
[19]	train-rmse:0.34279	valid-rmse:0.38398
[20]	train-rmse:0.34060	valid-rmse:0.38176
[21]	train-rmse:0.33845	valid-rmse:0.37947
[22]	train-rmse:0.33630	valid-rmse:0.37716
[23]	train-rmse:0.334

Parameters: { "n_estimators" } are not used.



[24]	train-rmse:0.33213	valid-rmse:0.37283
[25]	train-rmse:0.33006	valid-rmse:0.37059
[26]	train-rmse:0.32794	valid-rmse:0.36828
[27]	train-rmse:0.32608	valid-rmse:0.36636
[28]	train-rmse:0.32399	valid-rmse:0.36420
[29]	train-rmse:0.32202	valid-rmse:0.36207
[30]	train-rmse:0.32007	valid-rmse:0.36003
[31]	train-rmse:0.31819	valid-rmse:0.35802
[32]	train-rmse:0.31622	valid-rmse:0.35598
[33]	train-rmse:0.31433	valid-rmse:0.35393
[34]	train-rmse:0.31242	valid-rmse:0.35190
[35]	train-rmse:0.31050	valid-rmse:0.34979
[36]	train-rmse:0.30866	valid-rmse:0.34782
[37]	train-rmse:0.30691	valid-rmse:0.34603
[38]	train-rmse:0.30512	valid-rmse:0.34423
[39]	train-rmse:0.30323	valid-rmse:0.34229
[40]	train-rmse:0.30144	valid-rmse:0.34041
[41]	train-rmse:0.29969	valid-rmse:0.33871
[42]	train-rmse:0.29790	valid-rmse:0.33686
[43]	train-rmse:0.29613	valid-rmse:0.33512
[44]	train-rmse:0.29447	valid-rmse:0.33340
[45]	train-rmse:0.29280	valid-rmse:0.33167
[46]	train-rmse:0.29113	valid-rmse:0.32992
[47]	train-

Now that we have trained our model, let's make predictions for our dataset sample :

In [150]:
df_test.drop(['SalePrice', 'SalePrice_Log'], axis=1, inplace=True)
dtest = xgb.DMatrix(df_test)

y_pred_log_test = model.predict(dtest, iteration_range=(0, model.best_iteration))

df_test['Predicted_SalePrice'] = y_pred_log_test

In [151]:
df_real = pd.read_csv('/content/drive/MyDrive/sample_submission_housing.csv')

In [152]:
df_real['SalePrice_Log'] = np.log(df_real['SalePrice'])

In [153]:
rmse = np.sqrt(mean_squared_error(df_real['SalePrice_Log'], df_test['Predicted_SalePrice']))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 0.37912657165437386
