In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import numpy as np
color_palette = sns.color_palette("husl")
import random
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import  mean_squared_error

In [25]:
import pandas as pd
import matplotlib
import plotly
import seaborn
import numpy
import sklearn
import xgboost

print(f'pandas: {pd.__version__}')
print(f'matplotlib: {matplotlib.__version__}')
print(f'plotly: {plotly.__version__}')
print(f'seaborn: {seaborn.__version__}')
print(f'numpy: {numpy.__version__}')
print(f'scikit-learn: {sklearn.__version__}')
print(f'xgboost: {xgboost.__version__}')


pandas: 2.0.3
matplotlib: 3.5.1
plotly: 5.9.0
seaborn: 0.11.2
numpy: 1.22.3
scikit-learn: 1.1.3
xgboost: 1.7.6


In [26]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [27]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [28]:
def convert_features(data):
  data["MSSubClass"] = data["MSSubClass"].astype(str)
  data["YearBuilt_obj"] = data["YearBuilt"].astype(str)
  data["YearRemodAdd_obj"] = data["YearRemodAdd"].astype(str)
  data["MoSold_obj"] = data["MoSold"].astype(str)
  data["YrSold_obj"] = data["YrSold"].astype(str)
  data["Bldg_Age"] =  data["YrSold"] - data["YearBuilt"]
  return data


In [29]:
convert_features(train)
convert_features(test)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,YearBuilt_obj,YearRemodAdd_obj,MoSold_obj,YrSold_obj,Bldg_Age
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,6,2010,WD,Normal,1961,1961,6,2010,49
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,12500,6,2010,WD,Normal,1958,1958,6,2010,52
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,3,2010,WD,Normal,1997,1998,3,2010,13
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,6,2010,WD,Normal,1998,1998,6,2010,12
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,1,2010,WD,Normal,1992,1992,1,2010,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,6,2006,WD,Normal,1970,1970,6,2006,36
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,4,2006,WD,Abnorml,1970,1970,4,2006,36
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,9,2006,WD,Abnorml,1960,1996,9,2006,46
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,700,7,2006,WD,Normal,1992,1992,7,2006,14


In [30]:
def feature_engineering(data):
  data["TotalSF"] = data["1stFlrSF"] + data["2ndFlrSF"] + data["TotalBsmtSF"]
  data["TotalFullBaths"] = data["FullBath"] + data["BsmtFullBath"]
  data["TotalHalfBaths"] = data["HalfBath"] + data["BsmtHalfBath"]
  data["TotalPorch"] = data["OpenPorchSF"] + data["EnclosedPorch"] + data["3SsnPorch"] + data["ScreenPorch"]

  data['HasRemodeled'] = (data['YearRemodAdd'] != data['YearBuilt']).astype(object)
  data['Has2ndFloor'] = (data['2ndFlrSF'] > 0).astype(object)
  data['HasGarage'] = (data['GarageArea'] > 0).astype(object)
  return data

In [31]:
feature_train = feature_engineering(train)
feature_test = feature_engineering(test)

In [32]:
num_cols = [col for col in train.columns if train[col].dtype in ["int64", "float32"]]
cat_cols = [col for col in train.columns if train[col].dtype in ["object"]]

num_train = feature_train[num_cols]

In [33]:
df = feature_train[['YearBuilt', 'Bldg_Age', 'OverallQual', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'TotalSF', 'PoolArea', 'TotalFullBaths','Neighborhood','BldgType', 'HouseStyle','Foundation', 'Electrical', 'Heating','GarageType', 'SalePrice']]
df.head()

Unnamed: 0,YearBuilt,Bldg_Age,OverallQual,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,TotalSF,PoolArea,TotalFullBaths,Neighborhood,BldgType,HouseStyle,Foundation,Electrical,Heating,GarageType,SalePrice
0,2003,5,7,856,856,854,1710,548,2566,0,3,CollgCr,1Fam,2Story,PConc,SBrkr,GasA,Attchd,208500
1,1976,31,6,1262,1262,0,1262,460,2524,0,2,Veenker,1Fam,1Story,CBlock,SBrkr,GasA,Attchd,181500
2,2001,7,7,920,920,866,1786,608,2706,0,3,CollgCr,1Fam,2Story,PConc,SBrkr,GasA,Attchd,223500
3,1915,91,7,756,961,756,1717,642,2473,0,2,Crawfor,1Fam,2Story,BrkTil,SBrkr,GasA,Detchd,140000
4,2000,8,8,1145,1145,1053,2198,836,3343,0,3,NoRidge,1Fam,2Story,PConc,SBrkr,GasA,Attchd,250000


In [47]:
len(['YearBuilt', 'Bldg_Age', 'OverallQual', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'TotalSF', 'PoolArea', 'TotalFullBaths','Neighborhood','BldgType', 'HouseStyle','Foundation', 'Electrical', 'Heating','GarageType', 'SalePrice'])

19

In [34]:
df_test = feature_test[['YearBuilt', 'Bldg_Age', 'OverallQual', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'TotalSF', 'PoolArea', 'TotalFullBaths','Neighborhood','BldgType', 'HouseStyle','Foundation', 'Electrical', 'Heating','GarageType']]
df_test.head()

Unnamed: 0,YearBuilt,Bldg_Age,OverallQual,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,TotalSF,PoolArea,TotalFullBaths,Neighborhood,BldgType,HouseStyle,Foundation,Electrical,Heating,GarageType
0,1961,49,5,882.0,896,0,896,730.0,1778.0,0,1.0,NAmes,1Fam,1Story,CBlock,SBrkr,GasA,Attchd
1,1958,52,6,1329.0,1329,0,1329,312.0,2658.0,0,1.0,NAmes,1Fam,1Story,CBlock,SBrkr,GasA,Attchd
2,1997,13,5,928.0,928,701,1629,482.0,2557.0,0,2.0,Gilbert,1Fam,2Story,PConc,SBrkr,GasA,Attchd
3,1998,12,6,926.0,926,678,1604,470.0,2530.0,0,2.0,Gilbert,1Fam,2Story,PConc,SBrkr,GasA,Attchd
4,1992,18,8,1280.0,1280,0,1280,506.0,2560.0,0,2.0,StoneBr,TwnhsE,1Story,PConc,SBrkr,GasA,Attchd


In [35]:
df.dropna(axis = 0, subset = ["SalePrice"], inplace = True)
y = df["SalePrice"]
y = np.log(y)
df.drop(["SalePrice"], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis = 0, subset = ["SalePrice"], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(["SalePrice"], axis = 1, inplace = True)


In [36]:
feat_num_cols = [col for col in df.columns if df[col].dtype in ["int64", "float32"]]
feat_cat_cols = [col for col in df.columns if df[col].dtype in ["object"]]

In [37]:
num_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = "mean")),
        ("scaler", StandardScaler())
    ]
)


cat_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse = False, categories = "auto"))
    ]

)


preprocessor = ColumnTransformer(
    transformers = [
      ("num",num_transformer, feat_num_cols),
      ("cat", cat_transformer, feat_cat_cols)
    ]

)


pipeline = Pipeline(
    steps = [
        ("preprocessor", preprocessor)
    ]
)

In [38]:
df_pp = pipeline.fit_transform(df)

In [39]:
X_train_feat, X_valid_feat, y_train_feat, y_valid_feat = train_test_split(df_pp, y, test_size = 0.2, random_state = 42)

In [40]:
from xgboost import XGBRegressor

In [41]:
best_xgb_model = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=300)

In [42]:
best_xgb_model.fit(X_train_feat, y_train_feat)

In [43]:
best_xgb_preds = best_xgb_model.predict(X_valid_feat)

best_xgb_mse = mean_squared_error(best_xgb_preds, y_valid_feat)

best_xgb_rmse = np.sqrt(best_xgb_mse)
print(best_xgb_rmse)

0.15195014167189957


In [44]:
import joblib

# Serialize the model and preprocessing pipeline
joblib.dump(best_xgb_model, 'xgb_model.joblib')
joblib.dump(pipeline, 'pipeline.joblib')

# Load the model and preprocessing pipeline
loaded_model = joblib.load('xgb_model.joblib')
loaded_pipeline = joblib.load('pipeline.joblib')

In [45]:
df.head()

Unnamed: 0,YearBuilt,Bldg_Age,OverallQual,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,TotalSF,PoolArea,TotalFullBaths,Neighborhood,BldgType,HouseStyle,Foundation,Electrical,Heating,GarageType
0,2003,5,7,856,856,854,1710,548,2566,0,3,CollgCr,1Fam,2Story,PConc,SBrkr,GasA,Attchd
1,1976,31,6,1262,1262,0,1262,460,2524,0,2,Veenker,1Fam,1Story,CBlock,SBrkr,GasA,Attchd
2,2001,7,7,920,920,866,1786,608,2706,0,3,CollgCr,1Fam,2Story,PConc,SBrkr,GasA,Attchd
3,1915,91,7,756,961,756,1717,642,2473,0,2,Crawfor,1Fam,2Story,BrkTil,SBrkr,GasA,Detchd
4,2000,8,8,1145,1145,1053,2198,836,3343,0,3,NoRidge,1Fam,2Story,PConc,SBrkr,GasA,Attchd


In [46]:
# Assuming you have a trained model 'best_xgb_model' and a preprocessor 'pipeline'

first_row = df.iloc[0]

# Create a DataFrame with the same columns as in your training data
first_row_df = pd.DataFrame(data=[first_row])

# Preprocess the first row using your preprocessor
first_row_data = pipeline.transform(first_row_df)

# Make predictions using your model
prediction = best_xgb_model.predict(first_row_data)
prediction

array([12.269194], dtype=float32)