### Importing libraries

In [2]:
# Import necessary libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing 
from sklearn.experimental import enable_iterative_imputer  # Explicitly enable IterativeImputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Machine learning
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from scipy.stats import uniform

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error


#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

### Loading data

In [3]:
# Read the training data from a CSV file and store it in a DataFrame
df_train = pd.read_csv("train.csv")

# Read the testing data from a CSV file and store it in a DataFrame
df_test = pd.read_csv("test.csv")

# Display the first few rows of the training DataFrame
df_train.head()

# Display the first few rows of the testing DataFrame
df_test.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [4]:
# Get the shape of the training DataFrame
df_train.shape

# Get the shape of the testing DataFrame
df_test.shape


(1459, 80)

### Handling missing values

In [5]:
# Check for missing values in the training DataFrame
df_train.isna().sum().sort_values(ascending=False)

# Repeat the same process for the testing DataFrame
df_test.isna().sum().sort_values(ascending=False)


PoolQC           1456
MiscFeature      1408
Alley            1352
Fence            1169
FireplaceQu       730
                 ... 
Electrical          0
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
SaleCondition       0
Length: 80, dtype: int64

In [6]:
# Drop the specified columns from the training DataFrame
df_train.drop(['PoolQC', 'MiscFeature','Alley', 'Fence','FireplaceQu' ], axis = 1, inplace=True)

# Display the shape of the training DataFrame after dropping the columns
df_train.shape

# Repeat the same process for the testing DataFrame
df_test.drop(['PoolQC', 'MiscFeature','Alley', 'Fence','FireplaceQu' ], axis = 1, inplace=True)

# Display the shape of the testing DataFrame after dropping the columns
df_test.shape


(1459, 75)

In [7]:
# Check for missing values in the training DataFrame
df_train.isna().sum().sort_values(ascending=False)

# Repeat the same process for the testing DataFrame
df_test.isna().sum().sort_values(ascending=False)

LotFrontage      227
GarageYrBlt       78
GarageFinish      78
GarageQual        78
GarageCond        78
                ... 
HeatingQC          0
CentralAir         0
Electrical         0
1stFlrSF           0
SaleCondition      0
Length: 75, dtype: int64

In [8]:
# Calculate the threshold for the number of NaN values that you want to drop from the datasets
threshold = len(df_train) * 0.05

# Drop the rows in the training DataFrame that have more than 'threshold' number of NaN values
df_train.dropna(thresh=threshold, inplace=True)

# Display the number of NaN values in each column of the training DataFrame, sorted in descending order
df_train.isna().sum().sort_values(ascending=False)

# Repeat the same process for the testing DataFrame
df_test.dropna(thresh=threshold, inplace=True)

# Display the number of NaN values in each column of the testing DataFrame, sorted in descending order
df_test.isna().sum().sort_values(ascending=False)


LotFrontage      213
MasVnrType        11
MasVnrArea        10
BsmtCond           3
BsmtQual           2
                ... 
BsmtUnfSF          0
MSSubClass         0
Heating            0
HeatingQC          0
SaleCondition      0
Length: 75, dtype: int64

In [9]:
# Display the shape of the training DataFrame again
df_train.shape

# Display the shape of the testing DataFrame again
df_test.shape


(1342, 75)

In [10]:
# Display the column names of the training DataFrame
df_train.columns

# Display the column names of the testing DataFrame
df_test.columns


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPo

In [11]:
# Display a concise summary of the training DataFrame
df_train.info()

# Display a concise summary of the testing DataFrame
df_test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1349 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1349 non-null   int64  
 1   MSSubClass     1349 non-null   int64  
 2   MSZoning       1349 non-null   object 
 3   LotFrontage    1103 non-null   float64
 4   LotArea        1349 non-null   int64  
 5   Street         1349 non-null   object 
 6   LotShape       1349 non-null   object 
 7   LandContour    1349 non-null   object 
 8   Utilities      1349 non-null   object 
 9   LotConfig      1349 non-null   object 
 10  LandSlope      1349 non-null   object 
 11  Neighborhood   1349 non-null   object 
 12  Condition1     1349 non-null   object 
 13  Condition2     1349 non-null   object 
 14  BldgType       1349 non-null   object 
 15  HouseStyle     1349 non-null   object 
 16  OverallQual    1349 non-null   int64  
 17  OverallCond    1349 non-null   int64  
 18  YearBuil

### Impute missing values

In [12]:
# Identify the columns in the training DataFrame that have any missing values
missing_values = df_train.columns[df_train.isna().any()].tolist()

# Display the list of columns in the training DataFrame that have missing values
missing_values

# Repeat the same process for the testing DataFrame
missing_test = df_test.columns[df_test.isna().any()].tolist()

# Display the list of columns in the testing DataFrame that have missing values
missing_test

['MSZoning',
 'LotFrontage',
 'Utilities',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'KitchenQual',
 'Functional',
 'SaleType']

In [13]:
# Loop over each column in the list of columns with missing values in the training DataFrame
for col in missing_values:
    # If the column's data type is not 'object', it's a numerical column
    if df_train[col].dtype != 'object':
        # Create an IterativeImputer instance with a RandomForestRegressor as the estimator
        # IterativeImputer models each feature with missing values as a function of other features, and uses that estimate for imputation
        imputer_num = IterativeImputer(estimator=RandomForestRegressor())
        # Fit the imputer on the column and transform the column
        df_train[col] = imputer_num.fit_transform(df_train[[col]])
    else:  # If the column's data type is 'object', it's a categorical column
        # Create a SimpleImputer instance with 'most_frequent' as the strategy
        imputer_object = SimpleImputer(strategy='most_frequent')
        # Fit the imputer on the column and transform the column
        df_train[col] = imputer_object.fit_transform(df_train[[col]]).squeeze()

# Repeat the same process for the testing DataFrame
for col in missing_test:
    if df_test[col].dtype != 'object':  # Numerical column
        imputer_num = IterativeImputer(estimator=RandomForestRegressor())
        df_test[col] = imputer_num.fit_transform(df_test[[col]])
    else:  # Categorical column
        imputer_object = SimpleImputer(strategy='most_frequent')
        df_test[col] = imputer_object.fit_transform(df_test[[col]]).squeeze()


In [14]:
# Display the number of NaN values in each column of the training DataFrame, sorted in descending order
df_train.isna().sum().sort_values(ascending=False)

# Repeat the same process for the testing DataFrame
df_test.isna().sum().sort_values(ascending=False)


Id               0
GarageType       0
Functional       0
TotRmsAbvGrd     0
KitchenQual      0
                ..
MasVnrType       0
Exterior2nd      0
Exterior1st      0
RoofMatl         0
SaleCondition    0
Length: 75, dtype: int64

In [15]:
# Prepare the training data
X_train = df_train.drop(['Id', 'MiscVal', 'SalePrice'], axis=1)

# Create the target matrix y_train for the training data
y_train = df_train[['MiscVal', 'SalePrice']]

# Prepare the testing data
X_test = df_test.drop(['Id', 'MiscVal'], axis=1)

# Create the target matrix y_test for the testing data
y_test = df_test[['MiscVal']]


In [16]:
# Identify the categorical columns in the training data
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# Define a transformer for the categorical columns using OneHotEncoder
# OneHotEncoder creates binary columns from categorical columns
# 'handle_unknown='ignore'' tells the transformer to ignore (rather than throw an error for) any value that was not seen during fit
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Define a column transformer to apply the OneHotEncoder to the categorical columns
# ColumnTransformer applies transformers to columns of an array or pandas DataFrame
# 'transformers' is a list of tuples where each tuple contains a name, a transformer, and a list of names (or indices) of columns to be transformed by the transformer
# 'remainder='passthrough'' tells the transformer to let the remaining columns that were not specified in 'transformers' pass through without transformation
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ], 
    remainder='passthrough'
)


In [17]:
# Define a scaler using StandardScaler
scaler = StandardScaler(with_mean=False)

# Create a pipeline with a scaler and a Ridge regressor
# The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters
pipeline = Pipeline(steps=[('scaler', scaler),
                           ('ridge', Ridge())])

# Define the parameter distribution for alpha
# 'uniform()' generates a uniform continuous random variable
param_distribution = {'ridge__alpha': uniform()}

# Create the RandomizedSearchCV object
# RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values
rand_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_distribution, n_iter=100, cv=5, random_state=0)


In [18]:
# Split the data into training and validation sets
X_train_r, X_val_r, y_train_r, y_val_r = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the preprocessor to X_train
X_train_preprocessed = preprocessor.fit_transform(X_train_r)

In [19]:
# Fit the RandomizedSearchCV object to the training data
rand_search.fit(X_train_preprocessed, y_train_r)

# Print the best parameters
print(rand_search.best_params_)

{'ridge__alpha': 0.9883738380592262}


### Predicting the test data

In [20]:
# Preprocess the test data using the fitted preprocessor
X_test_preprocessed = preprocessor.transform(X_test)

# Scale the preprocessed test data using the fitted scaler
X_test_scaled = rand_search.best_estimator_['scaler'].transform(X_test_preprocessed)

# Make predictions on the scaled test data
test_pred = rand_search.predict(X_test_scaled)

# Display test_pred
test_pred

array([[-16286.53373932, 170916.8188008 ],
       [-14389.11625316, 192080.96092391],
       [-14917.0433006 , 200013.43674681],
       ...,
       [ -4774.54974893, 164928.15331176],
       [-14923.81827307, 170091.32453127],
       [-15073.04900361, 153286.92188263]])

### Debugging
Completely ignore this settion.  
It contains troubleshooting information

In [21]:
# Print shapes of original datasets
print("Shape of X_train:", X_train_r.shape)
print("Shape of X_test:", X_test.shape)

# Fit the preprocessor to the training data
X_train_preprocessed = preprocessor.fit_transform(X_train_r)

# Preprocess the new test data using the fitted preprocessor
X_test_preprocessed = preprocessor.transform(X_test)

# Print shapes of preprocessed datasets
print("Shape of X_train_preprocessed:", X_train_preprocessed.shape)
print("Shape of X_test_preprocessed:", X_test_preprocessed.shape)


Shape of X_train: (1079, 73)
Shape of X_test: (1342, 73)


Shape of X_train_preprocessed: (1079, 258)
Shape of X_test_preprocessed: (1342, 258)


In [22]:
# Get the number of features expected by the StandardScaler
num_features_expected = rand_search.best_estimator_['scaler'].mean_.shape[0]

# Print the number of features expected by the scaler and the actual number of features in the preprocessed datasets
print("Number of features expected by the scaler:", num_features_expected)
print("Number of features in X_train_preprocessed:", X_train_preprocessed.shape[1])
print("Number of features in X_test_preprocessed:", X_test_preprocessed.shape[1])


Number of features expected by the scaler: 258
Number of features in X_train_preprocessed: 258
Number of features in X_test_preprocessed: 258


### Model Evaluation

In [25]:
# Transform the validation data using the fitted preprocessor
X_val = preprocessor.transform(X_val_r)

# Scale the transformed validation data using the fitted scaler
# 'best_estimator_' gives the best estimator found by grid search
X_val = rand_search.best_estimator_['scaler'].transform(X_val)

# Make predictions on the scaled validation data using the fitted model
y_pred = rand_search.predict(X_val)

# Display the predictions
y_pred


array([[  -14650.45676544,   244191.9897775 ],
       [  -14937.02801214,   212600.99156452],
       [  -14975.28112909,   642158.26166604],
       [  -14709.52058237,   275554.77298069],
       [  -14839.64349941,   261911.37364056],
       [  -13582.25210463,   261914.68308932],
       [  -13562.70983349,    28060.34098613],
       [  -13211.75783216,   217537.38731859],
       [  -13562.72110964,   483412.85627391],
       [  -14496.62030862,   169223.66058216],
       [  -14365.68223063,   234081.64935507],
       [  -14109.42699175,   253211.64236585],
       [  -13656.67428488,    57676.88923697],
       [  -14348.74196488,   267781.05755851],
       [  -15237.86641954,   456985.3875325 ],
       [  -14164.26665611,   564909.06286739],
       [  -15488.84106362,   279922.47870106],
       [  -15348.05329197,   359666.3525318 ],
       [  -13975.90976455,    63245.28886288],
       [  -14030.37134834,   237716.98683588],
       [  -14950.67662095,   266980.98326412],
       [   10

## Model Evaluation Notes

In [27]:
# Calculate the score of the fitted model on the validation data
model_score = rand_search.score(X_val, y_pred)
model_score

1.0

* 'score()' returns the coefficient of determination R^2 of the prediction  
The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum()  
The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse)

In [28]:
# Calculate the Mean Absolute Percentage Error (MPE)
mpe = mean_absolute_percentage_error(y_val_r, y_pred)
print(f'The mpe is : {mpe}')

# Calculate the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_val_r, y_pred)
print(f'The mae is : {mae}')

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_val_r, y_pred)
print(f'The mse is : {mse}')

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f'The rmse is : {rmse}')

# Calculate the Coefficient of Determination, also known as R-squared
r_squared = r2_score(y_val_r, y_pred)
print(f'The r_squared is : {r_squared}')


The mpe is : 3.171360214612529e+19
The mae is : 69596.64442814031
The mse is : 21074552104.18326
The rmse is : 145170.76876624738
The r_squared is : -3831.197827565835


#### MPE
* Mean Percentage Error (MPE): This is the average percentage difference between the actual and predicted values. An MPE of 3.171360214612529e+19 is extremely high, which suggests that the model’s predictions are far off from the actual values.
#### MAE
* Mean Absolute Error (MAE): This is the average of the absolute differences between the predicted and actual values. It measures the average magnitude of the errors in a set of predictions, without considering their direction. A MAE of 69596.64442814031 means that, on average, the model’s predictions are about 69596.644 units away from the actual values.
#### MSE
* Mean Squared Error (MSE): This is the average of the squared differences between the predicted and actual values. It places more weight on large errors because they are squared before they are averaged. An MSE of 21074552104.18326 suggests that the model’s predictions are often far from the actual values.
### RMSE
* Root Mean Squared Error (RMSE): This is the square root of the MSE. It measures the standard deviation of the residuals (prediction errors). Residuals are a measure of how far from the regression line data points are. An RMSE of 145170.76876624738 means that, on average, the model’s predictions are about 145170.768 units away from the actual values.
#### R-squared
* R-squared (Coefficient of Determination): This is a statistical measure that represents the proportion of the variance for a dependent variable that’s explained by an independent variable or variables in a regression model. An R-squared of -3831.197827565835 is far less than zero, which is unusual and indicates that the model’s predictions are worse than if it had simply predicted the mean of the target variable.

## Summary
This model's(Ridge) performance is very poor, I will have to explore a different modeling approach.

# XGBoost

In [40]:
# Import the necessary libraries
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define a transformer for categorical columns using OneHotEncoder
#categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
#categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Define a column transformer to apply the transformer to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ], 
    remainder='passthrough'  # Retain numerical columns
)

# Fit the preprocessor to X_train
X_trainxg_preprocessed = preprocessor.fit_transform(X_train)

X_train_xg, X_val_xg, y_train_xg, y_val_xg = train_test_split(X_trainxg_preprocessed, y_train, test_size=0.2, random_state=42)

# Define a scaler using StandardScaler
scaler = StandardScaler(with_mean=False)

# Create a pipeline with a scaler and an XGBoost regressor
pipeline = Pipeline(steps=[('scaler', scaler),
                            ('xgb', XGBRegressor(objective='reg:squarederror'))])

# Define the parameter distribution for the XGBoost regressor
param_distribution = {
    'xgb__n_estimators': range(50, 400, 50),
    'xgb__learning_rate': uniform(0.01, 0.6),
    'xgb__max_depth': range(3, 10),
    'xgb__colsample_bytree': uniform(0.5, 0.5),
    'xgb__gamma': uniform(0, 2)
}

# Create the RandomizedSearchCV object
rand_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_distribution, n_iter=100, cv=5, random_state=0)


# Fit the RandomizedSearchCV object to the data
rand_search.fit(X_train_xg, y_train_xg)

# Transform the validation data using the fitted preprocessor
#X_val_processed = preprocessor.transform(X_val_xg)

# Make predictions on the validation data using the fitted model
y_pred_xg = rand_search.predict(X_val_xg)
y_pred_xg

array([[ 1.01725761e+02,  2.65971812e+05],
       [ 1.23821541e+02,  1.84176750e+05],
       [-1.57534304e+01,  3.77712844e+05],
       [-5.60742712e+00,  3.00321125e+05],
       [ 2.43706882e-01,  2.52804922e+05],
       [ 5.88478355e+01,  1.21783320e+05],
       [ 3.17381744e+01,  1.43281766e+05],
       [ 2.36904335e+01,  3.03812875e+05],
       [ 3.93286819e+01,  4.01921031e+05],
       [-3.15685797e+00,  1.87250391e+05],
       [ 3.68099642e+00,  1.54861141e+05],
       [-3.68116593e+00,  2.11009984e+05],
       [ 6.59461308e+00,  2.28385406e+05],
       [ 1.04912262e+01,  1.81741688e+05],
       [ 1.05303133e+00,  2.38069875e+05],
       [ 5.12731314e+00,  2.52065078e+05],
       [-2.52732038e+00,  2.69355500e+05],
       [-1.04870701e+01,  1.48929375e+05],
       [-1.49939613e+01,  1.12696438e+05],
       [-3.19458932e-01,  9.48907344e+04],
       [ 1.37314873e+01,  2.36033297e+05],
       [ 1.20188309e+02,  8.13154375e+04],
       [-7.56021738e+00,  1.65210922e+05],
       [ 5.

In [41]:
# Preprocess the test data using the fitted preprocessor
X_testxg_preprocessed = preprocessor.transform(X_test)

# Scale the preprocessed test data using the fitted scaler
X_testxg_scaled = rand_search.best_estimator_['scaler'].transform(X_testxg_preprocessed)

# Make predictions on the scaled test data
test_pred = rand_search.predict(X_testxg_scaled)

# Display test_pred
test_pred

array([[  408.876  , 85654.414  ],
       [ 8431.126  , 90496.42   ],
       [  319.21835, 81461.695  ],
       ...,
       [  163.25017, 90840.     ],
       [  350.46085, 91338.336  ],
       [ 8492.4375 , 85160.555  ]], dtype=float32)

In [42]:
# Calculate the score of the fitted model on the validation data
model_score_xg = rand_search.score(X_val_xg, y_pred_xg)
model_score_xg

1.0

In [43]:
# Calculate the Mean Absolute Percentage Error (MPE)
mpe = mean_absolute_percentage_error(y_val_xg, y_pred_xg)
print(f'The mpe is : {mpe}')

# Calculate the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_val_xg, y_pred_xg)
print(f'The mae is : {mae}')

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_val_xg, y_pred_xg)
print(f'The mse is : {mse}')

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f'The rmse is : {rmse}')

# Calculate the Coefficient of Determination, also known as R-squared
r_squared = r2_score(y_val_xg, y_pred_xg)
print(f'The r_squared is : {r_squared}')


The mpe is : 3.109481328995108e+17
The mae is : 9110.296850086175
The mse is : 349738769.72191846
The rmse is : 18701.303957797125
The r_squared is : -12.357070740715217


### XGBoost Model Evaluation
* Mean Absolute Percentage Error (MPE): The MPE is extremely high (around 3.11×1017). This suggests that the model’s predictions are, on average, off by a huge percentage. This is not a good sign and indicates that the model is not performing well.


* Mean Absolute Error (MAE): The MAE is around 9110.3. This means that on average, the model’s predictions are about $9110 off from the actual house prices. Depending on the range of house prices in your dataset, this might be acceptable or too high.


* Mean Squared Error (MSE): The MSE is approximately 3.50×108. The MSE is sensitive to outliers because it squares the errors before averaging them. If the house prices in your dataset have a large range or a few extreme values, this might explain the high MSE.


* Root Mean Squared Error (RMSE): The RMSE is around 18701.3, which is higher than the MAE. This is expected as RMSE gives a relatively high weight to large errors.


* R-squared: The R-squared value is -12.36, which is far less than 0. This indicates that the model is performing worse than a model that would always predict the mean of the target variable. An R-squared value of 1 indicates a perfect fit, and values less than 0 indicate that the model is not fitting the data well.

## Summary
In summary, these metrics suggest that the model is not performing well on the house price prediction. Although, it's performance is a lot better than the Ridge model, however I believe it can be improved upon. I will have to revisit this model’s(xgboost) hyperparameters or consider feature engineering(dropping some features), or as a last resort try a different modeling approach.

# Data Submission

In [61]:
# Create a DataFrame 'submission' with 'MiscVal' and 'SalePrice' as columns.
# 'test_pred[:, 0]' is the predicted values for 'MiscVal'
# 'test_pred[:, 1]' is the predicted values for 'SalePrice'
submission = pd.DataFrame({'MiscVal': test_pred[:, 0], 'SalePrice': test_pred[:, 1]})

# Set the index of the DataFrame to start from 1
submission.index = np.arange(1, len(submission) + 1)

# Write the DataFrame to a CSV file named 'submission.csv'
submission.to_csv('submission.csv', index=True)


In [62]:
submission.head()

Unnamed: 0,MiscVal,SalePrice
1,408.876007,85654.414062
2,8431.125977,90496.421875
3,319.218353,81461.695312
4,8461.058594,83105.890625
5,152.790207,86868.90625
