In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

Importing the California Housing Dataset

In [None]:
# Fetch the California housing dataset through scikit-learn's dataset loader
house_price_dataset = sklearn.datasets.fetch_california_housing()

In [None]:
# Print the raw object returned by the loader to inspect its contents
print(house_price_dataset)

In [None]:
# Wrap the feature matrix in a pandas DataFrame, labelling each column
# with the corresponding name from the dataset's feature_names
house_price_dataframe = pd.DataFrame(
    data=house_price_dataset.data,
    columns=house_price_dataset.feature_names,
)


In [None]:
# Preview the first few rows of the feature dataframe
house_price_dataframe.head()

In [None]:
# Append the dataset's target values to the dataframe as a new 'price' column
house_price_dataframe['price'] = house_price_dataset.target

In [None]:
# Preview the first few rows again to confirm the 'price' column is present
house_price_dataframe.head()

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple
house_price_dataframe.shape

(20640, 9)

In [None]:
# Check for missing values: count the null entries in each column.
# .sum must be *called* -- without the parentheses the cell only displays
# the bound method object instead of the per-column counts.
house_price_dataframe.isnull().sum()

In [None]:
# Summary statistics for every column of the dataframe
house_price_dataframe.describe()

Understanding the **correlation** between various features in the dataset

1. Positive Correlation
2. Negative Correlation

In [None]:
# Pairwise correlation between all columns, including the 'price' target
correlation = house_price_dataframe.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap; each cell is labelled
# with its correlation value formatted to one decimal place.
plt.figure(figsize=(10, 10))
sns.heatmap(
    correlation,
    cbar=True,
    square=True,
    fmt='.1f',
    annot=True,
    annot_kws={'size': 8},
    cmap='Blues',
)
# Give the figure a title so it can be understood on its own.
plt.title("Feature Correlation Heatmap")
# Render explicitly so the cell does not echo the Axes repr as its output.
plt.show()

Separating the features (X) and the target (Y)

In [None]:
# Features: every column except the target
X = house_price_dataframe.drop(columns=['price'])
# Target: the 'price' column on its own
Y = house_price_dataframe['price']

In [None]:
# Print the feature dataframe followed by the target series
print(X,Y)

Splitting the data into training data and test data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Compare the shapes of the full, training, and test feature sets
print(X.shape, X_train.shape, X_test.shape)

(20640, 8) (16512, 8) (4128, 8)


Model Training

XGBoost Regressor

In [None]:
# Create an XGBoost regressor (no arguments passed, so library defaults apply)
model = XGBRegressor()

In [None]:
# Fit the model on the training features (X_train) and training targets (Y_train)
model.fit(X_train, Y_train)

Evaluation

Prediction on training data

In [None]:
# Predict prices for the training data (the metrics are computed in a later cell)
training_data_prediction = model.predict(X_train)

In [None]:
# Show the predicted values for the training data
print(training_data_prediction)

[0.6893792  2.986824   0.48874274 ... 1.8632544  1.7800125  0.7565893 ]


In [None]:
# R squared (coefficient of determination) between actual and predicted training prices
score_1 = metrics.r2_score(Y_train, training_data_prediction)

# Mean absolute error between actual and predicted training prices
score_2 = metrics.mean_absolute_error(Y_train, training_data_prediction)

# NOTE(review): the first label says "Error" (and misspells "Squared"), but
# r2_score returns a goodness-of-fit score where higher is better; the printed
# text is left unchanged here.
print('R Sqaured Error:', score_1)
print('Mean Absolute Error:', score_2)

Visualize the actual prices and predicted prices

In [None]:
# Scatter the actual training prices (x-axis) against the model's
# predictions for the same rows (y-axis)
plt.scatter(x=Y_train, y=training_data_prediction)
plt.title("Actual Price vs Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

Prediction on test data

In [None]:
# Predict prices for the held-out test data (the metrics are computed in the next cell)
test_data_prediction = model.predict(X_test)

In [None]:
# R squared (coefficient of determination) between actual and predicted test prices
score_1 = metrics.r2_score(Y_test, test_data_prediction)

# Mean absolute error between actual and predicted test prices
score_2 = metrics.mean_absolute_error(Y_test, test_data_prediction)

# NOTE(review): the first label says "Error" (and misspells "Squared"), but
# r2_score returns a goodness-of-fit score where higher is better; the printed
# text is left unchanged here. score_1/score_2 overwrite the training-set
# values computed earlier in the notebook.
print('R Sqaured Error:', score_1)
print('Mean Absolute Error:', score_2)

R Sqaured Error: 0.8412904408180302
Mean Absolute Error: 0.30753655785801337
