Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import sklearn.datasets
import requests
import json
from interpret.blackbox import ShapKernel
from interpret import show
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


Importing the California housing dataset

In [None]:
# Fetch the California housing dataset through scikit-learn's dataset loader.
# The returned object's .data, .feature_names and .target attributes are read
# by the cells that follow.
house_price_dataset = sklearn.datasets.fetch_california_housing()

In [None]:
# Wrap the raw feature matrix in a DataFrame, labelling the columns with the
# feature names shipped with the dataset.
house_price_dataframe = pd.DataFrame(
    data=house_price_dataset.data,
    columns=house_price_dataset.feature_names,
)

In [None]:
# Preview the first five rows of the feature table
house_price_dataframe.head()

In [None]:
# Append the dataset's target values as a 'price' column so that features and
# target live in the same dataframe.
house_price_dataframe['price'] = house_price_dataset.target

In [None]:
# Preview the first five rows again, now including the 'price' column
house_price_dataframe.head()

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple
house_price_dataframe.shape

In [None]:
# Summary statistics of the dataframe's columns
house_price_dataframe.describe()

In [None]:
# Number of missing values in each column
house_price_dataframe.isna().sum()

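Understand the correlation between the various features in the dataset. Two kinds of relationship are of interest:

1. Positive correlation: the two columns tend to increase together.
2. Negative correlation: one column tends to decrease as the other increases.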

In [None]:
# Pairwise correlation between all columns of the dataframe (features and 'price')
correlation = house_price_dataframe.corr()

In [None]:
# Annotated heatmap of the correlation matrix held in `correlation`.
# An explicit Axes is used so the title can be attached to it, and plt.show()
# ends the cell so the Axes repr is not echoed below the figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    correlation,
    cbar=True,
    square=True,
    fmt='.1f',
    annot=True,
    annot_kws={'size': 8},
    cmap='Greens',
    ax=ax,
)
ax.set_title('Correlation between dataset columns')
plt.show()

Splitting the data and the target

In [None]:
# Features: every column except the target. Target: the 'price' column.
X = house_price_dataframe.drop(columns=['price'])
Y = house_price_dataframe['price']

In [None]:
# Show only the leading rows: printing the complete feature matrix and target
# floods the output without adding information.
print(X.head())
print(Y.head())

Split the data into training data and test data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)


In [None]:
# Sanity check: shapes of the full, training and test feature sets
print(X.shape, X_train.shape, X_test.shape)

Model Training:
An XGBoost Regressor is used as the model.

Training the model

In [None]:
def train(xgbModel, X_train, Y_train):
    """Fit ``xgbModel`` on the training split and record its training score.

    Parameters
    ----------
    xgbModel
        Estimator exposing ``fit`` and ``score`` (an ``XGBRegressor`` in this
        notebook).
    X_train, Y_train
        Training features and training targets.

    Returns
    -------
    The value returned by ``score`` on the training data. For
    scikit-learn-compatible regressors this is the coefficient of
    determination (R squared), not a classification accuracy.

    Side effects
    ------------
    Logs the score through ``mlflow.log_metric`` and prints it.
    """
    # fit() returns the fitted estimator; score that returned object
    fitted_model = xgbModel.fit(X_train, Y_train)
    train_r2 = fitted_model.score(X_train, Y_train)

    # The metric key is left as 'train_accuracy' so that new runs stay
    # comparable with runs that were already logged under this name.
    mlflow.log_metric('train_accuracy', train_r2)

    # Label and format match the "R squared" line printed by evaluate();
    # the former "Accuracy ... %" wording misdescribed a regression score.
    print(f'Train R squared: {train_r2:.3f}')
    return train_r2

In [None]:
def evaluate(xgbModel, X_test, Y_test):
    """Score ``xgbModel`` on a held-out split, then log and print the results.

    Predictions for ``X_test`` are compared against ``Y_test`` with mean
    squared error, mean absolute error and R squared. Each value is logged
    through ``mlflow.log_metric`` and then printed with three decimals.
    Nothing is returned.
    """
    predicted = xgbModel.predict(X_test)

    # (MLflow metric key, printed label, scoring function), in reporting order
    scorers = (
        ('mean_squared_error', 'Mean Squared Error', mean_squared_error),
        ('mean_absolute_error', 'Mean Absolute Error', mean_absolute_error),
        ('r2_score', 'R squared', r2_score),
    )

    # Compute every metric first, then log them all, then print them all
    results = [(key, label, scorer(Y_test, predicted)) for key, label, scorer in scorers]

    for key, _label, value in results:
        mlflow.log_metric(key, value)

    for _key, label, value in results:
        print(f'{label}: {value:.3f}')

In [None]:
# Create the regressor with the library's default hyperparameters
xgbModel = XGBRegressor()

mlflow.set_experiment('House_prediction_XGBRegressor')

# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run() as run:
    train(xgbModel, X_train, Y_train)
    evaluate(xgbModel, X_test, Y_test)

    mlflow.sklearn.log_model(xgbModel, 'XGBRegressor_model')
    # run_id is the current name of the identifier; run_uuid is its deprecated alias
    print('Model run: ', run.info.run_id)



Feature importance using ShapKernel

In [None]:
# Kernel SHAP explainer around the fitted regressor's predict function.
# NOTE(review): the whole training set is passed as `data`; if this cell is
# slow, a smaller sample may be preferable here — confirm against the
# interpret / shap documentation.
explainer = ShapKernel(predict_fn=xgbModel.predict, data=X_train, model=xgbModel)

# Explain a subset of rows for faster computation (adjust the size as needed).
# The sample is seeded so that a re-run explains the same 100 rows.
X_explain = X_train.sample(100, random_state=2)

# Generate local (per-row) explanations
shap_values = explainer.explain_local(X_explain)


In [None]:
# Render the local explanations with interpret's `show`
show(shap_values)


Evaluation: prediction on training data

In [None]:
# Model predictions for the training data (no score is computed in this cell)
training_data_prediction = xgbModel.predict(X_train)

Prediction on test data

In [None]:
# Model predictions for the held-out test data (no score is computed in this cell)
test_data_prediction = xgbModel.predict(X_test)

Visualize the actual prices and predicted prices

In [None]:
# Scatter of actual training prices (x) against the model's predictions (y)
plt.scatter(x=Y_train, y=training_data_prediction)
plt.gca().set(
    title="Actual Prices vs Predicted Prices",
    xlabel="Actual Price",
    ylabel="Predicted Price",
)
plt.show()

# Points lying close to the y = x diagonal are rows where the predicted price
# is very close to the price recorded in the dataset.

Loading a model

In [None]:
# Prefer the run that was active most recently in this session, so the cell
# works on a fresh kernel and on other tracking stores. Fall back to the
# previously recorded run id when no run has been active in this process.
_last_run = mlflow.last_active_run()
_run_id_to_load = (
    _last_run.info.run_id
    if _last_run is not None
    else "ea23496ff8f24006bed32755dbf5e78a"
)
loaded_model = mlflow.sklearn.load_model(f"runs:/{_run_id_to_load}/XGBRegressor_model")

In [None]:
# Score the reloaded model on the test split.
# NOTE(review): for scikit-learn style regressors `score` reports R squared —
# confirm this holds for the loaded model flavor.
loaded_model.score(X_test,Y_test)

Querying a loaded model

In [None]:
# Column labels, listed in the order used for the training features
feature_names = [
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'Population', 'AveOccup', 'Latitude', 'Longitude',
]

# One made-up observation; values follow the order of feature_names
new_observation = [[8, 41, 6, 1, 322, 2.5, 37.88, -122.23]]

# Single-row frame labelled with those column names
input_data = pd.DataFrame(data=new_observation, columns=feature_names)

# Ask the reloaded model for a prediction and display it
prediction = loaded_model.predict(input_data)
print("Predicted value:", prediction)


Serving a model locally and querying

In [None]:


# Scoring endpoint of the locally served model
url = 'http://127.0.0.1:4000/invocations'
headers = {'Content-Type': 'application/json'}

# One observation in the "dataframe_split" layout: column names plus row values
data = {
    "dataframe_split": {
        "columns": ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"],
        "data": [[8.3252, 41.0, 6.984127, 1.02381, 322.0, 2.555556, 37.88, -122.23]]
    }
}

# `json=` lets requests serialise the payload; the timeout (seconds) keeps the
# cell from hanging indefinitely when the server is unreachable or stalls.
response = requests.post(url, headers=headers, json=data, timeout=10)

if response.status_code == 200:
    predicted_value = response.json()['predictions'][0]
    print('Predicted value: ', predicted_value)
else:
    print("Failed to fetch response:", response.status_code, response.text)


Querying a model deployed to GCP

In [None]:


# Scoring endpoint of the model deployed to GCP.
# NOTE(review): the address is hard-coded; update it if the deployment moves.
url = 'http://34.70.25.140:5000/invocations'
headers = {'Content-Type': 'application/json'}

# One observation in the "dataframe_split" layout: column names plus row values
data = {
    "dataframe_split": {
        "columns": ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"],
        "data": [[4.3252, 31.0, 6.984127, 1.02381, 322.0, 2.555556, 37.88, -122.23]]
    }
}

# `json=` lets requests serialise the payload; the timeout (seconds) keeps the
# cell from hanging indefinitely when the remote host is unreachable or stalls.
response = requests.post(url, headers=headers, json=data, timeout=10)

if response.status_code == 200:
    predicted_value = response.json()['predictions'][0]
    print('Predicted value: ', predicted_value)
else:
    print("Failed to fetch response:", response.status_code, response.text)


Querying model running on Docker

mlflow models build-docker --model-uri runs:/<RUN_ID>/<MODEL_NAME> -n house-price-pred --enable-mlserver

docker run -p 4000:8080 house-price-pred

In [None]:


# The container's port is published on host port 4000, so the client talks to
# the loopback address. 0.0.0.0 is a "bind to all interfaces" address for
# servers and is not a portable destination for client connections.
url = 'http://127.0.0.1:4000/invocations'
headers = {'Content-Type': 'application/json'}

# One observation in the "dataframe_split" layout: column names plus row values
data = {
    "dataframe_split": {
        "columns": ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"],
        "data": [[4.3252, 31.0, 6.984127, 1.02381, 322.0, 2.555556, 37.88, -122.23]]
    }
}

# `json=` lets requests serialise the payload; the timeout (seconds) keeps the
# cell from hanging indefinitely when the container is not running or stalls.
response = requests.post(url, headers=headers, json=data, timeout=10)

if response.status_code == 200:
    predicted_value = response.json()['predictions'][0]
    print('Predicted value: ', predicted_value)
else:
    print("Failed to fetch response:", response.status_code, response.text)
