In [11]:
# Create a machine learning model to predict the price of a house
# use california housing dataset
# use lasso regression
# use grid search to find the best hyperparameters
# use cross validation to evaluate the model
# use the best model to make predictions on the test set
# use the best model to make predictions on new data

# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

In [2]:
# load the dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing.keys()
housing.data.shape
housing.target.shape
housing.feature_names
housing.DESCR


'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 20640\n\n    :Number of Attributes: 8 numeric, predictive attributes and the target\n\n    :Attribute Information:\n        - MedInc        median income in block group\n        - HouseAge      median house age in block group\n        - AveRooms      average number of rooms per household\n        - AveBedrms     average number of bedrooms per household\n        - Population    block group population\n        - AveOccup      average number of household members\n        - Latitude      block group latitude\n        - Longitude     block group longitude\n\n    :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000

In [3]:
# create a pandas dataframe
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df.head()
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [4]:
# use lasso regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, random_state=42)

# create the model
lasso_reg = Lasso()

# train the model
lasso_reg.fit(X_train, y_train)

# make predictions
y_pred = lasso_reg.predict(X_test)

# evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse


0.9723474230582032

In [5]:
# use grid search to find the best hyperparameters
from sklearn.model_selection import GridSearchCV

# create a list of hyperparameters
param_grid = [
    {'alpha': [0.1, 1, 10, 100, 1000], 'max_iter': [1000, 2000, 3000, 4000, 5000]}
]

# create the model
lasso_reg = Lasso()

# create the grid search
grid_search = GridSearchCV(lasso_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

# train the model
grid_search.fit(housing.data, housing.target)

# get the best hyperparameters
grid_search.best_params_

# get the best estimator
# grid_search.best_estimator_

# # get the evaluation scores
# cvres = grid_search.cv_results_

# # print the evaluation scores
# for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
#     print(np.sqrt(-mean_score), params)


{'alpha': 0.1, 'max_iter': 1000}

In [6]:
# use cross validation to evaluate the model
from sklearn.model_selection import cross_val_score

# create the model
lasso_reg = Lasso(alpha=0.1, max_iter=1000)

# train the model
lasso_reg.fit(X_train, y_train)

# make predictions
y_pred = lasso_reg.predict(X_test)

# evaluate the model
scores = cross_val_score(lasso_reg, housing.data, housing.target, scoring='neg_mean_squared_error', cv=10)
lasso_rmse_scores = np.sqrt(-scores)

# print the evaluation scores
def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standard deviation:', scores.std())

display_scores(lasso_rmse_scores)

Scores: [0.60521087 0.78707991 0.9996487  0.60785933 0.9315006  0.72950324
 0.68128935 0.96965618 0.98115257 0.63653158]
Mean: 0.7929432343485896
Standard deviation: 0.15461733636407524


In [7]:
final_model = grid_search.best_estimator_

# use the best model to make predictions on new data
new_data = [
    [   4.2143    ,   37.        ,    5.28823529,    0.97352941,
        860.        ,    2.52941176,   33.81      , -118.12      ]
]

# make predictions
final_model.predict(new_data)

# save the model
import joblib
joblib.dump(final_model, 'final_model.pkl')

# load the model
final_model = joblib.load('final_model.pkl')

In [8]:
final_model.predict(new_data)

array([2.38500538])