In [1]:
import pandas as pd 
import numpy as np


# Import linear_model module from scikit-learn for various linear regression and classification algorithms
from sklearn import linear_model 

# Import train_test_split function from scikit-learn to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split


#### Loading the California housing dataset from sklearn.datasets

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset through scikit-learn's dataset loader.
# Later cells read its .data, .target and .feature_names attributes.
california_housing = fetch_california_housing()

# Print the dataset's built-in description (attributes, target units, provenance).
print(f"description : {california_housing.DESCR}")


description : .. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using 

#### Transform the data into DataFrames

In [3]:
# Feature matrix: one column per predictor, labelled with the feature names
# that ship with the dataset.
df_x = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
)

# Target: the median house value of each block group, kept as a one-column
# DataFrame (no explicit column name is given, so pandas labels it 0).
df_y = pd.DataFrame(data=california_housing.target)


In [4]:
# describe() gives a quick statistical overview (count, mean, std, min,
# quartiles, max) of every column (feature) in the DataFrame.
df_x.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


#### Linear regression model

In [5]:
# Create an (as yet unfitted) linear regression estimator; it is trained
# later in the notebook with reg.fit(x_train, y_train).
reg = linear_model.LinearRegression()

#### Split the data into 67% training and 33% testing

In [6]:
# Hold out 33% of the rows for testing; the remaining 67% are used for training.
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(df_x,df_y, test_size=0.33, random_state = 42)



####  Train the model

In [7]:
# Fit the regression on the training split: fit() analyzes x_train and y_train
# and calculates the coefficient for each feature in x_train that best
# predicts the target y_train.
reg.fit(x_train, y_train)

#### Print the coefficients/weights for each feature/column of the model

In [8]:
# Learned weights of the fitted model. Because y_train is a one-column
# DataFrame, coef_ prints as a 2-D array with a single row of eight weights.
# NOTE(review): the weights are presumably in x_train's column order — confirm
# against the scikit-learn LinearRegression documentation.
print(reg.coef_)

[[ 4.44870466e-01  9.55004561e-03 -1.21991503e-01  7.79144696e-01
  -7.68990808e-08 -3.29948505e-03 -4.19131153e-01 -4.34103468e-01]]


In [9]:
# NOTE(review): this repeats the earlier df_x.describe() cell; df_x is not
# reassigned in between, so the summary shown is the same.
df_x.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [10]:
#### Print the predictions for the test set

In [11]:
# Predict the target (median house value) for every row of the held-out test features.
y_pred = reg.predict(x_test)
# The result prints as a 2-D array with one predicted value per test row.
print(y_pred)

[[0.72563462]
 [1.76650223]
 [2.70545812]
 ...
 [1.25803135]
 [1.66673014]
 [2.25826279]]


In [12]:
# Print the actual target values of the test split, for comparison with y_pred.
print(y_test)

             0
20046  0.47700
3024   0.45800
15663  5.00001
20484  2.18600
9814   2.78000
...        ...
15316  1.66100
14772  0.93600
12870  1.07000
13476  1.30700
16123  3.07100

[6812 rows x 1 columns]


#### Using mean squared error (MSE) to check the model's prediction error on the test set

In [13]:
# Manual mean squared error: average of the squared differences between the
# predictions and the actual test targets.
# y_pred is a NumPy array while y_test is a one-column DataFrame. Convert
# y_test to a plain array first: np.mean over a DataFrame depends on the
# pandas version (releases before 2.0 reduce per column and return a Series
# instead of a single number), whereas np.mean over an ndarray always yields
# one scalar.
print(np.mean((y_pred - y_test.to_numpy()) ** 2))

0.5369686543372468


#### Using sklearn.metrics to compute the MSE and RMSE of the model

In [14]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the actual and the predicted test targets.
mse = mean_squared_error(y_test,y_pred)
print("MSE: ", mse)

# Root mean squared error: the square root of the MSE, which puts the error
# back in the target's own units (hundreds of thousands of dollars).
rmse = np.sqrt(mse)
print("RMSE:", rmse)

MSE:  0.5369686543372468
RMSE: 0.732781450595774
