In [1]:
import numpy as np
import pandas as pd

# Problem Understanding

Your Real Estate partner in California needs your help with pricing homes at the optimal level<br>

Help them to predict the expected sale value of properties in their State and you will get slice of their additional sales commission 💸

# Data Understanding

In [2]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
y = data['target']
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Data preparation

### Split your X data in train and test datasets
Here is the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

Always fit in the training data set

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)



In [15]:
print("My X_train shape is: ", X_train.shape)
print("My X_test shape is: ", X_test.shape)
print("My training dependent variable is: ", y_train.shape)
print("My test dependent variable is: ", y_test.shape)

My X_train shape is:  (16512, 8)
My X_test shape is:  (4128, 8)
My training dependent variable is:  (16512,)
My test dependent variable is:  (4128,)


### Split your train data in train and validation datasets

Now we are going to do some feature engineering. The validation data set measures the performance of the models which allows us to do comparisons across the models. This include linear regression and lasso. Two learners allow comparison of two models. You use this data set because TEST DATA SET IS ONLY CHECKED ONCE

In [16]:
from sklearn.model_selection import train_test_split
# We are creating a new data set because test data set is checked once

X_train_final, X_test_validation, y_train_final, y_test_validation = train_test_split(X,y, test_size = 1000, 
                                                                                           random_state = 51)

print("My final training set is: ", X_train_final.shape)
print("My validation set is: ", X_test_validation.shape)
print("My training dependent variable is: ", y_train_final.shape)
print("My test dependent variable is: ", y_test_validation.shape)






My final training set is:  (19640, 8)
My validation set is:  (1000, 8)
My training dependent variable is:  (19640,)
My test dependent variable is:  (1000,)


Now we must do the scaling

### Scale the 3 datasets using StandardScaler

In [19]:
from sklearn.preprocessing import StandardScaler 

scaler = StandardScaler()

# We fit only on train data set and transform on all data sets
scaler.fit(X_train_final)

# Perform scaling for each dataset
X_train_final_scaled = scaler.transform(X_train_final)
X_test_validation_scaled = scaler.transform(X_test_validation)
X_test_scaled = scaler.transform(X_test)

# Bring the data back into a DataFrame
X_train_scaled = pd.DataFrame(X_train_final_scaled, columns=data['feature_names'])
X_validation_scaled = pd.DataFrame(X_test_validation_scaled, columns=data['feature_names'])
X_test_scaled = pd.DataFrame(X_test_scaled, columns=data['feature_names'])

# Modelling and Model Evaluation

### Train a linear regression model

In [20]:
from sklearn.linear_model import LinearRegression
# Setting up the learner
lin_reg = LinearRegression()

# Actually learning
#This is fit on the TRAINING AND SCALED DATA
lin_reg.fit(X_train_scaled, y_train_final) 

#R-squared
initial_score = lin_reg.score(X_train_final_scaled, y_train_final)
print("The initial R-squared value for the linear model is: ", initial_score.round(2))

# Print the coefficient of the linear regression
parameters = lin_reg.coef_ 

print("The median income has a coefficient of: ", parameters[0].round(2))
print("The average room has a coefficient of: ", parameters[1].round(2))

# and intercept
inter = lin_reg.intercept_
print("The intercept of the regression is", inter.round(2))



The initial R-squared value for the linear model is:  0.6
The median income has a coefficient of:  0.83
The average room has a coefficient of:  0.12
The intercept of the regression is 2.06


### Measure the R-squared, MSE and MAE of your model
Here is the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [24]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

initial_score = lin_reg.score(X_train_scaled, y_train_final)
print("The Initial R2 value for the linear model is: ", initial_score.round(2))

#Predicting the values for the model
X_train_scaled_y_pred = lin_reg.predict(X_train_scaled)

#MSE
mse = mean_squared_error (y_train_final, X_train_scaled_y_pred)
print("The Mean Squared Error for this model is: ", mse.round(2))

#MAE
mae = mean_absolute_error (y_train_final, X_train_scaled_y_pred)
print("The Mean Absolute Error for this model is: ", mae.round(2))

The Initial R2 value for the linear model is:  0.6
The Mean Squared Error for this model is:  0.53
The Mean Absolute Error for this model is:  0.53


### Train a LASSO model

In [23]:
from sklearn.linear_model import Lasso

lasso = Lasso(.30)
lasso.fit(X_train_scaled, y_train_final)
lasso.score(X_train_scaled,y_train_final)

0.40411814446198013

### Measure the R-squared, MSE and MAE of your model

In [26]:
initial_score_1 = lasso.score(X_train_scaled, y_train_final)
print("The Initial R-Squared value for the linear model is: ", initial_score_1.round(2))

X_train_scaled_y_pred_lasso = lasso.predict(X_train_scaled)

mse = mean_squared_error(y_train_final, X_train_scaled_y_pred_lasso)
print("The mse for this model is: ", mse.round(2))

mae = mean_absolute_error(y_train_final, X_train_scaled_y_pred_lasso)
print(" The mae for this model is: ", mae.round(2)) 



The Initial R-Squared value for the linear model is:  0.4
The mse for this model is:  0.79
 The mae for this model is:  0.69


## Double Check predicted values

In [27]:
print(X_train_scaled_y_pred_lasso)
print(X_train_scaled_y_pred)

[1.83049098 3.06381423 1.51770607 ... 2.55035529 1.54524753 1.92540788]
[1.14074975 3.12203254 0.8348561  ... 3.2768874  1.02381601 1.78280422]


# Interprete your winning model

### What can you tell your business partner by looking at the coefficients?

In [None]:
# 
# 
# 