In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import r2_score

In [3]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset; the returned object exposes the
# `data`, `target`, `feature_names` and `DESCR` attributes used below.
housing = fetch_california_housing()

In [5]:
# Show the dataset's bundled description. Splitting on newlines and re-joining
# prints exactly the same text as emitting the lines one at a time.
lines = housing.DESCR.split('\n')
print('\n'.join(lines))

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [6]:
# Tabular view of the feature matrix with named columns (used below for display).
df = pd.DataFrame(housing.data, columns=housing.feature_names)

In [7]:
# Goal: predict the median house value (the target) from the 8 numeric features.
# The models below are fit on these raw arrays rather than on `df`.
X = housing.data
y = housing.target
len(y)  # number of samples

20640

In [8]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [9]:
# Hold out a test set using train_test_split's default split proportions.
# A fixed random_state makes the shuffle, and therefore every score computed
# below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Linear Regression Model

In [10]:
# Unregularized linear baseline that the penalized models are compared against.
reg = LinearRegression()

In [11]:
# Fit on the training split only; the test split is kept aside for scoring.
reg.fit(X_train, y_train)

LinearRegression()

In [12]:
# Predict target values for the held-out test features.
pred_linReg = reg.predict(X_test)

In [13]:
# R2 of the linear baseline on the held-out split.
r2_linReg = r2_score(y_true=y_test, y_pred=pred_linReg)
r2_linReg

0.6143676318619615

In [14]:
# Single-entry mapping of label -> score, collected later for the comparison table.
linReg = dict(r2_linReg=r2_linReg)

## Lasso Model (L1 Regularization)

In [15]:
# Lasso (L1-penalized) regression at scikit-learn's default regularization strength.
# NOTE(review): the features are passed in unscaled, so the penalty may act unevenly
# across columns with very different ranges. Consider standardizing first; confirm intent.
reg_lso = Lasso()

In [16]:
# Fit the default-strength Lasso on the training split.
reg_lso.fit(X_train, y_train)

Lasso()

In [17]:
# Predict on the held-out test features.
pred_lso = reg_lso.predict(X_test)

In [18]:
# R2 of the default-strength Lasso on the held-out split.
r2_lasso = r2_score(y_true=y_test, y_pred=pred_lso)
r2_lasso

0.2859427748009815

In [19]:
# Label -> score entry for the default-strength Lasso.
lasso = dict(r2_lasso=r2_lasso)

### Reduce the amount of L1 Regularization

In [20]:
# Same model with alpha lowered to 0.5, i.e. a weaker L1 penalty than the default run.
reg_lso_2 = Lasso(alpha=0.5)

In [21]:
# Fit the weaker-penalty Lasso on the same training split.
reg_lso_2.fit(X_train, y_train)

Lasso(alpha=0.5)

In [22]:
# Predict on the held-out test features.
pred_lso_2 = reg_lso_2.predict(X_test)

In [23]:
# R2 of the alpha=0.5 Lasso on the held-out split.
r2_lasso_2 = r2_score(y_true=y_test, y_pred=pred_lso_2)
r2_lasso_2

0.4628431806113248

In [24]:
# Label -> score entry for the alpha=0.5 Lasso.
lasso_2 = dict(r2_lasso_2=r2_lasso_2)

## Ridge Model (L2 Regularization)

In [25]:
# Ridge (L2-penalized) regression at scikit-learn's default regularization strength.
reg_ridge = Ridge()

In [26]:
# Fit the default-strength Ridge on the training split.
reg_ridge.fit(X_train, y_train)

Ridge()

In [27]:
# Predict on the held-out test features.
pred_ridge = reg_ridge.predict(X_test)

In [28]:
# R2 of the default-strength Ridge on the held-out split.
r2_ridge = r2_score(y_true=y_test, y_pred=pred_ridge)
r2_ridge

0.6143690521007702

In [29]:
# Label -> score entry for the default-strength Ridge.
ridge = dict(r2_ridge=r2_ridge)

### Reduce the amount of L2 Regularization

In [30]:
# Same model with alpha lowered to 0.5, i.e. a weaker L2 penalty than the default run.
reg_ridge_2 = Ridge(alpha=0.5)

In [31]:
# Fit the weaker-penalty Ridge on the same training split.
reg_ridge_2.fit(X_train, y_train)

Ridge(alpha=0.5)

In [32]:
# Predict on the held-out test features.
pred_ridge_2 = reg_ridge_2.predict(X_test)

In [33]:
# R2 of the alpha=0.5 Ridge on the held-out split.
r2_ridge_2 = r2_score(y_true=y_test, y_pred=pred_ridge_2)
r2_ridge_2

0.6143683520722076

In [34]:
# Label -> score entry for the alpha=0.5 Ridge.
ridge_2 = dict(r2_ridge_2=r2_ridge_2)

## Compare a non-linear model: Random Forest

In [35]:
# Non-linear comparison model. Seeding the forest fixes its internal randomness,
# so refitting on the same training split reproduces the same score.
reg_rf = RandomForestRegressor(random_state=42)

In [36]:
# Train, predict on the held-out split and score in one cell.
# `fit` returns the estimator itself, so fit and predict can be chained.
pred_reg_rf = reg_rf.fit(X_train, y_train).predict(X_test)
r2_reg_rf = r2_score(y_true=y_test, y_pred=pred_reg_rf)
r2_reg_rf

0.8147080268335121

In [37]:
# Label -> score entry for the random forest.
r2_rf = dict(r2_reg_rf=r2_reg_rf)

## Compare the various models

In [38]:
# Gather the per-model single-entry score dicts, in the order the models were fit.
obj = [linReg, lasso, lasso_2, ridge, ridge_2, r2_rf]

In [39]:
# Flatten the list of single-entry dicts into one {label: score} mapping,
# and keep the labels in encounter order alongside it.
o2 = {label: score for scores in obj for label, score in scores.items()}
names = [label for scores in obj for label in scores]

In [40]:
# Display the merged {label: R2} mapping.
o2

{'r2_linReg': 0.6143676318619615,
 'r2_lasso': 0.2859427748009815,
 'r2_lasso_2': 0.4628431806113248,
 'r2_ridge': 0.6143690521007702,
 'r2_ridge_2': 0.6143683520722076,
 'r2_reg_rf': 0.8147080268335121}

In [44]:
# One-row frame: one column per model label, a single row holding the R2 scores.
d2 = pd.DataFrame(o2, index=['R2 California House Prices'])

In [45]:
# Build the models-as-rows table directly from the score mapping instead of
# reassigning `d2 = d2.T`. That form flips the frame back on every re-run of this
# cell, after which sorting by the score column fails. This version always yields
# the same frame: index = model labels, one score column.
d2 = pd.Series(o2, name='R2 California House Prices').to_frame()
d2

Unnamed: 0,R2 California House Prices
r2_linReg,0.614368
r2_lasso,0.285943
r2_lasso_2,0.462843
r2_ridge,0.614369
r2_ridge_2,0.614368
r2_reg_rf,0.814708


In [46]:
# Rank the models by R2 in ascending order, so the best-scoring model is the last row.
d3 = d2.sort_values(by='R2 California House Prices')
d3

Unnamed: 0,R2 California House Prices
r2_lasso,0.285943
r2_lasso_2,0.462843
r2_linReg,0.614368
r2_ridge_2,0.614368
r2_ridge,0.614369
r2_reg_rf,0.814708
