In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn library
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

import warnings
# Install a global 'ignore' filter. No category, module or message restriction
# is given, so every warning raised after this point is suppressed.
# NOTE(review): this presumably also hides convergence/deprecation warnings from
# the estimators fitted below — consider filtering by category instead; confirm
# which warnings this was meant to silence.
warnings.filterwarnings('ignore')

In [4]:
# Fetch the California housing dataset through scikit-learn's loader.
data = fetch_california_housing()

# Target values are taken as-is from the loader; the feature matrix is wrapped
# in a DataFrame so each column carries its feature name.
y = data.target
x = pd.DataFrame(data.data, columns=data.feature_names)

# Preview the first rows of the feature table.
x.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [23]:
# Train/test split: hold out 20% of the rows, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both splits so no test-set statistics enter the transformation.
scale = StandardScaler()
scale.fit(x_train)
x_train_scaled = scale.transform(x_train)
x_test_scaled = scale.transform(x_test)

# Show the scaled training features followed by the training targets.
print(x_train_scaled, '\n\n')
print('y_train data: ', y_train)

[[-0.326196    0.34849025 -0.17491646 ...  0.05137609 -1.3728112
   1.27258656]
 [-0.03584338  1.61811813 -0.40283542 ... -0.11736222 -0.87669601
   0.70916212]
 [ 0.14470145 -1.95271028  0.08821601 ... -0.03227969 -0.46014647
  -0.44760309]
 ...
 [-0.49697313  0.58654547 -0.60675918 ...  0.02030568 -0.75500738
   0.59946887]
 [ 0.96545045 -1.07984112  0.40217517 ...  0.00707608  0.90651045
  -1.18553953]
 [-0.68544764  1.85617335 -0.85144571 ... -0.08535429  0.99543676
  -1.41489815]] 


y_train data:  [1.03  3.821 1.726 ... 2.221 2.835 3.25 ]


In [6]:
# Linear models


def _fit_and_evaluate(estimator, features_train, target_train, features_test, target_test):
    """Fit ``estimator`` on the training split and evaluate it on both splits.

    Parameters
    ----------
    estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in place.
    features_train, target_train
        Data used for fitting and for the training score.
    features_test, target_test
        Held-out data used for the predictions, R^2 and mean squared error.

    Returns
    -------
    tuple
        ``(train_score, test_predictions, test_r2, test_mse)`` where
        ``train_score`` is ``estimator.score`` on the training split and the
        remaining values are computed from ``estimator.predict(features_test)``.
    """
    estimator.fit(features_train, target_train)
    train_score = estimator.score(features_train, target_train)
    test_predictions = estimator.predict(features_test)
    test_r2 = r2_score(target_test, test_predictions)
    test_mse = mean_squared_error(target_test, test_predictions)
    return train_score, test_predictions, test_r2, test_mse


# 1 Simple linear regression (no penalty)
lin_reg = LinearRegression()
lreg_train_score, y_pred_reg, lreg_test_score, mse_reg = _fit_and_evaluate(
    lin_reg, x_train_scaled, y_train, x_test_scaled, y_test
)

# 2 Ridge regression
ridge_reg = Ridge(alpha = 10) # L-2 penalty
ridge_train_score, y_pred_ridge, ridge_test_score, mse_ridge = _fit_and_evaluate(
    ridge_reg, x_train_scaled, y_train, x_test_scaled, y_test
)

# 3 Lasso regression
lasso_reg = Lasso(alpha = 0.01) # L-1 penalty
lasso_train_score, y_pred_lasso, lasso_test_score, mse_lasso = _fit_and_evaluate(
    lasso_reg, x_train_scaled, y_train, x_test_scaled, y_test
)


In [8]:
# Metrics
# Each section below walks the same three fitted models in the same order:
# plain linear regression, ridge, lasso.

# Mean squared error on the test split, rounded to 4 decimals
print("Mean Square Error")
for mse_label, mse_value in (
    ('Linear Regression MSE: ', mse_reg),
    ('Ridge Regression MSE: ', mse_ridge),
    ('Lasso Regression MSE: ', mse_lasso),
):
    print(mse_label, round(mse_value, 4))

# Fitted coefficient vectors
print('\nCoefficints of the model\n')
for coef_label, fitted_model in (
    ('Linear Regression Coefficient: ', lin_reg),
    ('Ridge Regression Coefficient: ', ridge_reg),
    ('Lasso Regression Coefficient: ', lasso_reg),
):
    print(coef_label, fitted_model.coef_)

# How many coefficients each model kept away from exactly zero
print('\nNon- Zero coefficient\n')
for count_label, fitted_model in (
    ('No of non-zero coefficient in linear regression: ', lin_reg),
    ('No of non-zero coefficient in ridge regression:  ', ridge_reg),
    ('No of non-zero coefficient in lasso regression: ', lasso_reg),
):
    print(count_label, np.count_nonzero(fitted_model.coef_))

Mean Square Error
Linear Regression MSE:  0.5559
Ridge Regression MSE:  0.5555
Lasso Regression MSE:  0.5483

Coefficints of the model

Linear Regression Coefficient:  [ 0.85438303  0.12254624 -0.29441013  0.33925949 -0.00230772 -0.0408291
 -0.89692888 -0.86984178]
Ridge Regression Coefficient:  [ 0.85381377  0.12331557 -0.2924118   0.33674955 -0.0020552  -0.04086734
 -0.88938399 -0.86219926]
Lasso Regression Coefficient:  [ 0.80095744  0.12708701 -0.16275931  0.20620745 -0.         -0.03060176
 -0.79011254 -0.75567379]

Non- Zero coefficient

No of non-zero coefficient in linear regression:  8
No of non-zero coefficient in ridge regression:   8
No of non-zero coefficient in lasso regression:  7


In [9]:
# Train / test scores, reported as percentages rounded to 2 decimals.
# Each entry: (section header, model name used in the labels, train score, test score)
score_report = (
    ('Linear Regression: ', 'Linear Regression', lreg_train_score, lreg_test_score),
    ('\nRidge Regression\n', 'Ridge Regression', ridge_train_score, ridge_test_score),
    ('\nLasso Regression\n', 'Lasso Regression', lasso_train_score, lasso_test_score),
)

for section_header, model_name, train_score, test_score in score_report:
    print(section_header)
    print(model_name + ' training score: ', round(train_score * 100, 2))
    print(model_name + ' testing score: ', round(test_score * 100, 2))

Linear Regression: 
Linear Regression training score:  61.26
Linear Regression testing score:  57.58

Ridge Regression

Ridge Regression training score:  61.25
Ridge Regression testing score:  57.61

Lasso Regression

Lasso Regression training score:  60.85
Lasso Regression testing score:  58.16


In [10]:
# Elastic net model: a single estimator combining the L1 and L2 penalties
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.5)
elastic_net.fit(x_train_scaled, y_train)

# Evaluate on both splits first, then report.
elastic_train_score = elastic_net.score(x_train_scaled, y_train)
y_pred_elastic = elastic_net.predict(x_test_scaled)
elastic_test_score = r2_score(y_test, y_pred_elastic)
mse_elastic = mean_squared_error(y_test, y_pred_elastic)

print('Training Score: ', round(elastic_train_score * 100, 2))
print('Testing Score: ', round(elastic_test_score * 100, 2))
print('Mean Square Error: ', round(mse_elastic, 4))


Training Score:  61.01
Testing Score:  58.03
Mean Square Error:  0.55


In [12]:
# Simple polynomial regression: expand the scaled features up to the given degree.
# The expansion is fitted on the training features and reused for the test features.
# NOTE(review): PolynomialFeatures is left at its defaults, which appear to include
# a constant bias column (column 0 of the expanded frame previews as 1.0). That
# duplicates the intercept LinearRegression fits by default — consider
# include_bias=False; confirm before changing, since it alters the column count.
degree = 2
poly_feature = PolynomialFeatures(degree=degree)
poly_feature.fit(x_train_scaled)
x_train_poly = poly_feature.transform(x_train_scaled)
x_test_poly = poly_feature.transform(x_test_scaled)

In [13]:
# Wrap both polynomial feature matrices in DataFrames (default integer column labels)
x_train_df, x_test_df = (
    pd.DataFrame(poly_matrix) for poly_matrix in (x_train_poly, x_test_poly)
)

# Preview the 2nd-order polynomial training features
x_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,1.0,-0.326196,0.34849,-0.174916,-0.208365,0.768276,0.051376,-1.372811,1.272587,0.106404,...,0.590248,0.039471,-1.054698,0.977698,0.00264,-0.07053,0.065381,1.884611,-1.747021,1.619477
1,1.0,-0.035843,1.618118,-0.402835,-0.12853,-0.098901,-0.117362,-0.876696,0.709162,0.001285,...,0.009781,0.011607,0.086706,-0.070137,0.013774,0.102891,-0.083229,0.768596,-0.62172,0.502911
2,1.0,0.144701,-1.95271,0.088216,-0.257538,-0.449818,-0.03228,-0.460146,-0.447603,0.020939,...,0.202336,0.01452,0.206982,0.20134,0.001042,0.014853,0.014448,0.211735,0.205963,0.200349
3,1.0,-1.017864,0.586545,-0.600015,-0.145156,-0.007434,0.077507,-1.382172,1.232698,1.036048,...,5.5e-05,-0.000576,0.010276,-0.009164,0.006007,-0.107128,0.095543,1.910399,-1.703801,1.519545
4,1.0,-0.171488,1.142008,0.349007,0.086624,-0.485877,-0.068832,0.532084,-0.108551,0.029408,...,0.236077,0.033444,-0.258527,0.052743,0.004738,-0.036624,0.007472,0.283113,-0.057758,0.011783


In [14]:
# Ordinary linear regression fitted on the polynomial-expanded features
model = LinearRegression().fit(x_train_df, y_train)

# Score on the training split, then predict and score on the held-out split.
poly_train_score = model.score(x_train_df, y_train)
y_pred_poly = model.predict(x_test_df)
poly_test_score = r2_score(y_test, y_pred_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)

print('Training Score: ', round(poly_train_score * 100, 2))
print('Testing Score: ', round(poly_test_score * 100, 2))
print('Mean Square Error: ', round(mse_poly, 4))

Training Score:  68.53
Testing Score:  64.57
Mean Square Error:  0.4643
