## House Pricing Dataset

In [1]:
%matplotlib inline
from sklearn.datasets import load_boston         #### Import the dataset from sklearn.

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Boston housing data; the loader returns a dict-like Bunch
# exposing (at least) 'data', 'target' and 'feature_names'.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still ships it before re-running.
df = load_boston()
df

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [3]:
type(df)

sklearn.utils.Bunch

In [4]:
# Assemble one tidy frame: the feature matrix labelled with its feature names,
# with the regression target appended as the final 'Price' column.
dataset = pd.DataFrame(df.data, columns=df.feature_names).assign(Price=df.target)

dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
dataset.shape

(506, 14)

### Dividing the dataset into Independent and Dependent features

In [6]:
# Positional split: every column except the last is an independent feature,
# and the last column is the dependent feature (target).
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [7]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [8]:
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: Price, dtype: float64

### Linear Regression
- It's always a good idea to start with Ridge or Lasso Regression, as they expose hyperparameters (e.g. `alpha`) that can be tuned, whereas plain Linear Regression has essentially none.

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [10]:
# Baseline: plain linear regression scored with 5-fold cross-validation.
# The scorer returns *negated* MSE, so values closer to 0 are better.
Linear_model = LinearRegression()
mse = cross_val_score(
    estimator=Linear_model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)  ### In practice, cross-validate on X_train / y_train rather than on all of X and y.

> **Cross Validation** -> Split the data into k folds; train on k-1 folds and validate on the remaining fold, rotating so that every fold is used as the validation set exactly once. The k scores are then combined (usually averaged) to estimate the model's performance.

- In order to try different cv values, we can build a for loop.

In [11]:
print(mse)

[-12.46030057 -26.04862111 -33.07413798 -80.76237112 -33.31360656]


- Here we get 5 (negated) MSE values because we are doing 5-fold cross-validation, one value per fold. So what we do is take the mean of these 5 values.

In [12]:
# Collapse the per-fold (negated) MSE scores into one summary number.
mean_mse = mse.mean()
mean_mse

-37.13180746769922

In [13]:
### PREDICTION ###

# STEP 1: Fit the model.
# STEP 2: Predict with the fitted model.

### Ridge Regression (L2 Regularization)
- Here we have a lot of parameters to tune. 
- Here we will use 2 libraries.
    - 1 for calling the Ridge Regression.
    - 2 for performing Hyperparameter Tuning.

In [14]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV        ### Hyper-Parameter Tuning

In [15]:
### Ridge's tunable parameters include alpha (regularisation strength) and max_iter (cap on solver iterations).
Ridge_model = Ridge()

## Hyperparameter tuning: 5-fold grid search over alpha, ranked by negated MSE
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20]}
ridge_regression = GridSearchCV(
    estimator=Ridge_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)

ridge_regression.fit(X, y)         ### In a real project, fit on the training data only.

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20]},
             scoring='neg_mean_squared_error')

In [16]:
# Winning alpha, a blank line, then its mean cross-validated (negated) MSE.
print(ridge_regression.best_params_, ridge_regression.best_score_, sep='\n\n')

{'alpha': 20}

-32.38025025182513


#### INFERENCE

- Here we get **-32.380**, which is greater than **-37.131**. Because the score is the *negative* MSE, a larger (closer to zero) value is better, so in this particular case Ridge Regression is helping: it appears to be a better model than plain Linear Regression.
    - In order to get better performance we can do scaling.
- We know that Ridge Regression is capable of reducing overfitting, and the best `alpha` (20) sits at the upper edge of the grid. So we will try extending the grid to larger `alpha` values.

In [17]:
### Ridge's tunable parameters include alpha (regularisation strength) and max_iter (cap on solver iterations).
Ridge_model = Ridge()

## Hyperparameter tuning: same search as before, with the alpha grid extended up to 100
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}
ridge_regression = GridSearchCV(
    estimator=Ridge_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)

ridge_regression.fit(X, y)         ### In a real project, fit on the training data only.

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 45, 50, 55, 100]},
             scoring='neg_mean_squared_error')

In [18]:
# Winning alpha, a blank line, then its mean cross-validated (negated) MSE.
print(ridge_regression.best_params_, ridge_regression.best_score_, sep='\n\n')

{'alpha': 100}

-29.90570194754033


### Lasso Regression (L1 Regularization)

In [19]:
from sklearn.linear_model import Lasso

In [20]:
### Lasso's tunable parameters include alpha (regularisation strength) and max_iter (cap on solver iterations).
Lasso_model = Lasso()

## Hyperparameter tuning: 5-fold grid search over alpha, ranked by negated MSE
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}
lasso_regression = GridSearchCV(
    estimator=Lasso_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)

lasso_regression.fit(X, y)         ### In a real project, fit on the training data only.

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 45, 50, 55, 100]},
             scoring='neg_mean_squared_error')

In [21]:
# Winning alpha, a blank line, then its mean cross-validated (negated) MSE.
print(lasso_regression.best_params_, lasso_regression.best_score_, sep='\n\n')

{'alpha': 1}

-35.531580220694856


## Splitting the data

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, random_state= 42)

### Linear Regression

In [24]:
# Fit the baseline on the training split (fit returns the estimator itself);
# this fitted model is what later produces the test-set predictions.
Linear_model = LinearRegression().fit(X_train, y_train)

# Cross-validate on the training split only, so the test rows stay unseen.
# cross_val_score works on clones, leaving the fitted Linear_model untouched.
mse = cross_val_score(
    estimator=Linear_model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [25]:
print(mse)

[-33.75185215 -21.4641199  -27.97099777 -17.7140812  -25.03832267]


In [26]:
# Collapse the per-fold (negated) MSE scores into one summary number.
mean_mse = mse.mean()
mean_mse

-25.187874739285057

### Ridge Regression

In [27]:
### Ridge's tunable parameters include alpha (regularisation strength) and max_iter (cap on solver iterations).
Ridge_model = Ridge()

## Hyperparameter tuning: 5-fold grid search over alpha, this time on the training split
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}
ridge_regression = GridSearchCV(
    estimator=Ridge_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)

ridge_regression.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 45, 50, 55, 100]},
             scoring='neg_mean_squared_error')

In [28]:
# Winning alpha, a blank line, then its mean cross-validated (negated) MSE.
print(ridge_regression.best_params_, ridge_regression.best_score_, sep='\n\n')

{'alpha': 0.01}

-25.186899367386978


### Lasso Regression

In [29]:
### Lasso's tunable parameters include alpha (regularisation strength) and max_iter (cap on solver iterations).
Lasso_model = Lasso()

## Hyperparameter tuning: 5-fold grid search over alpha, this time on the training split
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}
lasso_regression = GridSearchCV(
    estimator=Lasso_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)

lasso_regression.fit(X_train, y_train)         ### Fitted on the training data only.

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 45, 50, 55, 100]},
             scoring='neg_mean_squared_error')

In [30]:
# Winning alpha, a blank line, then its mean cross-validated (negated) MSE.
print(lasso_regression.best_params_, lasso_regression.best_score_, sep='\n\n')

{'alpha': 1e-15}

-25.18787473928502


## Predictions on the Test data

In [31]:
# Predict on the held-out rows with each fitted model. The two grid-search
# objects delegate predict() to their refitted best estimator
# (GridSearchCV's default refit behaviour).
y_pred_Linear, y_pred_Ridge, y_pred_Lasso = (
    fitted.predict(X_test)
    for fitted in (Linear_model, ridge_regression, lasso_regression)
)

In [32]:
from sklearn.metrics import r2_score

In [33]:
# Report the test-set R-squared for each model, one line per model.
for heading, predicted in (
    ('R-squared for Linear', y_pred_Linear),
    ('R-squared for Ridge', y_pred_Ridge),
    ('R-squared for Lasso', y_pred_Lasso),
):
    print(heading, r2_score(y_test, predicted))

R-squared for Linear 0.7261570836552478
R-squared for Ridge 0.7260978407192354
R-squared for Lasso 0.7261570836552473


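> Since all three models fit a straight line (a linear function of the features), we will not be getting 100%: they cannot capture non-linear relationships, so the accuracy is limited (R-squared of about 0.73 here). In order to improve on that we have to work with non-linear models like XGBoost.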