# Simplilearn Scikit-learn for Machine Learning
## Supervised Machine Learning - Linear Regression

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_boston

In [3]:
import warnings

# load_boston is deprecated in scikit-learn 1.0 and removed in 1.2; on the
# versions that still ship it, calling it emits a warning.
# Silence warnings only while loading, instead of installing a global
# 'ignore' filter, so that warnings raised by later cells stay visible.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    boston_dataset = load_boston()

In [4]:
# Show the description text bundled with the dataset.
print(boston_dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
# Show the feature (column) names in dataset order.
print(boston_dataset.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [6]:
# Wrap the raw feature array in a DataFrame; the columns still carry
# default integer labels at this point.
df = pd.DataFrame(data=boston_dataset.data)

In [7]:
# Label the columns with the dataset's feature names.
df.columns = boston_dataset.feature_names

In [8]:
# Check the frame's dimensions as (rows, columns).
df.shape

(506, 13)

In [9]:
# Preview the first rows to sanity-check the values and column labels.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## Linear Regression Model

In [10]:
# Append the target values to the frame as a new 'price' column.
df["price"] = boston_dataset.target

In [11]:
# List the column labels; 'price' is expected after the feature columns.
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'price'],
      dtype='object')

In [12]:
# Preview the first two rows, including the appended target column.
df.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6


In [13]:
# Feature matrix (model inputs): the dataset's raw data array.
x_features = boston_dataset.data

In [14]:
# Target vector (model output): the values the regression learns to predict.
y_target = boston_dataset.target

In [15]:
#import linear model which is the estimator
from sklearn.linear_model import LinearRegression

In [16]:
# Create the estimator with its default settings; it is fitted in later cells.
linReg = LinearRegression()

In [17]:
# Fit the estimator on the complete dataset (all features, all rows).
linReg.fit(X=x_features, y=y_target)

LinearRegression()

In [18]:
# Print the intercept (bias term) of the fitted model.
# The '%.2f' format specifier rounds the value to 2 decimal places.
print('The estimated intercept is: %.2f'% linReg.intercept_)

The estimated intercept is: 36.46


In [19]:
# Print the fitted coefficients (one value per feature column).
print('The coefficient values are: ', linReg.coef_)

The coefficient values are:  [-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]


In [20]:
# Hold out part of the data so the model can be evaluated on samples it was
# not fitted on. No test_size is given, so train_test_split uses its default
# (25% of the rows go to the test set).
# random_state pins the shuffle: without it every run draws a different split
# and the metrics computed from the test set change from run to run.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    x_features, y_target, random_state=42
)

In [21]:
# Shape of the full feature matrix, for comparison with the split sizes.
print(boston_dataset.data.shape)

(506, 13)


In [22]:
# Shapes of the train/test feature matrices and target vectors.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(379, 13) (127, 13) (379,) (127,)


In [23]:
# Refit the same estimator on the training split only; this replaces the
# parameters learned from the earlier fit on the full dataset.
linReg.fit(X=X_train, y=Y_train)

LinearRegression()

In [24]:
# Mean squared error (MSE) on the test set: the average of the squared
# residuals (prediction minus actual value). Lower is better.
MSE=np.mean((linReg.predict(X_test)-Y_test)**2)
MSE

24.796452747304276

In [25]:
# score() returns R^2, the coefficient of determination, on the test set.
# The closer the value is to 1, the more of the target's variance the model explains.
linReg.score(X_test,Y_test)

0.6459748491304176

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
# Predictions for the held-out test samples.
Y_hat = linReg.predict(X_test)

In [28]:
# Cross-check the manually computed MSE against scikit-learn's implementation.
mean_squared_error(y_true=Y_test, y_pred=Y_hat)

24.796452747304276

In [29]:
# The same MSE a third way, using np.square and np.mean.
MSE = np.mean(np.square(Y_test - linReg.predict(X_test)))

In [30]:
# Display the recomputed MSE.
MSE

24.796452747304276