**Linear Regression on Boston Housing Dataset**


In [0]:
#Importing the required librarires:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

In [0]:
#Importing the dataset:
from sklearn.datasets import load_boston
data = load_boston()


In [0]:
#Create a Dataframe and print the dataset:
df = pd.DataFrame(data.data,columns=data.feature_names)
df.head() # print out the first five elements of the data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [0]:
df.describe() #Used to describe different parameters like mean , standard deviation etc. of the data.

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [0]:
df['MEDV'] = data.target    #Adding the target varuable MEDV to the data.

In [0]:
#We will perform Simple Linear Regression, using the values of lstat as the predictor:
X = pd.DataFrame(np.c_[df['LSTAT'], df['RM']], columns = ['LSTAT','RM'])
y = df['MEDV']
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape)
print(y_train.shape)

(379, 2)
(379,)


In [0]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [0]:
y_pred = lr.predict(X_test)

In [0]:
mse = np.mean((y_pred-y_test))**2
mse

0.0008626401448249967

In [0]:
coeff = DataFrame(X_train.columns)

coeff['Coefficient Estimate'] = Series(lr.coef_)

coeff



Unnamed: 0,0,Coefficient Estimate
0,LSTAT,-0.656603
1,RM,4.827543


**Multiple Linear Regression**

In [0]:
#In multiple Linear Regression, we will be using all the predictor variables to predict the target variable:

X = pd.DataFrame(np.c_[df['LSTAT'], df['RM'],df['CRIM'],df['ZN'],df['INDUS'],df['CHAS'],df['NOX'],df['AGE'],df['DIS'],df['RAD'],df['TAX'],df['PTRATIO'],df['B']], columns = ['LSTAT','RM','CRIM','ZN','INDUS','CHAS','NOX','AGE','DIS','RAD','TAX','PTRATIO','B'])
y = df['MEDV']
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape)
print(y_train.shape)

(379, 13)
(379,)


In [0]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [0]:
y_pred = lr.predict(X_test)

In [0]:
mse = np.mean((y_pred-y_test))**2
mse

1.1622450701768692

In [0]:
coeff = DataFrame(X_train.columns)

coeff['Coefficient Estimate'] = Series(lr.coef_)

coeff




Unnamed: 0,0,Coefficient Estimate
0,LSTAT,-0.455755
1,RM,4.271652
2,CRIM,-0.100247
3,ZN,0.043916
4,INDUS,-0.033247
5,CHAS,3.141
6,NOX,-11.053903
7,AGE,-0.017691
8,DIS,-1.390644
9,RAD,0.26246


**Interaction terms and Non-Linear Transformations**

In [0]:
df['Value'] = df.RM*df.LSTAT   # Interaction term between RM and LSTAT
X = pd.DataFrame(np.c_[df['LSTAT'], df['RM'],df['Value']], columns = ['LSTAT','RM','Value'])
y = df['MEDV']
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape)
print(y_train.shape)

(379, 3)
(379,)


In [0]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [0]:
y_pred = lr.predict(X_test)
mse = np.mean((y_pred-y_test))**2
mse


0.001045480557939052

In [0]:
coeff = DataFrame(X_train.columns)

coeff['Coefficient Estimate'] = Series(lr.coef_)

coeff

Unnamed: 0,0,Coefficient Estimate
0,LSTAT,2.04515
1,RM,9.431551
2,Value,-0.460535


In [0]:
#Non-Linear Terms or Polynomial Features:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X = pd.DataFrame(np.c_[df['LSTAT'], df['RM'],df['CRIM'],df['ZN'],df['INDUS'],df['CHAS'],df['NOX'],df['AGE'],df['DIS'],df['RAD'],df['TAX'],df['PTRATIO'],df['B']], columns = ['LSTAT','RM','CRIM','ZN','INDUS','CHAS','NOX','AGE','DIS','RAD','TAX','PTRATIO','B'])
y = df['MEDV']
poly.fit(X)
X_poly = poly.transform(X)
X_train,X_test,y_train,y_test = train_test_split(X_poly,y)
print(X_train.shape)
print(y_train.shape)

(379, 105)
(379,)


In [0]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [0]:
y_pred = lr.predict(X_test)
mse = np.mean((y_pred-y_test))**2
mse



0.024271096318488714

**Qualitative Predictors**

In [0]:
from google.colab import files
uploaded = files.upload()

Saving Carseats.csv to Carseats.csv


In [0]:
import io
df = pd.read_csv(io.BytesIO(uploaded['Carseats.csv']))

In [0]:
df.head()

Unnamed: 0.1,Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,1,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,2,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,3,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,4,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,5,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [0]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.5,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,115.614301,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,1.0,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,100.75,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,200.5,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,300.25,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,400.0,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


In [0]:
df_dummies = pd.get_dummies(df) #Generating dummy variables for categorical variables.
df_dummies.head()

Unnamed: 0.1,Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
0,1,9.5,138,73,11,276,120,42,17,1,0,0,0,1,0,1
1,2,11.22,111,48,16,260,83,65,10,0,1,0,0,1,0,1
2,3,10.06,113,35,10,269,80,59,12,0,0,1,0,1,0,1
3,4,7.4,117,100,4,466,97,55,14,0,0,1,0,1,0,1
4,5,4.15,141,64,3,340,128,38,13,1,0,0,0,1,1,0


In [0]:

X = pd.DataFrame(np.c_[df_dummies['CompPrice'], df_dummies['Income'],df_dummies['Advertising'],df_dummies['Population'],df_dummies['Price'],df_dummies['Age'],df_dummies['Education'],df_dummies['ShelveLoc_Bad'],df_dummies['ShelveLoc_Good'],df_dummies['ShelveLoc_Medium'],df_dummies['Urban_No'],df_dummies['Urban_Yes'],df_dummies['US_No'],df_dummies['US_Yes']], columns = ['CompPrice','Income','Advertising','Population','Price','Age','Education','ShelveLoc_Bad','ShelveLoc_Good','ShelveLoc_Medium','Urban_No','Urban_Yes','US_No','US_Yes'])
y = df_dummies['Sales']
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape)
print(y_train.shape)

(300, 14)
(300,)
