In [1]:
import numpy as np 
import pandas as pd
from sklearn import datasets, model_selection

In [2]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): `datasets.load_boston` is not available in newer scikit-learn
# releases (deprecated in 1.0, removed in 1.2) — confirm the pinned version
# before re-running this notebook.
boston_data = datasets.load_boston()

In [3]:
# Print the description text bundled with the dataset (its DESCR attribute).
print(boston_data.DESCR)     

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# Pull the feature matrix and the target vector out of the loaded dataset,
# then report the shape of each.
X, Y = boston_data.data, boston_data.target
print(X.shape, Y.shape)

(506, 13) (506,)


In [5]:
# Wrap features and target in DataFrames; no column labels are supplied here,
# so both frames get default integer column labels.
X_df, Y_df = pd.DataFrame(X), pd.DataFrame(Y)

# Preview the first eight feature rows.
X_df.head(n=8)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15


In [6]:
# Preview the first eight target values.
Y_df.head(8)

Unnamed: 0,0
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
5,28.7
6,22.9
7,27.1


In [7]:
# Summary statistics for every feature column: count, mean, standard deviation,
# min, quartiles (25% / 50% / 75%) and max.
X_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [8]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X_df,
    Y_df,
    test_size=0.3,
    random_state=0,
)

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Baseline model: ordinary linear regression on the 13 original features.
# `fit` returns the estimator itself, so construction and fitting are chained.
clf01 = LinearRegression().fit(x_train, y_train)
clf01

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [11]:
# Score the baseline model on the training split and on the held-out split.
train_score01, test_score01 = (
    clf01.score(x_train, y_train),
    clf01.score(x_test, y_test),
)
train_score01, test_score01

(0.7645451026942549, 0.6733825506400194)

# Fitting a More Flexible Model with Second-Degree Features

In [12]:
# Replace the default integer column labels with the dataset's feature names,
# then show the labels and how many columns there are.
X_df.columns = boston_data.feature_names
print(X_df.columns)
X_df.shape[1]

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')


13

In [13]:
# Add a squared term (col^2) for each ORIGINAL feature so the linear model can
# capture second-degree relationships.
#
# The loop iterates over the dataset's feature names rather than the live
# `X_df.columns`: iterating the frame's own columns would, on a re-run (or after
# later cells have appended more columns), also square the derived columns and
# create names such as "CRIM^2^2". Driving the loop from the fixed feature list
# makes this cell idempotent — running it again just recomputes the same columns.
base_features = [str(name) for name in boston_data.feature_names]
for col in base_features:
    X_df[col + "^2"] = X_df[col] ** 2

# Column count after adding the squared terms.
len(X_df.columns)

26

In [14]:
# Show the current column labels of the feature frame.
X_df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'CRIM^2', 'ZN^2', 'INDUS^2', 'CHAS^2', 'NOX^2',
       'RM^2', 'AGE^2', 'DIS^2', 'RAD^2', 'TAX^2', 'PTRATIO^2', 'B^2',
       'LSTAT^2'],
      dtype='object')

In [18]:
# Re-split with the same test_size and random_state as the baseline split,
# now using the feature frame that includes the squared columns.
x2_train, x2_test, y2_train, y2_test = model_selection.train_test_split(
    X_df,
    Y_df,
    test_size=0.3,
    random_state=0,
)

In [19]:
# Linear regression refit on the features plus their squared terms.
# `fit` returns the estimator itself, so construction and fitting are chained.
clf02 = LinearRegression().fit(x2_train, y2_train)
clf02

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [20]:
# Score the squared-feature model on its training and held-out splits.
train_score02, test_score02 = (
    clf02.score(x2_train, y2_train),
    clf02.score(x2_test, y2_test),
)
train_score02, test_score02

(0.8579517538985478, 0.7484908225489381)

In [21]:
# Add second-degree interaction terms between CRIM and every other original
# feature ("CRIM*ZN", "CRIM*INDUS", ..., "CRIM*LSTAT").
#
# A single loop over the dataset's feature names replaces twelve hand-written
# assignments; the resulting column names and their order are unchanged, and
# re-running the cell simply recomputes the same columns.
for other in map(str, boston_data.feature_names):
    if other == "CRIM":
        continue  # CRIM*CRIM is already covered by the "CRIM^2" column
    X_df["CRIM*" + other] = X_df["CRIM"] * X_df[other]

In [22]:
# Report how many columns the feature frame has now, then list their labels.
print(X_df.shape[1])
X_df.columns

38


Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'CRIM^2', 'ZN^2', 'INDUS^2', 'CHAS^2', 'NOX^2',
       'RM^2', 'AGE^2', 'DIS^2', 'RAD^2', 'TAX^2', 'PTRATIO^2', 'B^2',
       'LSTAT^2', 'CRIM*ZN', 'CRIM*INDUS', 'CRIM*CHAS', 'CRIM*NOX', 'CRIM*RM',
       'CRIM*AGE', 'CRIM*DIS', 'CRIM*RAD', 'CRIM*TAX', 'CRIM*PTRATIO',
       'CRIM*B', 'CRIM*LSTAT'],
      dtype='object')

In [23]:
# Re-split with the same test_size and random_state as the earlier splits,
# now using the feature frame that also includes the CRIM interaction columns.
x3_train, x3_test, y3_train, y3_test = model_selection.train_test_split(
    X_df,
    Y_df,
    test_size=0.3,
    random_state=0,
)

In [24]:
# Linear regression refit after the CRIM interaction columns were added.
# `fit` returns the estimator itself, so construction and fitting are chained.
clf03 = LinearRegression().fit(x3_train, y3_train)
clf03

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [25]:
# Score the interaction-feature model on its training and held-out splits.
# When comparing with the previous model, a higher training score paired with
# a lower test score would point to overfitting.
train_score03, test_score03 = (
    clf03.score(x3_train, y3_train),
    clf03.score(x3_test, y3_test),
)
train_score03, test_score03

(0.874982573990824, 0.7376244073208451)