# Import the Libraries

In [79]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [80]:
# Gather the Data
#
# The file is whitespace-delimited and each record is wrapped over two physical
# rows: even rows hold the leading feature values, and each following odd row
# holds two more feature values plus the target in its third field.

data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on current interpreters.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Stitch the two halves of every record back together (all fields of the even
# rows + the first two fields of the odd rows).
feature_values = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
# `data` is only ever bound to the labelled DataFrame; the intermediate array
# has its own name so the variable never changes type mid-cell.
data = pd.DataFrame(
    feature_values,
    columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'],
)

In [81]:
# Preview the first rows of the assembled feature table to sanity-check the column labels.
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [82]:
# Build the modelling feature set: every column of `data` except INDUS and AGE.
# drop() returns a new frame, so `data` itself is left untouched.
features = data.drop(columns=['INDUS', 'AGE'])

In [83]:
# Confirm that INDUS and AGE are gone and the remaining columns kept their order.
features.head()

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33


In [84]:
# Take logs straight from the raw price column rather than from `target`:
# `target` is rebound to the logged DataFrame in a later cell, so reading it here
# would silently compute log(log(price)) if this cell were re-run after that one.
log_prices = np.log(raw_df.values[1::2, 2])

In [85]:
# np.log on a NumPy array returns a NumPy array; verify before wrapping it in a DataFrame.
type(log_prices)

numpy.ndarray

In [86]:
# One log price per record, as a 1-D vector.
log_prices.shape

(506,)

In [87]:
# Row count should match log_prices; the second dimension is the number of feature columns.
features.shape

(506, 11)

In [88]:
# Wrap the 1-D array of log prices in a single-column DataFrame labelled PRICE.
# Note: this rebinds `target`, which previously held the raw (un-logged) price array.
price_columns = ['PRICE']
target = pd.DataFrame(data=log_prices, columns=price_columns)
target.head()

Unnamed: 0,PRICE
0,3.178054
1,3.072693
2,3.54674
3,3.508556
4,3.589059


In [89]:
# target is now 2-D (rows x 1 column), unlike the 1-D log_prices array it was built from.
target.shape

(506, 1)

In [90]:
# Placeholder row holding one value per feature column (1 x 11) for a single property.
# np.zeros is used deliberately: np.ndarray(shape=...) only allocates memory without
# initialising it, so any "zeros" it shows are whatever happened to be in the buffer.
property_stats = np.zeros(shape=(1, 11))

In [91]:
# Display the placeholder array.
property_stats

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [94]:
# Write a sample value into row 0, column 0 — the slot that lines up with CRIM,
# the first column of `features`.
property_stats[0, 0] = 0.02

In [95]:
# Confirm the assignment landed in the first slot and left the others untouched.
property_stats

array([[0.02, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ]])

In [96]:
# Same preview as earlier (features has not been modified since); presumably repeated
# to read off column positions before indexing into property_stats.
features.head()

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33


In [98]:
# NOTE: everything between the triple quotes below is a string literal, not code —
# none of these assignments execute; the cell merely echoes the text as its output.
# The disabled snippet fills selected property_stats slots (addressed by the column
# position of CRIM, ZN, CHAS, RM and PTRATIO in `features`) with that column's mean.
"""
CRIM_IDX = 0
ZN_IDX = 1
CHAS_IDX = 2
RM_IDX = 4
PTRATIO_IDX = 8

property_stats[0][CRIM_IDX] = features['CRIM'].mean()
property_stats[0][ZN_IDX] = features['ZN'].mean()
property_stats[0][CHAS_IDX] = features['CHAS'].mean()
property_stats[0][RM_IDX] = features['RM'].mean()
property_stats[0][PTRATIO_IDX] = features['PTRATIO'].mean()

property_stats

"""




"\nCRIM_IDX = 0\nZN_IDX = 1\nCHAS_IDX = 2\nRM_IDX = 4\nPTRATIO_IDX = 8\n\nproperty_stats[0][CRIM_IDX] = features['CRIM'].mean()\nproperty_stats[0][ZN_IDX] = features['ZN'].mean()\nproperty_stats[0][CHAS_IDX] = features['CHAS'].mean()\nproperty_stats[0][RM_IDX] = features['RM'].mean()\nproperty_stats[0][PTRATIO_IDX] = features['PTRATIO'].mean()\n\nproperty_stats\n\n"

In [99]:
# Column-wise mean of every feature in one call; the result is indexed by column name.
features.mean()

CRIM         3.613524
ZN          11.363636
CHAS         0.069170
NOX          0.554695
RM           6.284634
DIS          3.795043
RAD          9.549407
TAX        408.237154
PTRATIO     18.455534
B          356.674032
LSTAT       12.653063
dtype: float64

In [100]:
# DataFrame.mean() hands back a pandas Series, not an array.
type(features.mean())

pandas.core.series.Series

In [101]:
# .values exposes the Series' data as a plain NumPy array.
type(features.mean().values)

numpy.ndarray

In [102]:
# The means come back as a 1-D vector with one entry per feature column.
features.mean().values.shape

(11,)

In [104]:
# Reshape the 1-D vector of feature means into a single-row 2-D array.
# -1 lets NumPy infer the column count instead of hard-coding the number of features.
features.mean().values.reshape(1, -1)

array([[  3.61352356,  11.36363636,   0.06916996,   0.55469506,
          6.28463439,   3.79504269,   9.54940711, 408.23715415,
         18.4555336 , 356.67403162,  12.65306324]])

In [107]:
# Default property profile: the mean of every feature column, laid out as one row.
# -1 lets NumPy infer the width, so this keeps working if the feature set changes
# instead of failing on a stale hard-coded column count.
property_stats = features.mean().values.reshape(1, -1)
property_stats

array([[  3.61352356,  11.36363636,   0.06916996,   0.55469506,
          6.28463439,   3.79504269,   9.54940711, 408.23715415,
         18.4555336 , 356.67403162,  12.65306324]])