# Import the Libraries

In [229]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [230]:
# Gather the Data

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2] 
data = pd.DataFrame(data, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO','B','LSTAT'])

In [231]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [232]:
features = data.drop(['INDUS', 'AGE'], axis = 1)

In [233]:
features.head()

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33


In [234]:
boston_target = target

In [235]:
log_prices = np.log(target)

In [236]:
type(log_prices)

numpy.ndarray

In [237]:
log_prices.shape

(506,)

In [238]:
features.shape

(506, 11)

In [239]:
# Transforming the log prices into a dataframe
target = pd.DataFrame(log_prices, columns=['PRICE'])
target.head()

Unnamed: 0,PRICE
0,3.178054
1,3.072693
2,3.54674
3,3.508556
4,3.589059


In [240]:
target.shape

(506, 1)

In [241]:
## Assaining the dummy va
property_stats = np.ndarray(shape = (1, 11)) 

In [242]:
property_stats

array([[            nan, 0.00000000e+000, 1.23365143e-311,
        2.02369289e-320, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 4.26000000e-001]])

In [243]:
property_stats[0][0] = .02

In [244]:
property_stats

array([[2.00000000e-002, 0.00000000e+000, 1.23365143e-311,
        2.02369289e-320, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 4.26000000e-001]])

In [245]:
features.head()

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.9,5.33


In [246]:

CRIM_IDX = 0
ZN_IDX = 1
CHAS_IDX = 2
RM_IDX = 4
PTRATIO_IDX = 8
"""
property_stats[0][CRIM_IDX] = features['CRIM'].mean()
property_stats[0][ZN_IDX] = features['ZN'].mean()
property_stats[0][CHAS_IDX] = features['CHAS'].mean()
property_stats[0][RM_IDX] = features['RM'].mean()
property_stats[0][PTRATIO_IDX] = features['PTRATIO'].mean()

property_stats

"""




"\nproperty_stats[0][CRIM_IDX] = features['CRIM'].mean()\nproperty_stats[0][ZN_IDX] = features['ZN'].mean()\nproperty_stats[0][CHAS_IDX] = features['CHAS'].mean()\nproperty_stats[0][RM_IDX] = features['RM'].mean()\nproperty_stats[0][PTRATIO_IDX] = features['PTRATIO'].mean()\n\nproperty_stats\n\n"

In [247]:
features.mean()

CRIM         3.613524
ZN          11.363636
CHAS         0.069170
NOX          0.554695
RM           6.284634
DIS          3.795043
RAD          9.549407
TAX        408.237154
PTRATIO     18.455534
B          356.674032
LSTAT       12.653063
dtype: float64

In [248]:
type(features.mean())

pandas.core.series.Series

In [249]:
type(features.mean().values)

numpy.ndarray

In [250]:
features.mean().values.shape

(11,)

In [251]:
# Reshaping the feature's values shape
features.mean().values.reshape(1, 11)

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [252]:
property_stats = features.mean().values.reshape(1, 11)
property_stats

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [253]:
regr = LinearRegression().fit(features, target) 
fitted_vals = regr.predict(features)

# Calculate the MSE and RMSEv using sklearn

MSE = mean_squared_error(target, fitted_vals)
print(MSE)
RMSE = np.sqrt(MSE)
print(RMSE)

0.03516080084618688
0.18751213519713034


In [254]:
def get_log_estimate(nr_rooms,
                    student_per_classroom,
                    next_to_river = False,
                    high_confidence = True):
    # Configure Property
    property_stats[0][RM_IDX] = nr_rooms
    property_stats[0][PTRATIO_IDX] = student_per_classroom
    
    if next_to_river:
        property_stats[0][CHAS_IDX] = 1
    else:
        property_stats[0][CHAS_IDX] = 0
        
    # Make Prediction
    log_estimate = regr.predict(property_stats)[0][0]
    
    # Calculation Range
    if high_confidence:
        upper_bound = log_estimate + 2*RMSE
        lowe_bound = log_estimate - 2*RMSE
        interval = 95
    else:
        upper_bound = log_estimate + RMSE
        lowe_bound = log_estimate - RMSE
        interval = 68
    return log_estimate, upper_bound, lowe_bound, interval

In [255]:
get_log_estimate(3, 20, next_to_river= True, high_confidence= False)



(2.7767581914803996, 2.96427032667753, 2.5892460562832693, 68)

In [256]:
np.median(boston_target)

21.2

In [257]:
# Converting the log price estimate using 1970's price
# Upper and lower bounds to today's price

zillow_median_price = 583.3
scale_factor = zillow_median_price / np.median(boston_target)

log_est, upper, lower, conf = get_log_estimate(9, student_per_classroom=15,
                                              next_to_river=False,
                                              high_confidence=False)

# Convert to today's price
dollar_estimate = np.e**log_est * 1000 * scale_factor
dollar_hi = np.e**upper * 1000 * scale_factor
dollar_low = np.e**lower * 1000 * scale_factor

# Round estimate 
rounded_est = round(dollar_estimate, -3)
rounded_hi = round(dollar_hi, -3)
rounded_low = round(dollar_low, -3)

print(f'The estimated property value is {rounded_est}')
print(f'At {conf}% confidence the valuation range is')
print(f'USD {rounded_low} at the lower end to USD {rounded_hi} at high end.')


The estimated property value is 827000.0
At 68% confidence the valuation range is
USD 685000.0 at the lower end to USD 997000.0 at high end.




In [258]:
def get_dollar_estimate(rm, ptratio, chas = False, large_range = True):
    
    """ Estimate the property value in Boston.
    
    Keywords arguemnts :
    
    rm -- number of rooms
    ptratio -- students per classroom in school in the area
    chas -- True if the property next to the river, False otherwise
    large_range -- True for 95% prediction interval, False for 68% prediction interval
    
    
    
    """
    
    # Setting Condition
    if rm < 1 or ptratio < 1:
        print('This is unrealistic. Try again.')
        return
    
    log_est, upper, lower, conf = get_log_estimate(rm,
                                                   student_per_classroom = ptratio,
                                                   next_to_river = chas,
                                                   high_confidence = large_range)

    # Convert to today's price
    dollar_estimate = np.e**log_est * 1000 * scale_factor
    dollar_hi = np.e**upper * 1000 * scale_factor
    dollar_low = np.e**lower * 1000 * scale_factor

    # Round estimate 
    rounded_est = round(dollar_estimate, -3)
    rounded_hi = round(dollar_hi, -3)
    rounded_low = round(dollar_low, -3)

    print(f'The estimated property value is {rounded_est}')
    print(f'At {conf}% confidence the valuation range is')
    print(f'USD {rounded_low} at the lower end to USD {rounded_hi} at high end.')

In [262]:
get_dollar_estimate(rm = 6, ptratio= 12, chas = True)

The estimated property value is 783000.0
At 95% confidence the valuation range is
USD 538000.0 at the lower end to USD 1139000.0 at high end.




In [263]:
get_dollar_estimate(6, 12, True)

The estimated property value is 783000.0
At 95% confidence the valuation range is
USD 538000.0 at the lower end to USD 1139000.0 at high end.


