In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Gather data: load the Boston housing set and build the model inputs.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably needs an older scikit-learn -- confirm the pinned version.
boston_dataset = load_boston()

# Full feature table, one column per dataset feature name.
data = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

# Model inputs: every column except INDUS and AGE.
features = data.drop(columns=['INDUS', 'AGE'])

# The model is trained on the natural log of the prices. np.log returns a plain
# ndarray, so wrap it in a one-column DataFrame named PRICE.
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame({'PRICE': log_prices})
target.shape

(506, 1)

In [6]:
# Column positions, inside `features` (and therefore inside `property_stats`),
# of the values that may be overridden for an individual property. They are
# looked up by column name so they stay correct if the list of dropped columns
# ever changes.
CRIM_IDX = features.columns.get_loc('CRIM')
ZN_IDX = features.columns.get_loc('ZN')
CHAS_IDX = features.columns.get_loc('CHAS')
RM_IDX = features.columns.get_loc('RM')
PTRATIO_IDX = features.columns.get_loc('PTRATIO')

# Template property: a single row with one value per feature column, where each
# value is the mean of that column over all properties in the dataset.
# reshape(1, -1) keeps the result 2-D (one row) whatever the number of columns.
property_stats = features.mean().values.reshape(1, -1)

In [7]:
features.mean() # series

CRIM         3.613524
ZN          11.363636
CHAS         0.069170
NOX          0.554695
RM           6.284634
DIS          3.795043
RAD          9.549407
TAX        408.237154
PTRATIO     18.455534
B          356.674032
LSTAT       12.653063
dtype: float64

In [8]:
type(features.mean().values)

numpy.ndarray

In [9]:
features.mean().values.reshape(1,11)

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [10]:
# Linear regression model with default settings; it is fitted in the next cell.
regr = LinearRegression()

In [11]:
# Fit the model: the retained feature columns are the inputs, the log prices the target.
regr.fit(features, target)

In [12]:
# In-sample predictions: log-price estimates for the same rows the model was fitted on.
fitted_vals=regr.predict(features)

Calculate MSE and RMSE using scikit-learn

from sklearn.metrics import mean_squared_error

print('Mean Square error is:',mean_squared_error(target,fitted_vals))
print('Root Mean Square error is:',np.sqrt(mean_squared_error(target,fitted_vals)))

Or, equivalently, use the code below:

In [13]:
# Mean squared error between the actual log prices and the in-sample predictions.
MSE = mean_squared_error(y_true=target, y_pred=fitted_vals)
# Root mean squared error, in the same (log-price) units as the target.
RMSE = np.sqrt(MSE)

In [None]:
# Create a function that estimates the log house price for a specific property

In [14]:
# review function code
def get_log_estimate():
    """Placeholder estimator: takes no arguments and always returns 21."""
    return 21

In [15]:
get_log_estimate()

21

In [16]:
# review function code with arguments
# if our function has parameters, we need to provide matching arguments in the call as well
# Here we make two of these arguments optional by giving them default values
def get_log_estimate(nr_rooms,
                     students_per_classroom,
                     next_to_river=False,
                     high_confidence=True):
    """Placeholder estimator demonstrating required and optional arguments.

    nr_rooms and students_per_classroom must be supplied by the caller;
    next_to_river and high_confidence fall back to their defaults when omitted.
    None of the arguments is used yet: the function always returns 21.
    """
    return 21

In [17]:
get_log_estimate(5,19)

21

In [None]:
# What we want is a function that returns the price estimate for a particular property.
# To get it, we use our regression object, regr, and call its predict method.
# As the argument to predict, we need to supply a single row of features.

# That single row will be our property_stats, which currently holds the average values
# of all the features in our dataset.

In [37]:
def get_log_estimate(nr_rooms,
                    students_per_classroom,
                    next_to_river= False,
                    high_confidence = True):
    """Estimate the log price of a property with the fitted regression `regr`.

    The property starts out as a copy of the `property_stats` template row;
    the RM, PTRATIO and CHAS entries are then overridden from the arguments.

    Keyword arguments:
    nr_rooms -- value written to the RM entry of the feature row
    students_per_classroom -- value written to the PTRATIO entry
    next_to_river -- True sets the CHAS entry to 1, False sets it to 0
    high_confidence -- True for a range of +/- 2*RMSE (reported as 95),
                       False for a range of +/- 1*RMSE (reported as 68)

    Returns a tuple (log_estimate, upper_bound, lower_bound, interval).
    """
    # Configure the property on a copy, so the shared template row is left
    # untouched and keeps holding the dataset-wide averages between calls.
    property_row = property_stats.copy()
    property_row[0][RM_IDX] = nr_rooms
    property_row[0][PTRATIO_IDX] = students_per_classroom
    property_row[0][CHAS_IDX] = 1 if next_to_river else 0

    # Make prediction: predict returns a 2-D result, take its single value.
    log_estimate = regr.predict(property_row)[0][0]

    # Calculate range: two RMSEs either side for the wide range, one otherwise.
    if high_confidence:
        spread, interval = 2*RMSE, 95
    else:
        spread, interval = 1*RMSE, 68

    upper_bound = log_estimate + spread
    lower_bound = log_estimate - spread

    return log_estimate, upper_bound, lower_bound, interval

In [38]:
get_log_estimate(3,20,next_to_river= True) #now the high_confidence  is true -> 95

(2.7767581914803996, 3.15178246187466, 2.401733921086139, 95)

In [39]:
np.median(boston_dataset.target)

21.2

In [53]:
# Challenge: take the log price estimate (1970s prices) together with its
# lower and upper bounds, convert them to today's prices, and round the values.

# Median price taken from the Zillow website; dividing it by the dataset's
# median target gives the factor used to scale dataset prices to today's prices.
ZILLOW_MEDIAN_PRICE = 583.3
SCALE_FACTOR = ZILLOW_MEDIAN_PRICE / np.median(boston_dataset.target)

# Call the function, asking for the narrower (68) range
log_est, upper, lower, conf = get_log_estimate(
    9,
    students_per_classroom=15,
    next_to_river=False,
    high_confidence=False,
)

# Convert to today's dollars --> reverse the log transformation, then scale
dollar_est, dollar_hi, dollar_low = (
    np.e**log_value * 1000 * SCALE_FACTOR
    for log_value in (log_est, upper, lower)
)

# Round the dollar values to the nearest whole dollar
# (np.around with no `decimals` argument rounds to 0 decimal places)
rounded_est, rounded_hi, rounded_low = (
    np.around(amount) for amount in (dollar_est, dollar_hi, dollar_low)
)

print(f'The estimated property value is: {rounded_est}.')
print(f'At {conf}% the confidence the valuation range is')
print(f'USD:{rounded_low} at the lower end to {rounded_hi} at the high end.')

The estimated property value is: 826729.0.
At 68% the confidence the valuation range is
USD:685374.0 at the lower end to 997237.0 at the high end.


In [69]:
def get_dollar_estimate(rm,ptratio, chas=False, large_range=True):
    """Print an estimate of the price of a property in Boston.

    Keyword arguments:
    rm -- number of rooms in the property
    ptratio -- number of students per teacher in the classroom for the school in the area
    chas -- True if the property is next to the river, False otherwise
    large_range -- True for a 95% prediction interval, False for a 68% interval

    Prints the rounded estimate and its valuation range and returns None.
    If rm or ptratio is below 1, a warning is printed instead and nothing
    is estimated.
    """
    # Guard clause: reject room counts or pupil-teacher ratios below 1.
    if rm < 1 or ptratio < 1:
        print('That is unrealistic. Try again')
        return

    def to_rounded_dollars(log_value):
        # Reverse the log transformation, multiply by 1000 and by SCALE_FACTOR,
        # then round to the nearest thousand (decimals=-3).
        return np.around(np.e**log_value * 1000 * SCALE_FACTOR, -3)

    log_est, log_hi, log_low, conf = get_log_estimate(
        rm, ptratio, next_to_river=chas, high_confidence=large_range)

    rounded_est = to_rounded_dollars(log_est)
    rounded_hi = to_rounded_dollars(log_hi)
    rounded_low = to_rounded_dollars(log_low)

    print(f'The estimated property value is: {rounded_est}.')
    print(f'At {conf}% the confidence the valuation range is')
    print(f'USD:{rounded_low} at the lower end to {rounded_hi} at the high end.')

In [67]:
get_dollar_estimate(rm=5, ptratio =-60 , chas=True)


The estimated property value is: 1079000.0.
At 95% the confidence the valuation range is
USD:742000.0 at the lower end to 1570000.0 at the high end.


In [None]:
# How to package what is in this Jupyter notebook as a Python module: save the code as a module (boston_valuation), then import it

In [71]:
import boston_valuation as val
val.get_dollar_estimate(6,12,True)

The estimated property value is: 783000.0.
At 95% the confidence the valuation range is
USD:538000.0 at the lower end to 1139000.0 at the high end.
