 """Analyze the Boston housing data. Evaluate and validate the
    performanance of a Decision Tree regressor on the housing data.
    Fine tune the model to make prediction on unseen data."""

"""Load the Boston dataset and examine its target (label) distribution."""

In [1]:
# Load libraries
import numpy as np
import pylab as pl
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor

In [44]:
#ADD EXTRA LIBRARIES HERE 
from sklearn import cross_validation

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

from sklearn.grid_search import GridSearchCV

In [18]:
boston = datasets.load_boston()

In [None]:
city_data = load_data()

In [25]:
print city_data.keys()

['data', 'feature_names', 'DESCR', 'target']


In [26]:
print city_data.DESCR    

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      


def explore_city_data(city_data):
Calculate and return the appropriate error performance metric.

In [37]:
# Get the labels and features from the housing data
housing_prices = city_data.target
housing_features = city_data.data

###################################
### Step 1. YOUR CODE GOES HERE ###
###################################

# Please calculate the following values using the Numpy library
print 'Size of data (number of houses)?',  len(city_data.data)
print 'Number of features?', len(city_data.feature_names)
print 'Minimum price?', np.min(housing_prices)
print 'Maximum price?', np.max(housing_prices)
print 'Calculate mean price?', np.mean(housing_prices)
print 'Calculate median price?', np.median(housing_prices)
print 'Calculate standard deviation?', np.std(housing_prices)

Size of data (number of houses)? 506
Number of features? 13
Minimum price? 5.0
Maximum price? 50.0
Calculate mean price? 22.5328063241
Calculate median price? 21.2
Calculate standard deviation? 9.18801154528



def split_data(city_data):
Randomly shuffle the sample set. Divide it into 70 percent training and 30 percent testing data.

Training/Test dataset split
    X_train, y_train, X_test, y_test = split_data(city_data)

In [48]:
# Get the features and labels from the Boston housing data
X, y = city_data.data, city_data.target

###################################
### Step 3. YOUR CODE GOES HERE ###
#http://scikit-learn.org/stable/modules/cross_validation.html
    
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=0)
###################################

#return X_train, y_train, X_test, y_test


def performance_metric(label, prediction):
Calculate and return the appropriate error performance metric.

In [49]:
# http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
###################################
### Step 2. YOUR CODE GOES HERE ###
def performance_metric(label, prediction):
    evs = explained_variance_score(label, prediction)
    mea = mean_absolute_error(label, prediction)
    mse = mean_squared_error(label, prediction)
    mae = median_absolute_error(label, prediction)
    r2  = r2_score(label, prediction)


    return evs, mea, mse, mae, r2
###################################


def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance of the model after a set of training data."""

In [50]:
# We will vary the training set size so that we have 50 different sizes
sizes = np.linspace(1, len(X_train), 50)
train_err = np.zeros(len(sizes))
test_err = np.zeros(len(sizes))

print "Decision Tree with Max Depth: "
print depth

for i, s in enumerate(sizes):

    # Create and fit the decision tree regressor model
    regressor = DecisionTreeRegressor(max_depth=depth)
    regressor.fit(X_train[:s], y_train[:s])

    # Find the performance on the training and testing set
    train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
    test_err[i] = performance_metric(y_test, regressor.predict(X_test))


# Plot learning curve graph
learning_curve_graph(sizes, train_err, test_err)

Decision Tree with Max Depth: 


NameError: name 'depth' is not defined

In [None]:

def learning_curve_graph(sizes, train_err, test_err):
    """Plot training and test error as a function of the training size."""
    
     

In [8]:
#Learning Curve Graphs
    max_depths = [1,2,3,4,5,6,7,8,9,10]
    for max_depth in max_depths:
        learning_curve(max_depth, X_train, y_train, X_test, y_test)

        pl.figure()
        pl.title('Decision Trees: Performance vs Training Size')
        pl.plot(sizes, test_err, lw=2, label = 'test error')
        pl.plot(sizes, train_err, lw=2, label = 'training error')
        pl.legend()
        pl.xlabel('Training Size')
        pl.ylabel('Error')
        pl.show()

In [None]:
def model_complexity(X_train, y_train, X_test, y_test):
    """Calculate the performance of the model as model complexity increases."""
# Model Complexity Graph
    model_complexity(X_train, y_train, X_test, y_test)

In [9]:


    
print "Model Complexity: "

# We will vary the depth of decision trees from 2 to 25
max_depth = np.arange(1, 25)
train_err = np.zeros(len(max_depth))
test_err = np.zeros(len(max_depth))

for i, d in enumerate(max_depth):
    # Setup a Decision Tree Regressor so that it learns a tree with depth d
    regressor = DecisionTreeRegressor(max_depth=d)

    # Fit the learner to the training data
    regressor.fit(X_train, y_train)

    # Find the performance on the training set
    train_err[i] = performance_metric(y_train, regressor.predict(X_train))

    # Find the performance on the testing set
    test_err[i] = performance_metric(y_test, regressor.predict(X_test))

# Plot the model complexity graph
model_complexity_graph(max_depth, train_err, test_err)



def model_complexity_graph(max_depth, train_err, test_err):
    """Plot training and test error as a function of the depth of the decision tree learn."""

In [10]:
pl.figure()
pl.title('Decision Trees: Performance vs Max Depth')
pl.plot(max_depth, test_err, lw=2, label = 'test error')
pl.plot(max_depth, train_err, lw=2, label = 'training error')
pl.legend()
pl.xlabel('Max Depth')
pl.ylabel('Error')
pl.show()


def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data.""" 
Tune and predict Model
    fit_predict_model(city_data) 

In [11]:
# Get the features and labels from the Boston housing data
X, y = city_data.data, city_data.target

# Setup a Decision Tree Regressor
regressor = DecisionTreeRegressor()

parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

###################################
### Step 4. YOUR CODE GOES HERE ###
###################################

# 1. Find the best performance metric
# should be the same as your performance_metric procedure
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html

# 2. Use gridearch to fine tune the Decision Tree Regressor and find the best model
# http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV

# Fit the learner to the training data
print "Final Model: "
print reg.fit(X, y)

# Use the model to predict the output of a particular sample
x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
y = reg.predict(x)
print "House: " + str(x)
print "Prediction: " + str(y)

