In [None]:
import pandas as pd

# Location of the Melbourne housing CSV, kept in a variable so it can be reused
melbourne_file_path = 'melb_data.csv'

# Load the CSV into a DataFrame named melbourne_data
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

# Display summary statistics for the loaded data
melbourne_data.describe()

In [None]:
# Display the column labels of melbourne_data to see which variables are available
melbourne_data.columns

In [None]:
# Some Melbourne houses have variables that were never recorded (missing values).
# Handling missing values properly is covered in a later tutorial; the Iowa data
# has no missing values in the columns used there.
# For now the simplest option is taken: discard every house (row) that has
# at least one missing value.

# "na" means "not available"; dropna() works on rows by default (same as axis=0)
melbourne_data = melbourne_data.dropna()

## Selecting Data for Modeling

In [None]:
# Prediction target: the Price column

y = melbourne_data['Price']  # bracket access returns the same Series as melbourne_data.Price

In [None]:
# Choosing "Features": the input columns the model will predict from.
# NOTE(review): 'Lattitude' and 'Longtitude' look misspelled but presumably mirror
# the CSV's own headers -- check melbourne_data.columns before "correcting" them.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

# Keep only the feature columns, then display their summary statistics
X = melbourne_data[melbourne_features]
X.describe()

## Building Your Model

- Define: 
    - What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.
- Fit: 
    - Capture patterns from provided data. This is the heart of modeling.
- Predict: 
    - Just what it sounds like: use the fitted model to make predictions.
- Evaluate: 
    - Determine how accurate the model's predictions are.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Define the model. Passing a number for random_state gives the same results on every run.

# The particular value (0, 1, or any other integer) does not matter.
# What matters is that it stays the same across runs if you want to compare results between them.
# (random_state=42 is a common choice in scikit-learn examples and elsewhere.)
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model to the features X and the target y
melbourne_model.fit(X, y)

In [None]:
# Show the first five houses, then the model's predicted price for each of them.
# sep="\n" puts each heading and its table/array on separate lines.
print("Making predictions for the following 5 houses:", X.head(), sep="\n")
print("The predictions are", melbourne_model.predict(X.head()), sep="\n")

## Model Validation

We summarize model quality with the Mean Absolute Error (also called MAE). For each house, the prediction error is:

```
error=actual−predicted
```

With the MAE metric, we take the absolute value of each error. This converts each error to a positive number. We then take the average of those absolute errors.

In [None]:
from sklearn.metrics import mean_absolute_error

# Predict a price for every row of X; predict() returns a NumPy array, not a Series
predicted_home_prices = melbourne_model.predict(X)

# Mean absolute difference between the actual prices (y) and those predictions
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

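## !!! "In-Sample" Scores

The MAE computed above is an "in-sample" score: the model was evaluated on the same houses it was trained on, so the score can look far better than the model's accuracy on new data. To get an honest estimate, hold some data out of training and measure the error on that data instead.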

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Split both the features and the target into training and validation sets.
# The split is drawn from a random number generator; passing a number as
# random_state guarantees the same split every time this cell runs.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define the model. random_state is fixed here as well: without it the fitted
# tree -- and therefore the validation MAE printed below -- can change from
# one run to the next.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model on the training data only
melbourne_model.fit(train_X, train_y)

# Predict prices for the held-out validation data and report the MAE
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

## Underfitting and Overfitting

A model's predictions can be inaccurate for two opposite reasons:

- **Overfitting:** capturing spurious patterns that won't recur in the future, leading to less accurate predictions, or
- **Underfitting:** failing to capture relevant patterns, again leading to less accurate predictions.

We use validation data, which isn't used in model training, to measure a candidate model's accuracy. This lets us try many candidate models and keep the best one.

In [None]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to max_leaf_nodes leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on train_X / train_y,
    used to predict val_X, and scored against val_y with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [None]:
# Report the validation MAE for trees of several sizes.
# %d prints only the integer part of the MAE.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

## Random Forests

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split (random_state fixed for repeatable results)
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X=train_X, y=train_y)

# Predict the validation split and print the resulting MAE
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(y_true=val_y, y_pred=melb_preds))


Some visualization...

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count houses per construction year. value_counts() orders by frequency, which
# would scramble the year axis, so the counts are re-sorted chronologically.
houses_per_year = melbourne_data['YearBuilt'].value_counts().sort_index()

# Draw the bars on an explicit Axes and label it so the figure stands on its own
ax = houses_per_year.plot(kind='bar')
ax.set_xlabel("Year Built", labelpad=14)
ax.set_ylabel("Count of Houses", labelpad=14)
ax.set_title("Count of Houses Built by Year", y=1.02);