## Step 01: Exploring The Data 
## 01. Melbourne Home Prices

In [None]:
## Step 01: Exploring Data

In [None]:
# import
import pandas as pd
# save filepath to variable for easier access
melbourne_file_path = './data/melbourne-housing-snapshot/melb_data.csv'
# read the CSV at that path and store it in a DataFrame named melbourne_home_data
melbourne_home_data = pd.read_csv(melbourne_file_path) 

In [None]:
# show summary statistics of the Melbourne data; as the last expression
# of the cell, the describe() result is rendered by the notebook
melbourne_home_data.describe()

In [None]:
# show the first 5 rows of the data (head() defaults to 5 rows)
melbourne_home_data.head()

## Step 02: Selecting Data for Modeling

In [None]:
# list the column names, to pick the prediction target and the features from
melbourne_home_data.columns

## Drop Missing Values

In [None]:
# Visualize null entries first
import seaborn as sns
# isnull() yields a boolean frame marking the missing cells; it is drawn as a
# heatmap without a colour bar (cbar=False)
sns.heatmap(melbourne_home_data.isnull(), cbar=False)

In [None]:
# axis=0 drops rows (not columns) that contain missing values; the result is
# bound back to melbourne_home_data, so the unfiltered frame is no longer available
melbourne_home_data = melbourne_home_data.dropna(axis=0)

In [None]:
# summary statistics again, now for the frame with incomplete rows removed
melbourne_home_data.describe()

## Step 03: Selecting The Prediction Target

In [None]:
# select the prediction target: the Price column, via attribute (dot) access
y = melbourne_home_data.Price

In [None]:
# display the target selected with melbourne_home_data.Price
y

In [None]:
# attribute access on a single column is expected to give a pandas Series
print(type(y))

## OR

In [None]:
# alternative selection: indexing with a list of column names (here a
# one-element list) instead of attribute access; this rebinds y
melbourne_target = ['Price']
y = melbourne_home_data[melbourne_target]

In [None]:
# display the target selected with the list indexer ['Price']
y

In [None]:
# a list indexer is expected to give a pandas DataFrame rather than a Series
print(type(y))

## Step 04: Choosing "Features"

In [None]:
# columns used as model inputs
# NOTE(review): 'Lattitude' / 'Longtitude' presumably match the spelling in the
# CSV header - verify against melbourne_home_data.columns before "correcting" them
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

In [None]:
# feature matrix: only the columns listed in melbourne_features
X = melbourne_home_data[melbourne_features]

In [None]:
# first rows of the feature matrix, as a quick sanity check of the selection
X.head()

In [None]:
# summary statistics restricted to the chosen feature columns
X.describe()

## Step 05: Building Your Model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Step 1: Define model
# A fixed random_state gives the same fitted tree on every run: good practice.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Step 2: Fit model - capture patterns from the provided data. This is the heart of modeling.
melbourne_model.fit(X, y)
# Step 3: Predict
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
# X.head() is part of the data the model was fitted on, so this is an
# in-sample prediction and the errors below are optimistic.
pred = melbourne_model.predict(X.head())
print(pred)
# Step 4: Evaluate - determine how accurate the model's predictions are.
# Price is a continuous target, so regression metrics are reported.
# Classification metrics (confusion matrix, accuracy, precision, recall,
# specificity, F1, PR / ROC curves) are defined for discrete class labels;
# on prices they either raise or merely count exact matches, so they are
# not computed here.
# The classification names stay imported so that a later cell that expects
# them in the namespace keeps working.
from sklearn.metrics import mean_absolute_error,accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import mean_squared_error, r2_score

# MAE - takes abs(errors) ie |real-predicted| and takes the average of all
mae = mean_absolute_error(y.head(), pred)
print('MAE: %f' % mae)
# MSE - average of the squared errors; penalises large misses more than MAE
mse = mean_squared_error(y.head(), pred)
print('MSE: %f' % mse)
# RMSE - square root of MSE, expressed in the same unit as the target
rmse = mse ** 0.5
print('RMSE: %f' % rmse)
# R^2 - share of the target's variance explained by the predictions (1.0 is a perfect fit)
r2 = r2_score(y.head(), pred)
print('R^2: %f' % r2)

In [None]:
## Common practice is to split the data after assigning features and target
from sklearn import model_selection
from sklearn.metrics import accuracy_score, mean_absolute_error

# Step 0: Split the data (not the model) into train and test sets.
# test_size=0.20 holds back 20% of the rows; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.20, random_state=0)
# Step 1: Define model
# random_state is pinned so the reported MAE is reproducible between runs;
# an unseeded tree can differ from run to run.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Step 2: Fit model on the training rows only
melbourne_model.fit(X_train, y_train)
# Step 3: Predict for rows the model has not seen (out-of-sample)
pred = melbourne_model.predict(X_test)
# Step 4: Evaluate against the held-back targets
mae = mean_absolute_error(y_test, pred)
print('MAE: %f' % mae)


## Overfitting and Underfitting
<img src="https://i.imgur.com/2q85n9s.png" width=500 height=500 />

### Overfitting: Good performance on the training data, poor generalization to other data.
### Underfitting: Poor performance on the training data and poor generalization to other data

Overfitting refers to a model that models the training data too well.

Overfitting happens when a model learns the detail and noise in the training data to the extent that it negatively impacts the performance of the model on new data. This means that the noise or random fluctuations in the training data are picked up and learned as concepts by the model. The problem is that these concepts do not apply to new data and negatively impact the model's ability to generalize.

Underfitting refers to a model that can neither model the training data nor generalize to new data.

An underfit machine learning model is not a suitable model and will be obvious as it will have poor performance on the training data.

Ideally, you want to select a model at the sweet spot between underfitting and overfitting, but this is very difficult to do in practice.

There are two important techniques that you can use when evaluating machine learning algorithms to limit overfitting:

Use a resampling technique to estimate model accuracy: The most popular resampling technique is k-fold cross validation. It allows you to train and test your model k-times on different subsets of training data and build up an estimate of the performance of a machine learning model on unseen data.

Hold back a validation dataset: A validation dataset is simply a subset of your training data that you hold back from your machine learning algorithms until the very end of your project. After you have selected and tuned your machine learning algorithms on your training dataset you can evaluate the learned models on the validation dataset to get a final objective idea of how the models might perform on unseen data. Using cross validation is a gold standard in applied machine learning for estimating model accuracy on unseen data. If you have the data, using a validation dataset is also an excellent practice.

# 02. Iowa Home data

In [None]:
# import
import pandas as pd
# save filepath to variable for easier access
iowa_file_path = './data/iowa_home_data/train.csv'
# read the CSV at that path and store it in a DataFrame named iowa_home_data
iowa_home_data = pd.read_csv(iowa_file_path) 
# show summary statistics of the Iowa home data
iowa_home_data.describe()

In [None]:
# list the column names available in the Iowa data
iowa_home_data.columns

In [None]:
import seaborn as sns
# Check for null entries in the dataset with a heatmap of the isnull() mask
# (drawn without a colour bar)
sns.heatmap(iowa_home_data.isnull(), cbar=False)
# Check whether the columns that are mostly null are relevant as features.
# There is no need to drop rows here: simply leave those columns out of the features.

In [None]:
# Copy the three year-valued columns of the home data into their own frame
df = iowa_home_data.loc[:, ['YrSold', 'YearRemodAdd', 'YearBuilt']]
# Draw one 50-bin histogram per column as stacked subplots (needs matplotlib);
# the trailing semicolon keeps the notebook from echoing the returned axes
df.plot(kind='hist', subplots=True, bins=50, figsize=(12, 10));

## how old is the newest home?

In [None]:
# summary statistics for the sale year and the construction year only
iowa_home_data[['YrSold','YearBuilt']].describe()

In [None]:
from datetime import date
# Age of the most recently built home, counted from today's calendar year
# (not from the year the home was sold).
current_year = date.today().year
newest_home_age = current_year - iowa_home_data['YearBuilt'].max()
newest_home_age

## What is the average lot size (rounded to nearest integer)?

In [None]:
# summary statistics for the LotArea column (double brackets keep it a DataFrame)
iowa_home_data[['LotArea']].describe()

In [None]:
# Mean of the LotArea column, rounded to the nearest integer.
# round() without ndigits returns an int; round(x, 0) would leave a float
# (e.g. 123.0), which is not the integer the question asks for.
# Note: round() raises ValueError if the mean is NaN (e.g. an empty column).
avg_lot_size = round(iowa_home_data['LotArea'].mean())
avg_lot_size

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split as split
from sklearn.tree import DecisionTreeRegressor

# Prediction target and the feature columns used as model inputs
y = iowa_home_data.SalePrice
feature_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = iowa_home_data[feature_columns]

# Hold back part of the rows for evaluation; test_size is not given, so
# scikit-learn's default share applies. random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = split(X, y, random_state=0)

# Model 1: decision tree. random_state is pinned so the printed MAE is
# reproducible between runs; an unseeded tree can differ from run to run.
iowa_home_model = DecisionTreeRegressor(random_state=1)
iowa_home_model.fit(X_train, y_train)
y_pred = iowa_home_model.predict(X_test)
print('DecisionTreeRegressor MAE:', mean_absolute_error(y_test, y_pred))

# Model 2: linear regression on the same split, for comparison.
# iowa_home_model and y_pred are rebound here, so after this cell they refer
# to the linear model and its predictions.
iowa_home_model = LinearRegression()
iowa_home_model.fit(X_train, y_train)
y_pred = iowa_home_model.predict(X_test)
print('LinearRegression MAE:', mean_absolute_error(y_test, y_pred))