## Basic Syntax

In [3]:
import pandas as pd
# save filepath to variable for easier access
housing_file_path = '/Users/ChesterHuynh/learningml/Kaggle/Tutorials/house-prices-advanced-regression-techniques/train.csv'
# read the data and store data in DataFrame titlted housing_data
housing_data = pd.read_csv(housing_file_path)
# print a summary of the data in housing_data
# shows the count, i.e. number non-empty values, and the statistics
print(housing_data.describe())

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726   
std       1.112799    30.202904     20.645407   181.066207   456.098091   
min       1.000

In [4]:
# can get all the column names by using the columns method of the housing_data object
print(housing_data.columns)

# we can access the data of any variable/column with DOT notation
print(housing_data.SalePrice)
SalePrice_data = housing_data.SalePrice
print(SalePrice_data.head()) # shows the first few entries of the column SalePrice

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# selecting multiple columns using BRACKET notation
# get the column names into a list of strings
col_names = ['LandContour', 'HouseStyle']
two_cols_data = housing_data[col_names]
print(two_cols_data.describe())

       LandContour HouseStyle
count         1460       1460
unique           4          8
top            Lvl     1Story
freq          1311        726


## Creating and Fitting a Model

In [6]:
from sklearn.tree import DecisionTreeRegressor

# by convention, output variable is y
y = housing_data.SalePrice
# by convention, predictors are X
SalePricePredictors = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = housing_data[SalePricePredictors]

# Decision trees require making decisions and then make a prediction aka "leaf"
# Define model
SalePriceModel = DecisionTreeRegressor()

# Fit model
SalePriceModel.fit(X, y)

print("Making predictions for the following 5 houses: ")
print(X.head())
print("Predicted prices are: ")
print(SalePriceModel.predict(X.head()))

Making predictions for the following 5 houses: 
   LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0     8450       2003       856       854         2             3   
1     9600       1976      1262         0         2             3   
2    11250       2001       920       866         2             3   
3     9550       1915       961       756         1             3   
4    14260       2000      1145      1053         2             4   

   TotRmsAbvGrd  
0             8  
1             6  
2             6  
3             7  
4             9  
Predicted prices are: 
[ 208500.  181500.  223500.  140000.  250000.]


## Model Validation

In [7]:
# Most relevant is predictive accuracy
# Mean absolute error - taking the average of the absolute value of the errors found by comparing the predicted
# values and the actual values
from sklearn.metrics import mean_absolute_error
predicted_sale_prices = SalePriceModel.predict(X)
mean_absolute_error(y, predicted_sale_prices)
# In-sample scoring/validation --> overfitting of a model, relationships may just occur by chance in the dataset
# but may not necessarily be observed in some new data that you would actually predict for

# The most straightforward way to do this would be to measure performance on data that wasn't built into the model,
# i.e. using validation data
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
SalePriceModel = DecisionTreeRegressor()
SalePriceModel.fit(train_X, train_y)

val_predictions = SalePriceModel.predict(val_X)
print(mean_absolute_error(val_predictions, val_y))

33022.2465753


## Different Types of Models, Overfitting, and Underfitting

In [8]:
# For a decision tree, a tree's depth is a measure of how many splits it makes before making a prediction
# Having a ton of splits such that there are only a few houses in each leaf --> overfitting, i.e. the model
# will be good with predicting/validating in-sample values, but will not be good for any other data
# Underfitting is when there is not enough splitting, so the leaves contain houses of varying prices, which can lead
# to wildly inaccurate predictions, even in training data.

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# obtain MAE scores from different values for max_leaf_nodes
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(predictors_train, targ_train)
    preds_val = model.predict(predictors_val)
    mae = mean_absolute_error(targ_val, preds_val)
    return(mae)

# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))

# clearly 50 gives the least MAE

Max leaf nodes: 5 		 Mean Absolute Error: 35190
Max leaf nodes: 50 		 Mean Absolute Error: 27825
Max leaf nodes: 500 		 Mean Absolute Error: 32662
Max leaf nodes: 5000 		 Mean Absolute Error: 33382


## Random Forests

In [12]:
# A deep tree with lots of leaves will overfit because each prediction is coming from historical data from only a few
# houses at its leaf. But a shallow tree with few leaves will perform poorly because it fails to capture as many 
# distinctions in the raw data.
# Random forests make many random trees and makes a prediction by averaging the predictions of each component tree.
# Usually works with better predictive accuracy than just a single decision tree and it works well with 
# default parameters
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
housing_preds = forest_model.predict(val_X)
print("MAE is: %f" % mean_absolute_error(housing_preds, val_y))

MAE is: 23478.420000


In [None]:
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)