In [1]:
#imported libraries
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
"""
The model is built and evaluated on the same single "sample" of data. As the output shows, the values predicted by the model and the prediction target values (the prices in the data) are the same. The Mean Absolute Error (MAE) is relatively low, 62.35. However, this is an "in-sample" score, which is not a good indicator of performance on new data, because the model might have found a pattern in the old data that is not present in new data.
"""

In [2]:
# --- Load the housing data ---------------------------------------------
# Location of the CSV file and the DataFrame read from it.
file_path = '~/Desktop/Kaggle/practice/model_validation_home_train.csv'
file_data = pd.read_csv(file_path)

# --- Choose the features (X) and the prediction target (y) -------------
# Columns used as model inputs for predicting the house price.
featured_columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = file_data[featured_columns]
y = file_data['SalePrice']

# --- Create the model and fit it on every row --------------------------
# random_state is fixed so repeated runs build the same tree.
file_model = DecisionTreeRegressor(random_state=1).fit(X, y)

# --- Compare predictions with the real prices for the first five rows --
predicted_prices_list1 = file_model.predict(X.iloc[:5])
actual_prices_list1 = y.iloc[:5].tolist()
print("First in-sample predictions:", predicted_prices_list1)
print("Actual target values homes:", actual_prices_list1)

# --- In-sample MAE: scored on the same rows the model was fitted on ----
predicted_prices1 = file_model.predict(X)
mae1 = mean_absolute_error(y, predicted_prices1)
print('MAE:', mae1)

First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values homes: [208500, 181500, 223500, 140000, 250000]
MAE: 62.35433789954339


In [None]:
"""
The data is split into two data sets: one for training the model and the other for validation. The prices predicted by the model fitted on the training data set are somewhat different from the actual prices in the validation data set. The MAE is 29652.93, which is about 1/6 of the mean of the validation values (176725.51). Thus, this model needs improvement, or this problem needs a different kind of model (not DecisionTreeRegressor).
"""

In [3]:
# Split the data into a training part and a validation part so the model is
# scored on rows it did not see while fitting. No test_size is passed, so
# scikit-learn's default split is used; random_state makes it reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify a separate model for this experiment. Using fit_model (instead of
# refitting file_model) leaves the model fitted on the full data untouched.
fit_model = DecisionTreeRegressor(random_state=1)

# Fit the model with the training data set only.
fit_model.fit(train_X, train_y)

# Predict the validation prices once and reuse the result below.
valid_predict = fit_model.predict(val_X)

# Predicted prices for the first five validation rows.
predicted_prices_list2 = valid_predict[:5]
print("Validation prediction:", predicted_prices_list2)
# Actual prices for the same five validation rows.
actual_prices_list2 = val_y.head().tolist()
print("Actual prices from validation data:", actual_prices_list2)

# MAE on the validation set (out-of-sample error).
# predicted_prices2 is kept as a name for the full validation predictions.
predicted_prices2 = valid_predict
mae2 = mean_absolute_error(val_y, predicted_prices2)
print('MAE:', mae2)

# Summary statistics of the validation target, to put the MAE in context.
print(val_y.describe())

Validation prediction: [186500. 184000. 130000.  92000. 164500.]
Actual prices from validation data: [231500, 179500, 122000, 84500, 142000]
MAE: 29652.931506849316
count       365.000000
mean     176725.512329
std       82046.760890
min       40000.000000
25%      125500.000000
50%      160000.000000
75%      205000.000000
max      745000.000000
Name: SalePrice, dtype: float64
