In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error # Model validation

iowa_file_path = '~/learningml/Kaggle/train.csv'
data = pd.read_csv(iowa_file_path)

y = data.SalePrice
feature_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = data[feature_columns]

iowa_model = DecisionTreeRegressor()
iowa_model.fit(X, y)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)
model = DecisionTreeRegressor()
model.fit(Xtrain, ytrain)
model_predictions = model.predict(Xtest)

print(mean_absolute_error(ytest, model_predictions))

33288.2712329


In [3]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return mae

In [4]:
for max_leaf_nodes in [5, 25, 50, 100, 250, 500]:
    my_mae = get_mae(max_leaf_nodes, Xtrain, Xtest, ytrain, ytest)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
    
best_tree_size = 100

final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size)
final_model.fit(Xtrain, ytrain)
final_preds_vals = final_model.predict(Xtest)
print(mean_absolute_error(ytest, final_preds_vals))

Max leaf nodes: 5  		 Mean Absolute Error:  35190
Max leaf nodes: 25  		 Mean Absolute Error:  28501
Max leaf nodes: 50  		 Mean Absolute Error:  27825
Max leaf nodes: 100  		 Mean Absolute Error:  28653
Max leaf nodes: 250  		 Mean Absolute Error:  31738
Max leaf nodes: 500  		 Mean Absolute Error:  32662
29172.7880104


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(Xtrain, ytrain)
forest_preds = forest_model.predict(Xtest)
print(mean_absolute_error(ytest, forest_preds))


24346.6200652


In [6]:
missing_val_count_by_column = data.isnull().sum()
#print(missing_val_count_by_column)
data_without_missing_vals = data.dropna(axis=1)
cols_with_missing = [col for col in data.columns if data[col].isnull().any()]
reduced_original_data = data.drop(cols_with_missing, axis=1)

In [7]:
#from sklearn.impute import SimpleImputer
#my_imputer = SimpleImputer()
#data_with_imputed_vals = my_imputer.fit_transform(data)

In [9]:
print(data.dtypes.sample(10))

MiscFeature     object
GrLivArea        int64
Street          object
GarageArea       int64
TotRmsAbvGrd     int64
BsmtFinType1    object
OverallCond      int64
GarageFinish    object
BsmtQual        object
BsmtFinType2    object
dtype: object


In [10]:
one_hot_encoded_training_predictors = pd.get_dummies(data)