In [1]:
import os
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import  OneHotEncoder
from sklearn.compose import  ColumnTransformer

In [2]:
data = pd.read_csv('..\\Datasets\\home-data-for-ml-course\\train.csv')
test = pd.read_csv('..\\Datasets\\home-data-for-ml-course\\test.csv')

In [3]:
y = data.SalePrice
X = data.drop('SalePrice', axis=1)

In [4]:
# split the training into training and validation set
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [5]:
# categorical columns to work with
cat_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == 'object' and X_train_full[col].nunique() < 6]

# Numerical columns to work with
num_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]

# join the columns together
cols = cat_cols + num_cols

In [6]:
X_train = X_train_full[cols]
X_valid = X_valid_full[cols]

In [7]:
# categorical transformer
cat_trans = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
# quantitative transformer
num_trans = SimpleImputer(strategy='median')

In [8]:
# join the categorical and numerical transformer together
preprocess = ColumnTransformer(transformers=[('num', num_trans, num_cols), ('cat', cat_trans, cat_cols)])

In [9]:
from sklearn.tree import  DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

In [10]:
forest_model = RandomForestRegressor(random_state=0)
tree_model = DecisionTreeRegressor(random_state=0)

In [11]:
pipe = Pipeline(steps=[('preprocessor', preprocess)])

In [12]:
forest_pipeline = Pipeline(steps=[('preprocessor', preprocess), ('model', forest_model)])

In [13]:
tree_pipeline = Pipeline(steps=[('preprocessor', preprocess), ('model', tree_model)])

In [14]:
forest_pipeline.fit(X_train, y_train);
tree_pipeline.fit(X_train, y_train);

In [15]:
forest_preds = forest_pipeline.predict(X_valid)
tree_preds = tree_pipeline.predict(X_valid)

In [16]:
X_trn = pipe.fit_transform(X_train)
X_val = pipe.transform(X_valid)

In [17]:
forest_param_grid = [
{'n_estimators': [100, 150, 200, 350, 500], 'max_features': [8, 10, 20, 30, 40]}]

forest_grid_search = GridSearchCV(forest_model, forest_param_grid, cv=5,
scoring='neg_mean_absolute_error', return_train_score=True)

forest_grid_search.fit(X_trn, y_train);

In [18]:
forest_grid_search.best_params_

{'max_features': 40, 'n_estimators': 500}

## Putting higher num of n_estimator can result in Overfitting. but let's just go with that

In [19]:
# use the best parameters it suggest
forest_reg_model = RandomForestRegressor(max_features= 40, n_estimators = 500, random_state=0)

In [20]:
forest_reg_model.fit(X_trn, y_train)

RandomForestRegressor(max_features=40, n_estimators=500, random_state=0)

In [21]:
forest_reg_preds = forest_reg_model.predict(X_val)

In [22]:
tree_param_grid = [
{'min_samples_split': [2, 4, 6], 'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'], 'splitter': ['best', 'random'], 
'max_leaf_nodes': [5, 10, 15, 20, 25, 30, 50, 100, 300, 500]}
]

tree_grid_search = GridSearchCV(tree_model, tree_param_grid, cv=5,
scoring='neg_mean_absolute_error', return_train_score=True)

tree_grid_search.fit(X_trn, y_train);

In [23]:
tree_grid_search.best_params_

{'criterion': 'mae',
 'max_leaf_nodes': 30,
 'min_samples_split': 2,
 'splitter': 'best'}

In [24]:
# use the parameters it suggest
tree_reg_model = DecisionTreeRegressor(criterion='mse', max_leaf_nodes=30, min_samples_split=2, splitter='best', random_state=0)

In [25]:
tree_reg_model.fit(X_trn, y_train)

DecisionTreeRegressor(max_leaf_nodes=30, random_state=0)

In [26]:
tree_reg_preds = tree_reg_model.predict(X_val)

## Final result below. Before tunning and After tunning

In [27]:
print('MAE for forest before tunning (Default params):', mean_absolute_error(y_valid, forest_preds))
print('MAE for forest after tunning:', mean_absolute_error(y_valid, forest_reg_preds))
print()
print('MAE for tree before tunning (Default params):', mean_absolute_error(y_valid, tree_preds))
print('MAE for tree after tunning:', mean_absolute_error(y_valid, tree_reg_preds))

MAE for forest before tunning (Default params): 17882.278493150687
MAE for forest after tunning: 17321.373198630135

MAE for tree before tunning (Default params): 26028.130136986303
MAE for tree after tunning: 24157.303452818458


## We improved a bit after tunning the Random forest