<a href="https://colab.research.google.com/github/DavidBillayio/PythonMLtips/blob/master/RandomForestRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For this example we seek to predict the sale prices in the test data set. You will notice that the training data has the sale prices listed for a number of homes and the test data is missing the sale prices. 

Your job is to use the following code to predict the sale prices of the test data homes.

In [133]:
# A simple random forest regressor that is optimized for some parameters.

# import the modules

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
print("modules imported")

modules imported


In [134]:
# Read the data

# Two CSV files are expected in the working directory: the training data
# (which includes the sale prices) and the test data (which does not).
full_train_data = pd.read_csv('full_train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Let's first look at the first few rows of the training data
train_preview = full_train_data.head()
print(train_preview)

   area  bedrooms  bathrooms lot facing  saleprice
0  3836         2          2          N     398532
1   248         4          1          S     289967
2   496         2          1          E     236893
3  2295         3          1          W     309548
4   670         3          2          N     157009


In [135]:
# Notice that the 'lot facing' column holds text (N, S, E, W).
# A one-hot encoder turns it into one 0/1 indicator column per direction,
# which the model can interpret more effectively.

# First, and most importantly, we make a copy to avoid changing the original data
X_train = full_train_data.copy()

# import the one-hot encoder
from sklearn.preprocessing import OneHotEncoder

cols = ['lot facing']

# OneHotEncoder returns a SciPy sparse matrix by default. The keyword that used
# to switch this off (`sparse=False`) was renamed to `sparse_output` in newer
# scikit-learn releases and the old spelling is no longer accepted there, so we
# leave the default and densify with .toarray(), which works on every version.
OH_encoder = OneHotEncoder()
OH_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[cols]).toarray(),
    index=X_train.index,  # keep rows aligned with X_train for the later concat
)
# notice the new data columns (numbered 0..3 for now)
print(OH_train)

       0    1    2    3
0    0.0  1.0  0.0  0.0
1    0.0  0.0  1.0  0.0
2    1.0  0.0  0.0  0.0
3    0.0  0.0  0.0  1.0
4    0.0  1.0  0.0  0.0
..   ...  ...  ...  ...
317  0.0  1.0  0.0  0.0
318  0.0  0.0  1.0  0.0
319  1.0  0.0  0.0  0.0
320  0.0  0.0  0.0  1.0
321  0.0  1.0  0.0  0.0

[322 rows x 4 columns]


In [136]:
# But how do we know which encoded column is which direction?
# The fitted encoder keeps the categories it found in `categories_`
# (a list holding one array of labels for the single encoded column).
OH_encoder.categories_

[array(['E', 'N', 'S', 'W'], dtype=object)]

In [137]:
# So we create the new column headings for the one-hot encoded columns.
# Take the labels straight from the fitted encoder instead of typing them by
# hand: they are then guaranteed to be in the same order as the encoded columns,
# and they stay correct if the data ever contains a different set of directions.
OH_train.columns = list(OH_encoder.categories_[0])


# just to double check:
print(OH_train)
print(X_train)
# the count of each direction in the original column ...
print(X_train['lot facing'].value_counts())
# ... should equal the total of the matching one-hot column
OH_train.sum()

       E    N    S    W
0    0.0  1.0  0.0  0.0
1    0.0  0.0  1.0  0.0
2    1.0  0.0  0.0  0.0
3    0.0  0.0  0.0  1.0
4    0.0  1.0  0.0  0.0
..   ...  ...  ...  ...
317  0.0  1.0  0.0  0.0
318  0.0  0.0  1.0  0.0
319  1.0  0.0  0.0  0.0
320  0.0  0.0  0.0  1.0
321  0.0  1.0  0.0  0.0

[322 rows x 4 columns]
     area  bedrooms  bathrooms lot facing  saleprice
0    3836         2          2          N     398532
1     248         4          1          S     289967
2     496         2          1          E     236893
3    2295         3          1          W     309548
4     670         3          2          N     157009
..    ...       ...        ...        ...        ...
317   459         2          0          N      67727
318  2890         3          1          S     184754
319  1592         0          0          E      30912
320  2852         4          3          W     340008
321  1546         1          1          N     209936

[322 rows x 5 columns]
N    82
S    80
W    80
E   

E    80.0
N    82.0
S    80.0
W    80.0
dtype: float64

In [141]:
# At a high level it checks out.
# Now attach the one-hot columns to the right of the original columns
# (the original 'lot facing' text column is kept as well).

OH_X_train = pd.concat((X_train, OH_train), axis='columns')
print(OH_X_train.head())

   area  bedrooms  bathrooms lot facing  saleprice    E    N    S    W
0  3836         2          2          N     398532  0.0  1.0  0.0  0.0
1   248         4          1          S     289967  0.0  0.0  1.0  0.0
2   496         2          1          E     236893  1.0  0.0  0.0  0.0
3  2295         3          1          W     309548  0.0  0.0  0.0  1.0
4   670         3          2          N     157009  0.0  1.0  0.0  0.0
   area  bedrooms  bathrooms lot facing  saleprice    E    N    S    W
0  3836         2          2          N     398532  0.0  1.0  0.0  0.0
1   248         4          1          S     289967  0.0  0.0  1.0  0.0
2   496         2          1          E     236893  1.0  0.0  0.0  0.0
3  2295         3          1          W     309548  0.0  0.0  0.0  1.0
4   670         3          2          N     157009  0.0  1.0  0.0  0.0


In [142]:
# That was a lot of work; let's use something easier for the same result.

# Start the second attempt from a fresh copy of the original data,
# so nothing done in the first attempt carries over.

X_train2 = full_train_data.copy(deep=True)

In [143]:

from sklearn.compose import ColumnTransformer

# Look up where the 'lot facing' column sits instead of hard-coding its position.
# The column is still selected by position, as before.
facing_pos = X_train2.columns.get_loc('lot facing')

transformer = ColumnTransformer(
    transformers=[
        ('lot facing',       # Just a name
         OneHotEncoder(),    # The transformer instance
         [facing_pos]        # The column(s) to be applied on.
         )
    ],
    remainder='passthrough',  # keep every other column unchanged
    sparse_threshold=0,       # always return a dense array, whatever the data looks like
)
encoded_train = transformer.fit_transform(X_train2)

# The output holds the one-hot columns first, followed by the passthrough
# columns in their original order. Build the headings from the fitted encoder
# and the input frame rather than typing them by hand.
facing_labels = list(transformer.named_transformers_['lot facing'].categories_[0])
passthrough_cols = list(X_train2.columns.delete(facing_pos))

OH_X_train2 = pd.DataFrame(encoded_train, columns=facing_labels + passthrough_cols)
print(OH_X_train2)

       E    N    S    W    area  bedrooms  bathrooms  saleprice
0    0.0  1.0  0.0  0.0  3836.0       2.0        2.0   398532.0
1    0.0  0.0  1.0  0.0   248.0       4.0        1.0   289967.0
2    1.0  0.0  0.0  0.0   496.0       2.0        1.0   236893.0
3    0.0  0.0  0.0  1.0  2295.0       3.0        1.0   309548.0
4    0.0  1.0  0.0  0.0   670.0       3.0        2.0   157009.0
..   ...  ...  ...  ...     ...       ...        ...        ...
317  0.0  1.0  0.0  0.0   459.0       2.0        0.0    67727.0
318  0.0  0.0  1.0  0.0  2890.0       3.0        1.0   184754.0
319  1.0  0.0  0.0  0.0  1592.0       0.0        0.0    30912.0
320  0.0  0.0  0.0  1.0  2852.0       4.0        3.0   340008.0
321  0.0  1.0  0.0  0.0  1546.0       1.0        1.0   209936.0

[322 rows x 8 columns]


In [None]:
#that was much easier. We continue.

In [160]:
# Next, we pick what we want to predict (the target) and what we predict it from (the features).

# Target: the sale price column
y = OH_X_train2['saleprice']

# Features: the three numeric columns plus the four one-hot direction columns
features = ['area', 'bedrooms', 'bathrooms', 'N', 'S', 'E', 'W']

# Pull the feature columns, in the order listed above, into their own dataframe
X = OH_X_train2.loc[:, features].copy()
print(X)

       area  bedrooms  bathrooms    N    S    E    W
0    3836.0       2.0        2.0  1.0  0.0  0.0  0.0
1     248.0       4.0        1.0  0.0  1.0  0.0  0.0
2     496.0       2.0        1.0  0.0  0.0  1.0  0.0
3    2295.0       3.0        1.0  0.0  0.0  0.0  1.0
4     670.0       3.0        2.0  1.0  0.0  0.0  0.0
..      ...       ...        ...  ...  ...  ...  ...
317   459.0       2.0        0.0  1.0  0.0  0.0  0.0
318  2890.0       3.0        1.0  0.0  1.0  0.0  0.0
319  1592.0       0.0        0.0  0.0  0.0  1.0  0.0
320  2852.0       4.0        3.0  0.0  0.0  0.0  1.0
321  1546.0       1.0        1.0  1.0  0.0  0.0  0.0

[322 rows x 7 columns]


In [162]:
# Split the labelled data into a training part (80%) and a validation part (20%).
# random_state is fixed so the same rows land in each part on every run.
# Note: this rebinds X_train, which an earlier cell used for the copy of the full training data.

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
print(X_train, X_valid, y_train, y_valid)

       area  bedrooms  bathrooms    N    S    E    W
173  2102.0       1.0        1.0  1.0  0.0  0.0  0.0
253   221.0       3.0        2.0  1.0  0.0  0.0  0.0
220  2743.0       1.0        0.0  0.0  0.0  0.0  1.0
196  3056.0       4.0        0.0  0.0  0.0  0.0  1.0
5    3384.0       2.0        3.0  0.0  1.0  0.0  0.0
..      ...       ...        ...  ...  ...  ...  ...
251  1606.0       3.0        2.0  0.0  0.0  1.0  0.0
192   853.0       2.0        1.0  0.0  0.0  0.0  1.0
117  2943.0       4.0        0.0  1.0  0.0  0.0  0.0
47   1724.0       4.0        1.0  0.0  0.0  1.0  0.0
172  1116.0       3.0        2.0  0.0  0.0  0.0  1.0

[257 rows x 7 columns]        area  bedrooms  bathrooms    N    S    E    W
293  3870.0       1.0        0.0  1.0  0.0  0.0  0.0
65   3599.0       1.0        1.0  1.0  0.0  0.0  0.0
15   1524.0       1.0        2.0  0.0  0.0  0.0  1.0
272   512.0       3.0        2.0  0.0  0.0  0.0  1.0
140   752.0       3.0        1.0  0.0  0.0  0.0  1.0
..      ...       ... 

In [168]:
# We will want to try several models using various parameters to see which model will work best

# Define several random forest regressors to compare.
# Every model uses random_state=0 so the comparison is repeatable.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# 'absolute_error' is the current name of the mean-absolute-error split
# criterion; the old spelling 'mae' is rejected by current scikit-learn releases.
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
# Many shallow trees: by far the slowest of the five to fit.
model_5 = RandomForestRegressor(n_estimators=10000, max_depth=4, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]
print('models loaded')

models loaded


In [157]:
# next we will define a function to score each model

def score_model(model, Xt, Xv, Yt, Yv):
  """Fit a model on the training split and measure its error on the validation split.

  model -- estimator exposing fit(X, y) and predict(X); it is fitted in place
  Xt, Yt -- training features and training targets
  Xv, Yv -- validation features and validation targets

  Returns the mean absolute error between Yv and the model's predictions for Xv.
  """
  model.fit(Xt, Yt)
  return mean_absolute_error(Yv, model.predict(Xv))

In [170]:
# Score every candidate on the same training/validation split; models are numbered from 1.
for model_number, candidate in enumerate(models, start=1):
  mae = score_model(candidate, X_train, X_valid, y_train, y_valid)
  print("Model {} MAE: {}".format(model_number, mae))

Model 1 MAE: 100597.60312820513
Model 2 MAE: 101180.13264102564
Model 3 MAE: 103688.85830769231
Model 4 MAE: 101713.16456757826
Model 5 MAE: 99861.66121613492


#Is the error good? It doesn't look like it, but we will take the model with the lowest error. What are the issues with this?

After all of that, what were we doing again? That's right: predicting the sale prices for the test data.

In [172]:
# What do we need to do first?

# That's right: encode the test data the same way as the training data.
# Reuse the encoder that was already fitted on the training data instead of
# fitting a new one on the test set. That way each direction lands in the
# column it had during training, and a direction never seen in training raises
# an error instead of silently shifting the columns.
facing_encoder = transformer.named_transformers_['lot facing']

# NOTE(review): assumes test_data names the column 'lot facing', exactly like the training data - confirm.
facing_test = facing_encoder.transform(test_data[['lot facing']])
if hasattr(facing_test, 'toarray'):
    # OneHotEncoder produces a sparse matrix by default; make it dense for pandas
    facing_test = facing_test.toarray()

OH_facing_test = pd.DataFrame(
    facing_test,
    columns=list(facing_encoder.categories_[0]),  # headings taken from the fitted encoder
    index=test_data.index,                        # keep rows aligned with test_data
)

# One-hot columns first, then the remaining test columns in their original order.
OH_test = pd.concat([OH_facing_test, test_data.drop(columns=['lot facing'])], axis=1)
print(OH_test)

      E    N    S    W    area  bedrooms  bathrooms
0   0.0  1.0  0.0  0.0  2811.0       2.0        0.0
1   0.0  0.0  1.0  0.0  1457.0       1.0        3.0
2   1.0  0.0  0.0  0.0  2152.0       3.0        0.0
3   0.0  0.0  0.0  1.0  1552.0       3.0        3.0
4   0.0  1.0  0.0  0.0  1940.0       2.0        4.0
5   0.0  0.0  1.0  0.0  1116.0       2.0        0.0
6   1.0  0.0  0.0  0.0   127.0       2.0        3.0
7   0.0  0.0  0.0  1.0  1615.0       0.0        0.0
8   0.0  1.0  0.0  0.0   337.0       2.0        0.0
9   0.0  0.0  1.0  0.0  1006.0       4.0        4.0
10  1.0  0.0  0.0  0.0  3106.0       2.0        2.0
11  0.0  0.0  0.0  1.0  1402.0       2.0        1.0
12  0.0  1.0  0.0  0.0  1263.0       4.0        2.0
13  0.0  0.0  1.0  0.0  3204.0       4.0        3.0
14  1.0  0.0  0.0  0.0  3949.0       2.0        2.0
15  0.0  0.0  0.0  1.0  1621.0       1.0        1.0
16  0.0  1.0  0.0  0.0  3971.0       0.0        1.0
17  0.0  1.0  0.0  0.0  3151.0       3.0        2.0
18  0.0  0.0

In [176]:
# initiate chosen model
chosen_model = model_5

# Refit the chosen model on all of the labelled data (training and validation rows together).
chosen_model.fit(X, y)

# OH_test lists the direction columns first (E, N, S, W, area, ...), while the
# model was fitted on X, whose order is area, bedrooms, bathrooms, N, S, E, W.
# Select the test columns in X's order so every value is read as the feature it
# really is; otherwise the predictions are wrong (or scikit-learn rejects the input).
prediction_test = chosen_model.predict(OH_test[list(X.columns)])

# Put the predictions next to the original test rows and write the result to disk.
output_data = pd.DataFrame({'sale price' : prediction_test})
output = pd.concat([test_data, output_data], axis=1)
output.to_csv('submission.csv', index=False)