# Random Forest Regressors

Using the avocado.csv dataset to predict average_price

In [16]:
import pandas as pd
# read data from avacado.csv
file_path = "data/avocado.csv"
avocado_data = pd.read_csv(file_path)

## Preparing Data


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [18]:
avocado_data.head()

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [19]:
avocado_data.to_numpy().shape

(18249, 14)

### Drop first column

In [20]:
avocado_data = avocado_data.drop(['Unnamed: 0'], axis = 1)

### Add separate 'day' and 'month' columns

In [21]:
avocado_data["Date"] = pd.to_datetime(avocado_data["Date"])
avocado_data["day"] = avocado_data["Date"].map(lambda x: x.day)
avocado_data["month"] = avocado_data["Date"].map(lambda x: x.month)
#avocado_data.head()

### Choose features

In [22]:
features = ['Total Volume', '4046', '4225', '4770', 
            'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 
            'type',
            'year', 'region', 'day', 'month']

### Set y and x

In [23]:
# predict AveragePrice
y = avocado_data.AveragePrice
# set x
x = avocado_data[features]

In [24]:
# Divide data
x_train, x_valid, y_train, y_valid = train_test_split(x, y, random_state = 0)

In [25]:
x_valid.shape

(4563, 13)

In [26]:
x_train.shape

(13686, 13)

### Use label encoding for object columns

In [27]:
# Do label encoding for object columns
s = (x_train.dtypes == 'object')
object_cols = list(s[s].index)

label_encoder = LabelEncoder()
for col in object_cols:
    x_train[col] = label_encoder.fit_transform(x_train[col])
    x_valid[col] = label_encoder.transform(x_valid[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
x_train.head()

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,day,month
3291,266719.89,5370.62,122058.81,1567.4,137723.06,29123.32,108519.74,80.0,0,2016,9,11,9
313,4695737.21,1676601.43,1543280.76,266689.82,1209165.2,1061703.58,136747.1,10714.52,0,2015,6,20,12
15130,9801.89,66.38,4585.79,175.49,4974.23,4970.9,3.33,0.0,1,2017,7,27,8
7238,393289.53,10135.32,314676.72,2188.89,66288.6,34182.37,32016.23,90.0,0,2017,30,21,5
1595,311879.23,1960.76,253562.76,79.62,56276.09,56259.03,17.06,0.0,0,2015,30,26,4


### Create a standardized copy of data

In [29]:
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_valid_scaled = scaler.fit_transform(x_valid)
x_train_scaled[0]

array([-0.16770851, -0.2259551 , -0.14188446, -0.20169969, -0.10057488,
       -0.20429623,  0.23099488, -0.17159337, -0.99271976, -0.155843  ,
       -1.12015016, -0.53506182,  0.81015842])

### Create a normalized copy of data

In [30]:
x_train_norm = preprocessing.normalize(x_train, norm = 'l1')
x_valid_norm = preprocessing.normalize(x_valid, norm = 'l1')
x_train_norm[0]

array([3.96192489e-01, 7.97765516e-03, 1.81309252e-01, 2.32825571e-03,
       2.04577326e-01, 4.32605182e-02, 1.61197974e-01, 1.18834029e-04,
       0.00000000e+00, 2.99461753e-03, 1.33688283e-05, 1.63396790e-05,
       1.33688283e-05])

## Use Random Forest Regressor


In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import max_error

### Using data that isn't normalized/scaled

In [33]:
model = RandomForestRegressor(random_state = 0)
model.fit(x_train, y_train)
predicted_avgPrices = model.predict(x_valid)

