# Random Forest Regressor #

## 1 - Importing packages and data ##

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Load the cleaned advertising/sales dataset and preview its first five rows.
csv_path = "../datasets/advertising_and_sales_clean.csv"
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,tv,radio,social_media,influencer,sales
0,16000.0,6566.23,2907.98,Mega,54732.76
1,13000.0,9237.76,2409.57,Mega,46677.9
2,41000.0,15886.45,2913.41,Mega,150177.83
3,83000.0,30020.03,6922.3,Mega,298246.34
4,15000.0,8437.41,1406.0,Micro,56594.18


In [3]:
# Sanity check: report the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(4546, 5)


## 2 - Preparing data ##

### 2.1 - One-hot encoding categorical variables ###

In [4]:
# Turn the categorical `influencer` column into 0/1 indicator columns.
# drop_first=True omits one category so the indicators are not redundant.
df_dummies = pd.get_dummies(df['influencer'], drop_first=True).astype('int')

# Append the indicators and remove the original text column in one chain.
# NOTE: this cell rebinds `df`, so re-running it without reloading the data
# fails because 'influencer' no longer exists.
df = pd.concat([df, df_dummies], axis=1).drop(columns='influencer')

### 2.2 - Separating X and y ###

In [5]:
# Target vector: the `sales` column as a NumPy array.
y = df['sales'].to_numpy()

# Feature matrix: every remaining column as a NumPy array.
X = df.drop(columns='sales').to_numpy()

### 2.3 - Separating train and test samples ###

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

## 3 - Grid Search ##

In [7]:
# Search space: 10 depths x 4 leaf fractions x 2 feature rules x 3 forest sizes
# = 240 candidate configurations.
# NOTE(review): with only a handful of features, 'log2' and 'sqrt' may resolve
# to the same number of features per split, doubling the search for no gain —
# confirm before keeping both.
params_rf = dict(
    max_depth=np.arange(2, 12),
    min_samples_leaf=[0.04, 0.06, 0.08, 0.1],
    max_features=['log2', 'sqrt'],
    n_estimators=[300, 400, 500],
)

# Base estimator with a fixed seed so repeated searches give the same forests.
rf = RandomForestRegressor(random_state=42)

# 10-fold cross-validated exhaustive search, scored by negative MSE
# (higher is better), using all available cores.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=10,
    verbose=1,
    n_jobs=-1,
)

In [8]:
# Run the cross-validated search on the training split.
grid_rf.fit(X_train, y_train)

# Report the winning configuration and its mean CV score (negative MSE).
best_params, best_score = grid_rf.best_params_, grid_rf.best_score_
print(best_params, best_score)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
{'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 0.04, 'n_estimators': 300} -540731205.4118304


In [9]:
# Take the estimator selected by the grid search and score it on the
# held-out test split using mean squared error.
best_model = grid_rf.best_estimator_
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)

495937112.2455533


## 4 - Scaled Grid Search ##

### 4.1 - Importing packages ###

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

### 4.2 - Scaling data ###

In [8]:
# Learn per-column mean and standard deviation from the training split only,
# then apply that same transformation to both splits so no test-set
# statistics influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Show the standardized training matrix as text.
scaled_preview = str(X_train_scaled)
print(scaled_preview)

[[ 1.60426873  1.60708424  2.44574963 -0.5845489   1.72824828 -0.57607997]
 [-1.22133784 -1.60538654 -0.24621884 -0.5845489  -0.57862057 -0.57607997]
 [-0.76313137  0.15326006  0.77539406 -0.5845489  -0.57862057 -0.57607997]
 ...
 [-0.7249475  -0.16447676 -0.17621924 -0.5845489  -0.57862057  1.73587011]
 [-0.61039588 -0.9458863  -1.10888607 -0.5845489  -0.57862057 -0.57607997]
 [ 0.45875256  1.13805146  1.70761508 -0.5845489   1.72824828 -0.57607997]]


### 4.3 - Grid setting ###

In [12]:
# Same search space as the unscaled run, so the two experiments differ only in
# the input features: 10 x 4 x 2 x 3 = 240 candidate configurations.
params_rf = dict(
    max_depth=np.arange(2, 12),
    min_samples_leaf=[0.04, 0.06, 0.08, 0.1],
    max_features=['log2', 'sqrt'],
    n_estimators=[300, 400, 500],
)

# Fresh estimator and search object; these rebind `rf` and `grid_rf`.
rf = RandomForestRegressor(random_state=42)
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=10,
    verbose=1,
    n_jobs=-1,
)

### 4.4 - Fitting and finding the best model ###

In [13]:
# Run the cross-validated search on the standardized training features.
grid_rf.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean CV score (negative MSE).
best_params, best_score = grid_rf.best_params_, grid_rf.best_score_
print(best_params, best_score)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits


{'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 0.04, 'n_estimators': 300} -540727433.0877445


### 4.5 - Evaluating the best model ###

In [14]:
# Score the selected estimator on the standardized test split; the inputs must
# be scaled with the same fitted scaler as the training data.
best_model = grid_rf.best_estimator_
y_pred = best_model.predict(X_test_scaled)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)

495970004.96001434
