# Random Forest Regressor

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

### Importing data

In [None]:
# Load the pickled training frame and split it into target and features.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df_train = pd.read_pickle(r"../input/train.pkl")
y_train = df_train["count"]  # regression target
X_train = df_train.drop(columns=["date", "count"])  # all remaining columns are features
df_train.head()

In [None]:
# Same preparation as for the training data: read the pickle, pull out the
# "count" target, and keep everything except "date" and "count" as features.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df_test = pd.read_pickle(r"../input/test.pkl")
y_test = df_test["count"]
X_test = df_test.drop(columns=["date", "count"])
df_test.head()

### Hyperparameter tuning using Grid Search

In [None]:
# Hyperparameter grid for the random forest (5 * 9 * 5 = 225 combinations).
# NOTE(review): max_features goes up to 14, which presumably requires X_train
# to have at least 14 feature columns -- confirm against the data.
parameters = {
    "min_samples_leaf": [1, 2, 3, 4, 5],
    "n_estimators": [80, 100, 120, 140, 160, 180, 200, 220, 240],
    "max_features": [6, 8, 10, 12, 14],
}

# Exhaustive cross-validated search, ranked by negated mean squared error and
# run on 4 parallel workers.
clf = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    n_jobs=4,
)
clf.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination the grid search ranked best under the
# "neg_mean_squared_error" scoring configured for clf.
clf.best_params_

In [None]:
# Evaluate the tuned search object on the test split.
# NOTE(review): the result of clf.score(...) is not assigned; when run as a
# notebook cell presumably only the final expression (the RMSE) is displayed.
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_mse ** 0.5  # square root of the MSE -> RMSE

### Training regressor

Retraining the best Random Forest Regressor found by the grid search on the combined train and test data.

In [None]:
# Refit the best estimator from the grid search on the test and train data
# combined.
# NOTE: `regressor` is the same object as clf.best_estimator_, so this refit
# happens in place on the estimator held by `clf`.
regressor = clf.best_estimator_

# DataFrame.append / Series.append were removed in pandas 2.0. pd.concat gives
# the same result: test rows first, then train rows, original index labels kept.
X_all = pd.concat([X_test, X_train])
y_all = pd.concat([y_test, y_train])
regressor.fit(X_all, y_all)

#### Testing regressor

The root-mean-squared error of this model is 17.11. However, the test data used for this evaluation was also part of the data the model was retrained on, so this was as expected.

In [None]:
# Score the retrained regressor on the test split (for scikit-learn regressors
# score() is the R^2 coefficient of determination). The regressor was fit on
# data that includes X_test, so this is not an out-of-sample score.
regressor.score(X_test, y_test)

In [None]:
# RMSE of the retrained regressor on the test split; y_pred is kept for the
# plotting cell further down.
y_pred = regressor.predict(X_test)
squared_error = mean_squared_error(y_test, y_pred)
squared_error ** 0.5

### Plotting the predictions

In [None]:
# Overlay the true targets and the model's predictions, both plotted against
# the sample position (0 .. len(y_test) - 1).
sample_positions = range(len(y_test))
for series, series_label in ((y_test, "real values"), (y_pred, "predictions")):
    plt.plot(sample_positions, series, label=series_label)

plt.legend()
plt.show()

### Validating regressor
Predict the values for the dates in validation.pkl and submit the predictions to the Kaggle competition.

In [None]:
# Read the pickled validation frame whose rows are to be predicted.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
validation_path = r"../input/validation.pkl"
df_validation = pd.read_pickle(validation_path)
df_validation.head()

In [None]:
# Feature matrix for the validation rows: everything except the date and the
# "Predicted" column that will be filled with the model output.
X_validate = df_validation.drop(columns=["date", "Predicted"])

In [None]:
# Predict the validation rows with the retrained regressor and store the
# result in the "Predicted" column.
y_validate = regressor.predict(X_validate)
df_validation = df_validation.assign(Predicted=y_validate)
df_validation.head()

### Writing validation data to .csv file

In [None]:
# Build the submission file: the date column becomes a string "id" in
# YYYYMMDD form, and only "id" and "Predicted" are written, without the index.
df_validation = df_validation.rename(columns={"date": "id"})
df_validation["id"] = df_validation["id"].dt.strftime("%Y%m%d")
df_validation.to_csv("../output/RFRval.csv", columns=["id", "Predicted"], index=False)