In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Decision Tree Regression v2

This notebook walks through the implementation of the second version of the Decision Tree Regressor.

### Importing data

Load the training and test data.

In [None]:
# Load the pickled training frame (path is relative to the notebook's directory).
# NOTE(review): unpickling executes code from the file; only load trusted data.
df_train = pd.read_pickle("../input/train.pkl")

# Target series and feature matrix: "date", the target itself and
# "Days from epoch" are excluded from the features.
y_train = df_train["count"]
X_train = df_train.drop(columns=["date", "count", "Days from epoch"])

# The later cells keep working with df_train without "Days from epoch".
df_train = df_train.drop(columns=["Days from epoch"])
df_train.head()

In [None]:
# Load the pickled test frame and split it the same way as the training data.
df_test = pd.read_pickle("../input/test.pkl")

y_test = df_test["count"]
X_test = df_test.drop(columns=["date", "count", "Days from epoch"])

df_test.head()

### Searching for correlation

Next, compute the correlation between each feature and the target `count`.

In [None]:
# Correlation of every numeric/boolean column with the target "count".
# Non-numeric columns are filtered out explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises instead
# (df_train presumably still contains the "date" column here -- verify).
df_train.select_dtypes(include=["number", "bool"]).corr()["count"]

In [None]:
# Select the features whose absolute correlation with "count" exceeds x.
x = 0.18

# Restrict to numeric/boolean columns first: since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of skipping them.
corr = df_train.select_dtypes(include=["number", "bool"]).corr()[["count"]]

# Remove the target by label, not by position. "count" always passes the
# threshold (its self-correlation is 1), but it is only the first row when it
# happens to be the first numeric column; slicing with [1:] would otherwise
# discard a real feature and keep the target in the feature list.
cols = corr[corr["count"].abs() > x].index.drop("count", errors="ignore").values
cols

Generate every non-empty combination of the selected (correlated) features.

In [None]:
import itertools

# Every non-empty subset of the selected columns, smallest subsets first
# (sizes 1 .. len(cols)); each subset is stored as a plain list of column names.
all_subsets = [
    list(combo)
    for size in range(1, len(cols) + 1)
    for combo in itertools.combinations(cols, size)
]

all_subsets

### Fit and predict in the search for the best combination

In [None]:
# Fit one decision tree per feature subset and record its RMSE on the test set.
# NOTE(review): the subsets are ranked on the same test set that is used to
# report the score, so the winning score is optimistic.
results = list()

for subset in all_subsets:

    # Fixed seed: the tree builder permutes features at every split, so
    # without random_state tied splits can differ between runs and change
    # which subset ranks best.
    clf = DecisionTreeRegressor(random_state=42)
    clf.fit(X_train[subset], y_train)

    y_pred = clf.predict(X_test[subset])
    score = mean_squared_error(y_test, y_pred)**0.5  # root mean squared error

    results.append((subset, score))

In [None]:
# Rank the (subset, rmse) pairs by ascending RMSE and keep the five best.
ranked = list(results)
ranked.sort(key=lambda entry: entry[1])
sorted_by_second = ranked[:5]

for feature_subset, rmse in sorted_by_second:
    print(f"Features:\n{feature_subset}\nScore:\n{rmse}\n")

Select the best features

In [None]:
# The first entry is the lowest-RMSE (subset, score) pair; keep its feature list.
best_features, _best_rmse = sorted_by_second[0]
best_features

In [None]:
# Fit the tree on the training data using only the best feature subset.
# A fixed random_state makes the fitted tree (and therefore the predictions
# written out later) reproducible across kernel restarts.
clf = DecisionTreeRegressor(random_state=42)
clf.fit(X_train[best_features], y_train)

### Using the validation set for predictions

In [None]:
# Load the pickled validation frame and preview its first two rows.
df_validation = pd.read_pickle("../input/validation.pkl")
df_validation.head(n=2)

In [None]:
# Feature matrix for the validation rows: everything except "date" and "Predicted".
X_validate = df_validation.drop(columns=["date", "Predicted"])

In [None]:
# Predict with the train-only tree and store the result in the "Predicted" column.
y_validate = clf.predict(X_validate[best_features])
df_validation = df_validation.assign(Predicted=y_validate)
df_validation.head()

Plot the validation predictions alongside the actual `count` values of the test set.

In [None]:
# Draw the test-set counts and the validation predictions on one shared axes.
fig, ax = plt.subplots(figsize=(14, 7))
df_test["count"].plot(ax=ax, label="real value")
df_validation["Predicted"].plot(ax=ax)

ax.legend()
plt.show()

### Writing validation data to .csv file

In [None]:
# Rename "date" to "id" and format it as a YYYYMMDD string, then write the
# two output columns to CSV without the index.
df_validation = (
    df_validation
    .rename(columns={"date": "id"})
    .assign(id=lambda frame: frame["id"].dt.strftime("%Y%m%d"))
)
submission = df_validation[["id", "Predicted"]]
submission.to_csv("../output/DTR_train.csv", index=False)

### Training regressor
Retrain the Decision Tree Regressor on the combined train and test data, using the best features found above.

In [None]:
# Refit on train + test rows, restricted to the best feature subset.
# pd.concat replaces DataFrame.append / Series.append, which were removed in
# pandas 2.0; like append, it keeps the original index labels.
# A fixed random_state keeps the fitted tree reproducible between runs.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(
    pd.concat([X_train[best_features], X_test[best_features]]),
    pd.concat([y_train, y_test]),
)

### Testing regressor

In [None]:
# RMSE of the refitted regressor on the test rows.
# NOTE(review): the regressor was fitted on train + test data, so these rows
# were seen during training and this is an in-sample figure.
y_pred = regressor.predict(X_test[best_features])
rmse_refit = mean_squared_error(y_test, y_pred) ** 0.5
rmse_refit

In [None]:
# Display the fitted estimator (its repr is shown as the cell output).
regressor

### Validating regressor
Predict the counts for the dates in `validation.pkl` and submit the result to the Kaggle competition.

In [None]:
# Reload the validation frame from disk so this pass starts from a fresh copy.
df_validation = pd.read_pickle("../input/validation.pkl")
df_validation.head(n=5)

In [None]:
# Feature matrix for the validation rows: everything except "date" and "Predicted".
X_validate = df_validation.drop(columns=["date", "Predicted"])

In [None]:
# Predict with the train+test regressor and store the result in "Predicted".
y_validate = regressor.predict(X_validate[best_features])
df_validation = df_validation.assign(Predicted=y_validate)
df_validation.head()

In [None]:
# Draw the test-set counts and the new validation predictions on one shared axes.
fig, ax = plt.subplots(figsize=(14, 7))
df_test["count"].plot(ax=ax, label="real value")
df_validation["Predicted"].plot(ax=ax)

ax.legend()
plt.show()

### Writing validation data to .csv file

In [None]:
# Rename "date" to "id" and format it as a YYYYMMDD string, then write the
# two output columns to CSV without the index.
df_validation = (
    df_validation
    .rename(columns={"date": "id"})
    .assign(id=lambda frame: frame["id"].dt.strftime("%Y%m%d"))
)
submission = df_validation[["id", "Predicted"]]
submission.to_csv("../output/DTR_train_test.csv", index=False)