# In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

# Load the dataset; the 'Operatieduur' column is split off as the regression
# target and every remaining column stays in X as a feature.
X = pd.read_csv('data/data-SNA1.csv')
y = X.pop('Operatieduur').values

# Fixed seed so repeated runs build the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# Forest-level importances, their spread across the individual trees, and the
# feature positions ordered from most to least important. These three are only
# consumed by the (currently commented-out) ranking/plot code further down.
importances = model.feature_importances_
std = np.asarray(
    [estimator.feature_importances_ for estimator in model.estimators_]
).std(axis=0)
indices = importances.argsort()[::-1]

# NOTE: predictions are made on the same rows the model was fitted on, so the
# figures printed here are in-sample scores, not held-out performance.
predicted = model.predict(X)
for metric_label, metric_fn in (
    ('\nR2:', r2_score),
    ('EVS:', explained_variance_score),
    ('MAE:', mean_absolute_error),
    ('MSE:', mean_squared_error),
):
    print(metric_label, metric_fn(y, predicted))

# # Print the feature ranking
# print("Feature ranking:")

# for f in range(10):
#     print("%d. %s (%f) (%f)" % (f + 1, X.columns[indices[f]], importances[indices[f]], std[indices[f]]))

# print(np.mean(np.absolute(predicted - y)))

# # Plot the feature importances of the forest
# plt.figure(figsize=[8.0,8.0])
# plt.ylabel('Feature importance')
# plt.xlabel('Feature')
# plt.bar(indices, importances[indices],
#        color="r", yerr=std[indices], align="center")
# plt.xticks(range(X.shape[1]), X.columns[indices])
# plt.xlim([-1, X.shape[1]])
# plt.show()


# plt.figure(figsize=[8.0,8.0])
# plt.xlabel('Actual')
# plt.ylabel('Predicted')
# plt.scatter(y, predicted)
# plt.plot(y, y, color='r')
# plt.grid(True)
# plt.show()

# plt.figure(figsize=[8.0,8.0])
# plt.xlabel('Actual')
# plt.ylabel('Predicted - Actual')
# plt.scatter(y, predicted - y)
# plt.grid(True)
# plt.show()

# Mean R^2 over the cv=10 splits produced by cross_val_score.
score = cross_val_score(model, X, y, scoring='r2', cv=10).mean()

# One print call that emits the same text as three consecutive prints would:
# the header line, the score, then a trailing "\n".
print("\ncross validation score: ", score, "\n", sep="\n")