In [1]:
import os
import sys
import numpy as np
import pandas as pd
import datetime
import time
import itertools
import pickle

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

path_main_folder = '/home/antorosi/Documents/Prediction'
sys.path.append(path_main_folder)

from conso.load_shape_data import load_data_conso, get_uniformed_data_conso, change_granularity, get_x_y_prediction_conso, get_train_test_sets, normalized_dataset, select_variables

### Load and shape data 

In [2]:
# Load
path_data = os.path.join(path_main_folder, 'data')
dict_data_conso = load_data_conso(path_data)

# Uniformization
data_conso_df, dict_colnames_conso = get_uniformed_data_conso(dict_data_conso)

# Granularity from 15 min to 1H
data_conso_df = change_granularity(data_conso_df, granularity="1H")

# Get x and y from prediction
x_conso, y_conso, dict_colnames_conso = get_x_y_prediction_conso(data_conso_df, dict_colnames_conso, lags=[24,48])

### Cross-validation parameters 

In [3]:
# folder to store results
path_out = os.path.join(path_main_folder, 'out', 'cv_cmca_model_rf_0')
if not os.path.exists(path_out):
    os.mkdir(path_out)

In [4]:
# variables used for input
selected_variables = ['conso', 'calendar', 'meteo']
gen_name = 'cmca'

In [5]:
# Test periods for each K step of the cross-validation
cv_periods = {}
cv_periods['period_1'] = (datetime.datetime(year=2013, month=1, day=1), datetime.datetime(year=2013, month=12, day=31))
cv_periods['period_2'] = (datetime.datetime(year=2014, month=1, day=1), datetime.datetime(year=2014, month=12, day=31))
cv_periods['period_3'] = (datetime.datetime(year=2015, month=1, day=1), datetime.datetime(year=2015, month=12, day=31))
cv_periods['period_4'] = (datetime.datetime(year=2016, month=1, day=1), datetime.datetime(year=2016, month=12, day=31))
cv_periods['period_5'] = (datetime.datetime(year=2017, month=1, day=1), datetime.datetime(year=2017, month=12, day=31))


In [6]:
# Getting each datasets
dict_datasets = {}
for key, date_period in cv_periods.items():
    x_conso_selected_var = select_variables(x_conso, dict_colnames_conso, selected_variables)
    dataset, dict_ds = get_train_test_sets(x_conso_selected_var, y_conso, date_period[0], date_period[1])
    dataset = normalized_dataset(dataset, dict_colnames_conso)
    
    dict_datasets[key] = {'dataset': dataset, 'dict_ds': dict_ds}

In [7]:
# Prepare results wrap up 
results_df = pd.DataFrame(columns=['name', 'train_mse',
                                           'train_mae', 'train_mape',
                                           'test_mse', 'test_mae',
                                           'test_mape'])
path_results = path_out

### Training

In [9]:
idx = 0

for name_period, el in dict_datasets.items():
    dataset = el['dataset']

    print('========================= Model {} ========================='.format(idx+1))

    # Prepare model characteristics and folders
    name_model = '{}_RF_{}_norm'.format(name_period, gen_name)

    path_model = os.path.join(path_out, name_model)
    if not os.path.exists(path_model):
        os.mkdir(path_model)
    
    # Compile model
    model = linear_model.LinearRegression(n_jobs=-1)
    model = RandomForestRegressor(random_state=0, n_estimators=350, max_depth=50, min_samples_leaf=2,
                                  n_jobs=-1, max_features=1/3)
    
    # Train model
    model.fit(X=dataset['train']['x'], y=np.ravel(dataset['train']['y']))
    
    # Save model
    with open(os.path.join(path_model,'model.pickle'),'wb') as f:
        pickle.dump(model,f)

    # Get results
    y_train = np.ravel(dataset['train']['y'])
    y_hat_train = model.predict(dataset['train']['x'])
    y_test= np.ravel(dataset['test']['y'])
    y_hat_test = model.predict(dataset['test']['x'])
    
    result = {}
    result['name'] = name_model
    
    result['train_mse'] = mean_squared_error(y_train, y_hat_train)
    result['train_mae'] = mean_absolute_error(y_train, y_hat_train)
    result['train_mape'] = mean_absolute_percentage_error(y_train, y_hat_train)
    result['test_mse'] = mean_squared_error(y_test, y_hat_test)
    result['test_mae'] = mean_absolute_error(y_test, y_hat_test)
    result['test_mape'] = mean_absolute_percentage_error(y_test, y_hat_test)

    # Append result to results_df
    
    results_df= results_df.append(result, ignore_index=True)
    results_df.to_csv(os.path.join(path_results, 'cv_results.csv'), sep=';')

    idx += 1




In [8]:
with open(os.path.join(path_out, 'dict_datasets.pickle'),'wb') as f:
    pickle.dump(dict_datasets, f)

In [8]:
date_period = [datetime.datetime(2013,1,1), datetime.datetime(2017,12,31)]

x_conso_selected_var = select_variables(x_conso, dict_colnames_conso, selected_variables)
dataset, dict_ds = get_train_test_sets(x_conso_selected_var, y_conso, date_period[0], date_period[1])
dataset = normalized_dataset(dataset, dict_colnames_conso)

dataset = dataset['test']
dict_ds=dict_ds['test']

x = dataset['x']
y = dataset['y']

In [9]:
train_indices = list()
test_indices = list()

for year in [2013,2014,2015,2016,2017]:
    mask = dict_ds.dt.year == year
    test_indices.append(np.where(mask)[0])
    train_indices.append(np.where(np.invert(mask))[0])

custom_cv = zip(train_indices, test_indices)

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 400, num = 8)]
# Number of features to consider at every split
max_features = ['sqrt', 'auto']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10, 15, 20]
# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [14]:
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(scoring='neg_mean_squared_error', estimator = rf, param_distributions = random_grid, n_iter = 1, cv = custom_cv, verbose=2, random_state=44, n_jobs = 6)

# Fit the random search model
rf_random.fit(X=x, y=np.ravel(y))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt 
[CV] min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt 
[CV] min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt 
[CV] min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt 
[CV] min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt 
[CV]  min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt, total=  54.2s
[CV]  min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt, total=  55.2s
[CV]  min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt, total=  55.3s


[Parallel(n_jobs=6)]: Done   3 out of   5 | elapsed:   58.2s remaining:   38.8s


[CV]  min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt, total= 1.1min
[CV]  min_samples_leaf=2, min_samples_split=5, n_estimators=200, max_depth=100, max_features=sqrt, total= 1.1min


[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:  1.2min finished


RandomizedSearchCV(cv=<zip object at 0x7f1918535f48>, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=1, n_jobs=6,
          param_distributions={'min_samples_leaf': [1, 2, 4, 6, 8, 10, 15, 20], 'min_samples_split': [2, 5, 10], 'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110], 'max_features': ['sqrt', 'auto']},
          pre_dispatch='2*n_jobs', random_state=44, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=2)

In [21]:
with open(os.path.join(path_out,'rf_random.pickle'), 'wb') as f:
    pickle.dump(rf_random,f)

with open(os.path.join(path_out,'rf_random.pickle'), 'rb') as f:
    rf_random1 = pickle.load(f)

In [17]:
with open(os.path.join(path_out,'results.pickle'), 'wb') as f:
    pickle.dump(rf_random.cv_results_,f)

In [18]:
with open(os.path.join(path_out,'results.pickle'), 'rb') as f:
    results = pickle.load(f)

In [16]:
rf_random.cv_results_



{'mean_fit_time': array([59.80345397]),
 'mean_score_time': array([0.61167545]),
 'mean_test_score': array([-8018984.22173199]),
 'mean_train_score': array([-818715.59532868]),
 'param_max_depth': masked_array(data=[100],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=['sqrt'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[2],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[5],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[200],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 100,
   'max_features': 'sqrt',
   'min_samples_leaf': 2,
   'min_samples_split': 5,
   'n_estimators': 200}],
 'rank_test_score': array([1], dtype=int3

In [8]:
rf_random.best_params_

NameError: name 'rf_random' is not defined

In [79]:
rf_random.best_score_

0.9790588560688929

In [89]:
rf.fit(x,np.ravel(y))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [91]:
result = rf.predict(x)

In [92]:
result.shape

(35108,)

In [94]:
result.flatten().shape

(35108,)

In [95]:
y.shape

(35108, 1)

In [100]:
result.reshape(-1,1).flatten().shape

(35108,)