In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
import joblib
from datetime import datetime
import concurrent

BASE_DIR = "../"
PASQUINIS_PATH = BASE_DIR + "traces-netsoft-2017"
DATE = datetime.now().isoformat(timespec='seconds')
BASE_RESULTS_PATH = f'./resultados_pre_pesquisa/{DATE}'
MODELS_DIR = "models/"
MODELS_PATH_PREFIX = f'models/{DATE}_'

TRACES=[
    #"KV-BothApps-FlashcrowdLoad",
    #"KV-BothApps-PeriodicLoad",
    #"KV-SingleApp-FlashcrowdLoad",
    #"KV-SingleApp-PeriodicLoad",
    #"VoD-BothApps-FlashcrowdLoad",
    #"VoD-BothApps-PeriodicLoad",
    #"VoD-SingleApp-FlashcrowdLoad",
    "VoD-SingleApp-PeriodicLoad"]

NROWS = None
PERSIST = False
Y_METRIC = 'DispFrames'
VOD_SINGLEAPP_PERIODIC_LOAD_PATH = f'{PASQUINIS_PATH}/VoD-SingleApp-PeriodicLoad'

def nmae(y_pred, y_test):
    return abs(y_pred - y_test).mean() / y_test.mean()

if PERSIST:
    results_path = f'{BASE_RESULTS_PATH}'
    try:
        os.makedirs(results_path)
        os.makedirs(MODELS_DIR)
    except FileExistsError:
        print("Já criado diretório...")

In [3]:
y_dataset = pd.read_csv(f'{VOD_SINGLEAPP_PERIODIC_LOAD_PATH}/Y.csv', header=0, nrows=NROWS, index_col=0, usecols=['TimeStamp', 'DispFrames'], low_memory=True).apply(pd.to_numeric, errors='coerce').fillna(0)

x_files = ['X_cluster.csv', 'X_flow.csv', 'X_port.csv']

x_trace = pd.DataFrame()

for x_file in x_files:
    read_dataset = pd.read_csv(f'{VOD_SINGLEAPP_PERIODIC_LOAD_PATH}/{x_file}', 
                              header=0, index_col=0, low_memory=True, nrows=NROWS).apply(pd.to_numeric, errors='coerce').fillna(0)
    if len(x_trace.columns) != 0:
        x_trace.merge(read_dataset, how="inner",
            on="TimeStamp", copy=False)
    else:
        x_trace = read_dataset



In [12]:
X_train, X_test, y_train, y_test = train_test_split(x_trace, y_dataset, test_size=0.7, random_state=42)


In [8]:
import time

In [21]:

regression_tree = DecisionTreeRegressor() 

tempo_reg_tree = time.time()
regression_tree.fit(X_train, y_train)
tempo_reg_tree = time.time() - tempo_reg_tree

if PERSIST:
    joblib.dump(regression_tree, f'{MODELS_PATH_PREFIX}_model_regression-tree.sav')

print(f'Training Time Reg Tree: {tempo_reg_tree}')
pred_reg_tree = regression_tree.predict(X_test)
print(f'NMAE: {nmae(pred_reg_tree, y_test["DispFrames"])}')

Training Time Reg Tree: 32.23722505569458
NMAE: 0.10112599697648961


In [25]:
regr_random_forest = RandomForestRegressor(n_estimators=120, random_state=42, n_jobs=-1)

tempo_random_forest = time.time()
regr_random_forest.fit(X_train, y_train)
tempo_random_forest = time.time() - tempo_random_forest

if PERSIST:
    joblib.dump(regr_random_forest, f'{MODELS_PATH_PREFIX}_model_random-forest.sav')

print(f'Training Time Random Forest: {tempo_random_forest}')
pred_random_forest = regr_random_forest.predict(X_test)


  return fit_method(estimator, *args, **kwargs)


Training Time Random Forest: 320.60004210472107


In [26]:
print(f'NMAE: {nmae(pred_random_forest, y_test["DispFrames"])}')

NMAE: 0.0934177251849149


In [None]:
def get_nmae_random_forest_regression_tree(X, Y) -> tuple:
    print("Splitando em testes")
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=42)
    
    print("Treinando Decision tree")
    regression_tree = DecisionTreeRegressor() # a classification or regression decision tree is used as a predictive model to draw conclusions about a set of observations. 
    regression_tree.fit(X_train, y_train)

    joblib.dump(regression_tree, f'{MODELS_PATH_PREFIX}_model_regression-tree.sav')

    print("Treinando random forest")
    regr_random_forest = RandomForestRegressor(n_estimators=120, random_state=42, n_jobs=-1)
    regr_random_forest.fit(X_train, y_train)

    joblib.dump(regr_random_forest, f'{MODELS_PATH_PREFIX}_model_random-forest.sav')

    y_random_forest = regr_random_forest.predict(X_test)
    y_reg_tree = regression_tree.predict(X_test)
    
    return (nmae(y_random_forest, y_test), nmae(y_reg_tree, y_test))


def get_correlated_columns(x_file, group_column, y_dataset, correlation, Y_METRIC=Y_METRIC):
    column_dataset = pd.read_csv(f'{VOD_SINGLEAPP_PERIODIC_LOAD_PATH}/{x_file}', index_col=0, usecols=np.append(['TimeStamp'], group_column),nrows=NROWS).apply(pd.to_numeric, errors='coerce').fillna(0)
    for column in group_column:
        corr_to_y = abs(column_dataset[column].corr(y_dataset[Y_METRIC]))
        if corr_to_y > correlation:
            print(f'{column} abs corr to y {corr_to_y}, putting on x_trace')
        else:
            column_dataset = column_dataset.drop([column], axis=1)
    if column_dataset.isnull().values.any():
        print(column_dataset.columns.to_list())
    return column_dataset

