# 0 Imports

In [44]:
import pickle
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

from matplotlib import pyplot as plt

from IPython.display        import Image
from IPython.core.display   import HTML

from sklearn.feature_selection  import RFE
from sklearn.linear_model       import LinearRegression
from sklearn.model_selection    import cross_val_score
from sklearn.metrics            import mean_squared_error
from lightgbm                   import LGBMRegressor

In [45]:
# Disable scientific notation: numpy prints small floats in plain decimal
# form and pandas renders every float with six fixed decimal places.
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:.6f}'.format

## 0.1 Funções Suporte

In [46]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [18,8]
    plt.rcParams['font.size'] = 14
    
    display( HTML('<style>.container { width: 100% !important;} </style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False)
    
    sns.set()
    
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [47]:
def abrir_arquivo()-> pd.DataFrame:
    """Load the pickled input dataset and return it.

    The file location is read from the notebook-level globals
    ``__CAMINHO_INTERIM`` (directory) and ``__NOME_ARQUIVO_IMPORTAR``
    (file name), which must be defined before this function is called.

    Returns:
        The object stored in the pickle (annotated as a DataFrame).

    Note:
        ``pickle.load`` can execute arbitrary code while deserialising;
        only point this at files produced by trusted pipeline steps.
    """
    caminho_entrada = f"{__CAMINHO_INTERIM}{__NOME_ARQUIVO_IMPORTAR}"
    with open(caminho_entrada, "rb") as arquivo:
        return pickle.load(arquivo)

In [77]:
def exportar_df(df: pd.DataFrame, nome_arquivo: str):
    """Write ``df`` as a pickle file into the processed-data directory.

    Args:
        df: Frame to persist.
        nome_arquivo: Output file name without extension; ``.pkl`` is appended.

    The target directory comes from the notebook-level global
    ``__CAMINHO_PROCESSED``, which must be defined before the call.
    """
    destino = f'{__CAMINHO_PROCESSED}{nome_arquivo}.pkl'
    df.to_pickle(destino)

## 0.2 Load Data

In [79]:
# Directories (relative to the notebook location) used by the helpers above:
# abrir_arquivo() reads from the interim folder, exportar_df() writes to the
# processed folder.
__CAMINHO_INTERIM = '../data/interim/'
__CAMINHO_PROCESSED = '../data/processed/'

# File name of the pickle loaded by abrir_arquivo().
__NOME_ARQUIVO_IMPORTAR = '5_0_preparacao_dados.pkl'
#__NOME_ARQUIVO_EXPORTAR = '6_0_selecao_vars.pkl'

# 6.0 Seleção de Variáveis

In [50]:
# Load the interim pickle configured in section 0.2 into `df`.
df = abrir_arquivo()

In [51]:
# Show column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13268 entries, 46044 to 59311
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      13268 non-null  datetime64[ns]
 1   resource_guid             13268 non-null  float64       
 2   service_name              13268 non-null  float64       
 3   service_type              13268 non-null  float64       
 4   service_region            13268 non-null  float64       
 5   service_resource          13268 non-null  float64       
 6   quantity                  13268 non-null  float64       
 7   cost                      13268 non-null  float64       
 8   ano                       13268 non-null  int32         
 9   mes                       13268 non-null  int32         
 10  dia                       13268 non-null  int32         
 11  semana_ano                13268 non-null  UInt32        
 12  ano_semana         

## 6.1 Train Test

In [52]:
# Latest and earliest dates in the dataset, used to choose the split cut-offs.
print(df['date'].max(), df['date'].min(), sep='\n')

2024-10-30 00:00:00
2022-10-31 00:00:00


In [53]:
# Time-based hold-out for the feature-selection experiments: rows dated up to
# 2024-09-30 are used for fitting, later rows are held out. The target
# ('cost') is copied out before the feature frames are trimmed in the next cell.
X_train = df.loc[df['date'] <= '2024-09-30']
y_train = X_train['cost']
print(X_train['date'].max(), X_train['date'].min(), sep='\n')

X_test = df.loc[df['date'] > '2024-09-30']
y_test = X_test['cost']
print(X_test['date'].max(), X_test['date'].min(), sep='\n')

2024-09-30 00:00:00
2022-10-31 00:00:00
2024-10-30 00:00:00
2024-10-01 00:00:00


In [54]:
# Remove the target ('cost') and the raw timestamp ('date') from the feature
# matrices so the selectors below only see candidate predictors.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError. Nothing is
# masked by this, because the split cell above already requires both columns.
X_train = X_train.drop(columns=['cost', 'date'], errors='ignore')
X_test = X_test.drop(columns=['cost', 'date'], errors='ignore')

## 6.2 Seleção com RFE (Regressão Linear)

In [55]:
# Base estimator for recursive feature elimination: an ordinary
# least-squares linear regression.
lr = LinearRegression()

# Fit RFE on the training data (default number of features to keep), then
# reduce both matrices to the surviving columns.
rfe = RFE(estimator=lr)
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Refit the linear model on the reduced matrix and score it on the hold-out.
lr.fit(X_train_rfe, y_train)
y_pred = lr.predict(X_test_rfe)

print("MSE:", mean_squared_error(y_test, y_pred))
print("Features Selecionadas:", np.flatnonzero(rfe.support_))

MSE: 0.07599180437149725
Features Selecionadas: [ 0  1  2  3  4  5  6  7 11 14 15 18 19]


In [56]:
# Names of the columns kept by RFE (boolean mask applied to the column index).
X_train.columns[rfe.support_].to_list()

['resource_guid',
 'service_name',
 'service_type',
 'service_region',
 'service_resource',
 'quantity',
 'ano',
 'mes',
 'ano_mes',
 'custo_fixo_diario',
 'custo_fixo_semanal',
 'custo_soma_movel_mensal',
 'mes_sen']

## 6.3 Seleção com LGBM

In [57]:
# Fit a LightGBM regressor on every candidate feature; the fitted model's
# feature importances are used as the ranking signal.
lgb_model = LGBMRegressor(n_jobs=-1, random_state=42)
X_train_sel = X_train
lgb_model.fit(X_train_sel, y_train)

# One importance value per column of X_train_sel, in column order.
importances = lgb_model.feature_importances_

# Build the ranking table in a single constructor call instead of growing a
# frame with pd.concat inside a loop (which is quadratic and gave every row
# the same index label 0).
print('Feature ranking')
df_lgbm = pd.DataFrame({
    'feature': X_train_sel.columns,
    'importance': importances,
})

print(df_lgbm.sort_values('importance', ascending=False))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000896 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1592
[LightGBM] [Info] Number of data points in the train set: 12698, number of used features: 27
[LightGBM] [Info] Start training from score 0.204476
Feature ranking
                    feature  importance
0                  quantity         898
0         custo_fixo_diario         508
0             resource_guid         268
0          service_resource         248
0              service_type         221
0                ano_semana         166
0              service_name         147
0   custo_soma_movel_mensal          71
0        custo_fixo_semanal          66
0            service_region          60
0                semana_ano          57
0            semana_ano_cos          50
0  custo_soma_movel_semanal          38
0         custo_fi

## 6.4 Seleção com RandomForest

In [58]:
from sklearn.ensemble import RandomForestRegressor

In [59]:
print("5. Random Forest Feature Importance:")

# Seeded random forest fitted on all candidate features; its impurity-based
# importances give a second, independent ranking.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Pair each column with its importance and order from most to least important.
rf_importance = (
    pd.DataFrame({'feature': X_train.columns,
                  'importance': rf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

rf_importance

5. Random Forest Feature Importance:


Unnamed: 0,feature,importance
2,service_type,0.424044
4,service_resource,0.253409
5,quantity,0.246996
0,resource_guid,0.059187
14,custo_fixo_diario,0.005365
10,ano_semana,0.003737
1,service_name,0.002697
3,service_region,0.001009
11,ano_mes,0.000607
20,mes_cos,0.000463


## 6.5 Seleção com Boruta

In [60]:
from boruta import BorutaPy

In [61]:
# Boruta wraps a random forest and runs its iterative selection on the
# training data. Note that `rf` is rebound here, replacing the forest
# fitted in the previous section.
# NOTE(review): the forest itself is created without a seed; presumably
# BorutaPy's random_state=42 governs reproducibility -- confirm.
rf = RandomForestRegressor()
boruta = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    verbose=2,
    random_state=42,
).fit(X_train, y_train)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	12
Tentative: 	6
Rejected: 	9
Iteration: 	9 / 100
Confirmed: 	12
Tentative: 	6
Rejected: 	9
Iteration: 	10 / 100
Confirmed: 	12
Tentative: 	6
Rejected: 	9
Iteration: 	11 / 100
Confirmed: 	12
Tentative: 	6
Rejected: 	9
Iteration: 	12 / 100
Confirmed: 	12
Tentative: 	6
Rejected: 	9
Iteration: 	13 / 100
Confirmed: 	12
Tentative: 	5
Rejected: 	10
Iteration: 	14 / 100
Confirmed: 	12
Tentative: 	5
Rejected: 	10
Iteration: 	15 / 100
Confirmed: 	12
Tentative: 	5
Rejected: 	10
Iteration: 	16 / 100
Confirmed: 	12
Tentative: 	4
Rejected: 	

In [62]:
# Boolean mask (as a plain list) marking the columns Boruta confirmed.
cols_selected = boruta.support_.tolist()

# Confirmed features, in the original column order.
cols_selected_boruta = X_train.columns[boruta.support_].to_list()

# Remaining features (np.setdiff1d returns them sorted and de-duplicated).
cols_not_selected_boruta = list( np.setdiff1d( X_train.columns, cols_selected_boruta ) )

In [63]:
# Display the features Boruta marked as selected.
cols_selected_boruta

['resource_guid',
 'service_name',
 'service_type',
 'service_region',
 'service_resource',
 'quantity',
 'semana_ano',
 'ano_semana',
 'ano_mes',
 'custo_fixo_diario',
 'custo_soma_movel_mensal',
 'mes_cos',
 'semana_ano_cos']

## 6.5 Filtro do Dataframe

In [64]:
# Hand-picked column list for the filtered frame. The order below is the
# column order of the resulting DataFrame. The list does not mirror any
# single selector's output exactly (e.g. Boruta also confirmed
# 'resource_guid' and 'service_resource', which are absent here).
selecao_de_colunas = [
    # timestamp and target
    'date', 'cost',
    # service descriptors and usage
    'service_name', 'service_type', 'service_region', 'quantity',
    # calendar features
    'mes', 'semana_ano', 'ano_semana', 'ano_mes',
    # cost aggregates
    'custo_fixo_diario', 'custo_soma_movel_semanal', 'custo_soma_movel_mensal',
    # cyclical encodings and day features
    'mes_cos', 'semana_ano_cos', 'dia', 'dia_cos',
]

In [65]:
# Display every column of the unfiltered frame, to cross-check the selection list.
df.columns

Index(['date', 'resource_guid', 'service_name', 'service_type',
       'service_region', 'service_resource', 'quantity', 'cost', 'ano', 'mes',
       'dia', 'semana_ano', 'ano_semana', 'ano_mes', 'dia_da_semana',
       'fim_de_semana', 'custo_fixo_diario', 'custo_fixo_semanal',
       'custo_fixo_mensal', 'custo_soma_movel_semanal',
       'custo_soma_movel_mensal', 'mes_sen', 'mes_cos', 'dia_sen', 'dia_cos',
       'semana_ano_sen', 'semana_ano_cos', 'dia_da_semana_sen',
       'dia_da_semana_cos'],
      dtype='object')

In [66]:
# Keep only the selected columns, in the order given by the list.
df_filter = df[selecao_de_colunas]

In [67]:
# Show column names, dtypes and non-null counts of the filtered frame.
df_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13268 entries, 46044 to 59311
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      13268 non-null  datetime64[ns]
 1   cost                      13268 non-null  float64       
 2   service_name              13268 non-null  float64       
 3   service_type              13268 non-null  float64       
 4   service_region            13268 non-null  float64       
 5   quantity                  13268 non-null  float64       
 6   mes                       13268 non-null  int32         
 7   semana_ano                13268 non-null  UInt32        
 8   ano_semana                13268 non-null  int64         
 9   ano_mes                   13268 non-null  int64         
 10  custo_fixo_diario         13268 non-null  float64       
 11  custo_soma_movel_semanal  13268 non-null  float64       
 12  custo_soma_movel_me

In [70]:
# Latest and earliest dates available, as a reference for the final
# train / test / validation cut-offs below.
print(df['date'].max(), df['date'].min(), sep='\n')

2024-10-30 00:00:00
2022-10-31 00:00:00


### 6.5.1 Treino

In [73]:
# Training window: every row dated up to and including 2024-08-30.
# NOTE(review): this slices `df` (all columns), not `df_filter`, so the
# column selection built above is not applied to the exported data --
# confirm whether that is intended.
X_train = df.loc[df['date'] <= '2024-08-30']

# The target keeps 'date' next to 'cost' so it can be aligned by day later.
y_train = X_train.loc[:, ['date', 'cost']]

print(X_train['date'].max(), X_train['date'].min(), sep='\n')

# Only the target is removed from the features; 'date' stays in X_train.
X_train = X_train.drop(columns='cost')

2024-08-30 00:00:00
2022-10-31 00:00:00


### 6.5.2 Teste

In [75]:
# Test window: rows dated after 2024-08-30 and up to 2024-09-30 inclusive.
# NOTE(review): this slices `df` (all columns), not `df_filter`, so the
# column selection built above is not applied to the exported data --
# confirm whether that is intended.
X_test = df.loc[(df['date'] > '2024-08-30') & (df['date'] <= '2024-09-30')]

# The target keeps 'date' next to 'cost' so it can be aligned by day later.
y_test = X_test.loc[:, ['date', 'cost']]

print(X_test['date'].max(), X_test['date'].min(), sep='\n')

# Only the target is removed from the features; 'date' stays in X_test.
X_test = X_test.drop(columns='cost')

2024-09-30 00:00:00
2024-08-31 00:00:00


### 6.5.3 Validação

In [76]:
# Validation window: every row dated after 2024-09-30.
# NOTE(review): this slices `df` (all columns), not `df_filter`, so the
# column selection built above is not applied to the exported data --
# confirm whether that is intended.
X_val = df.loc[df['date'] > '2024-09-30']

# The target keeps 'date' next to 'cost' so it can be aligned by day later.
y_val = X_val.loc[:, ['date', 'cost']]

print(X_val['date'].max(), X_val['date'].min(), sep='\n')

# Only the target is removed from the features; 'date' stays in X_val.
X_val = X_val.drop(columns='cost')

2024-10-30 00:00:00
2024-10-01 00:00:00


## 6.6 Export

In [78]:
# Persist each split under its own name in the processed-data directory
# (written in the same order as listed here).
conjuntos_para_exportar = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'X_val': X_val,
    'y_val': y_val,
}
for nome_conjunto, conjunto in conjuntos_para_exportar.items():
    exportar_df(conjunto, nome_conjunto)