# 0 Imports

In [43]:
import pickle
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

from matplotlib import pyplot as plt

from IPython.display        import Image
from IPython.core.display   import HTML

from sklearn.feature_selection  import RFE
from sklearn.linear_model       import LinearRegression
from sklearn.model_selection    import cross_val_score
from sklearn.metrics            import mean_squared_error
from lightgbm                   import LGBMRegressor

In [2]:
# Supressão da notação científica.
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', '{:.6f}'.format)

## 0.1 Funções Suporte

In [3]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [18,8]
    plt.rcParams['font.size'] = 14
    
    display( HTML('<style>.container { width: 100% !important;} </style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False)
    
    sns.set()
    
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [4]:
def abrir_arquivo()-> pd.DataFrame:
    with open(f"{__CAMINHO_INTERIM}{__NOME_ARQUIVO_IMPORTAR}","rb") as arquivo:
        df = pickle.load(arquivo)
    return df

In [5]:
def exportar_df(df: pd.DataFrame):
    df.to_pickle(f'{__CAMINHO_INTERIM}{__NOME_ARQUIVO_EXPORTAR}')

## 0.2 Load Data

In [6]:
__CAMINHO_RAW = '../data/raw/'
__CAMINHO_INTERIM = '../data/interim/'

__NOME_ARQUIVO_IMPORTAR = '5_0_preparacao_dados.pkl'
__NOME_ARQUIVO_EXPORTAR = '6_0_selecao_vars.pkl'

# 6.0 Seleção de Variáveis

In [26]:
df = abrir_arquivo()

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6947 entries, 52365 to 59311
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      6947 non-null   datetime64[ns]
 1   resource_guid             6947 non-null   float64       
 2   service_name              6947 non-null   float64       
 3   service_type              6947 non-null   float64       
 4   service_region            6947 non-null   float64       
 5   service_resource          6947 non-null   float64       
 6   quantity                  6947 non-null   float64       
 7   cost                      6947 non-null   float64       
 8   ano                       6947 non-null   int32         
 9   mes                       6947 non-null   int32         
 10  dia                       6947 non-null   int32         
 11  semana_ano                6947 non-null   UInt32        
 12  ano_semana          

## 6.1 Train Test

In [28]:
print(df['date'].max())
print(df['date'].min())

2024-10-30 00:00:00
2023-10-31 00:00:00


In [29]:
X_train = df[df['date'] <= '2024-09-30']
y_train = X_train['cost']
print(X_train['date'].max())
print(X_train['date'].min())


X_test = df[df['date'] > '2024-09-30']
y_test = X_test['cost']
print(X_test['date'].max())
print(X_test['date'].min()) 

2024-09-30 00:00:00
2023-10-31 00:00:00
2024-10-30 00:00:00
2024-10-01 00:00:00


In [30]:
X_train = X_train.drop(columns=['cost','date'])
X_test = X_test.drop(columns=['cost','date'])

In [36]:
# Inicialize o KNN Regressor
lr = LinearRegression()

# Aplicar RFE para selecionar as melhores features
rfe = RFE(estimator=lr)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Treine o modelo e avalie
lr.fit(X_train_rfe, y_train)
y_pred = lr.predict(X_test_rfe)

print("MSE:", mean_squared_error(y_test, y_pred))
print("Features Selecionadas:", np.where(rfe.support_)[0])

MSE: 0.07491966941452886
Features Selecionadas: [ 0  1  2  3  4  5  6  7  9 10 11 14 15]


In [44]:
# Crie e treine o modelo LGBMClassifier
lgb_model = LGBMRegressor(n_jobs=-1, random_state=42)
X_train_sel = X_train
lgb_model.fit(X_train_sel, y_train)

# Obtenha as importâncias das características do modelo
importances = lgb_model.feature_importances_

# Ordene as características por importância decrescente
indices = np.argsort(importances)[::-1]

# Imprima o ranking das características
print('Feature ranking')
df = pd.DataFrame()

for i, j in zip(X_train_sel.columns, importances):
    aux = pd.DataFrame({'feature': i, 'importance': j}, index=[0])
    df = pd.concat([df, aux], axis=0)

print(df.sort_values('importance', ascending=False))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000955 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1443
[LightGBM] [Info] Number of data points in the train set: 6377, number of used features: 27
[LightGBM] [Info] Start training from score 0.210779
Feature ranking
                    feature  importance
0                  quantity        1199
0         custo_fixo_diario         377
0              service_name         323
0  custo_soma_movel_semanal         220
0            service_region         174
0   custo_soma_movel_mensal         162
0                       dia         107
0              service_type          93
0                semana_ano          81
0                ano_semana          66
0        custo_fixo_semanal          46
0                   dia_cos          45
0                   dia_sen          19
0                  

In [45]:
from sklearn.ensemble import RandomForestRegressor

In [46]:
print("\
5. Random Forest Feature Importance:")
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
rf_importance

5. Random Forest Feature Importance:


Unnamed: 0,feature,importance
3,service_region,0.423031
1,service_name,0.308153
5,quantity,0.212667
2,service_type,0.044361
14,custo_fixo_diario,0.010389
9,semana_ano,0.000635
24,semana_ano_cos,0.000242
7,mes,0.00014
10,ano_semana,8.7e-05
20,mes_cos,5.4e-05


In [48]:
from boruta import BorutaPy

In [49]:
rf = RandomForestRegressor()
boruta = BorutaPy( rf, n_estimators = 'auto', verbose=2, random_state=42).fit( X_train, y_train)