# 0 Imports

In [1]:
import pickle
import datetime

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot

from IPython.display        import Image
from IPython.core.display   import HTML

In [2]:
# Supressão da notação científica.
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', '{:.6f}'.format)

## 0.1 Funções Suporte

In [3]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [12,6]
    plt.rcParams['font.size'] = 20
    
    display( HTML('<style>.container { width: 100% !important;} </style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False)
    
    sns.set()
    
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [4]:
def visualizacao_dados_categoricos(df):
    for i in df:
        print(f'Atributo: {i}') 
        print(f'Valores Únicos: {len(df[i].sort_values().unique())}')
        print(f'Valores Descritos: {df[i].sort_values().unique().tolist()}\n')

## 0.2 Load Data

In [5]:
with open("../data/interim/db_ajustado.pkl","rb") as arquivo:
    df = pickle.load(arquivo)
df.sample()

Unnamed: 0,subscription_name,subscription_guid,date,resource_guid,service_name,service_type,service_region,service_resource,quantity,cost,maquina,year,month,day,week_of_year,year_week,year_month
31435,Microsoft Azure Sponsorship,82375e17-0dda-4790-b6e8-c565d4f08e87,2023-11-21,42e2a22f-fabf-41ea-a21e-e86bb3ff5799,Storage,Standard Page Blob,All,Disk Read Operations,0.2884,9.6e-05,portal,2023,11,21,47,2023-47,2023-11


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56933 entries, 0 to 56932
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   subscription_name  56933 non-null  object        
 1   subscription_guid  56933 non-null  object        
 2   date               56933 non-null  datetime64[ns]
 3   resource_guid      56933 non-null  object        
 4   service_name       56933 non-null  object        
 5   service_type       56933 non-null  object        
 6   service_region     56933 non-null  object        
 7   service_resource   56933 non-null  object        
 8   quantity           56933 non-null  float64       
 9   cost               56933 non-null  float64       
 10  maquina            56933 non-null  object        
 11  year               56933 non-null  int32         
 12  month              56933 non-null  int32         
 13  day                56933 non-null  int32         
 14  week_o

## 0.3 Planejamento para criação de novas Features

# 3.0 F.E.

In [7]:
df.columns

Index(['subscription_name', 'subscription_guid', 'date', 'resource_guid',
       'service_name', 'service_type', 'service_region', 'service_resource',
       'quantity', 'cost', 'maquina', 'year', 'month', 'day', 'week_of_year',
       'year_week', 'year_month'],
      dtype='object')

In [8]:
colunas = ['service_name', 'service_type', 'service_region', 'service_resource','maquina']
colunas_categoricas = df.loc[:,colunas]

In [9]:
visualizacao_dados_categoricos(colunas_categoricas)

Atributo: service_name
Valores Únicos: 15
Valores Descritos: ['Automation', 'Azure App Service', 'Azure DNS', 'Azure Monitor', 'Backup', 'Bandwidth', 'Functions', 'Key Vault', 'Log Analytics', 'Logic Apps', 'Network Watcher', 'Storage', 'Virtual Machines', 'Virtual Machines Licenses', 'Virtual Network']

Atributo: service_type
Valores Únicos: 24
Valores Descritos: ['All', 'BS Series', 'BS Series Windows', 'Basv2 Series', 'Dv3/DSv3 Series', 'Files', 'Files v2', 'Free Plan', 'General Block Blob', 'IP Addresses', 'Inter-Region', 'Premium', 'Premium Page Blob', 'Premium SSD Managed Disks', 'Private Link', 'Process', 'Queues v2', 'SQL Server Azure Hybrid Benefit', 'SQL Server Developer Edition', 'Standard HDD Managed Disks', 'Standard Page Blob', 'Standard SSD Managed Disks', 'Tables', 'Tiered Block Blob']

Atributo: service_region
Valores Únicos: 7
Valores Descritos: ['All', 'BR South', 'Global', 'Intercontinental', 'North America', 'South America', 'US East']

Atributo: service_resource
V

In [10]:
df2 = df.copy()
df2 = df2.sort_values(by=['maquina', 'date']) #Garantir a sequencia ordenada de máquina e data

In [11]:
df2['dia_da_semana'] = df2['date'].dt.dayofweek.astype(int)

In [12]:
df2['fim_de_semana'] = df2['dia_da_semana'].apply(lambda x : 1 if x >= 5 else 0)

In [13]:
df2['custo_diario'] = df2.groupby(['maquina','date'])['cost'].transform('sum')

In [14]:
df2['custo_media_movel_semanal'] = df2.groupby(
                                    ['maquina']
                                    )['cost'].transform(
                                                lambda x: x.rolling(
                                                    window=7,
                                                    min_periods=1
                                                    ).sum()
                                                )

In [19]:
df2['custo_media_movel_semanal_teste'] = df2.groupby(
                                                    ['maquina']
                                                    )['cost'].transform(
                                                                lambda x: x.rolling(
                                                                    window=7,
                                                                    min_periods=1
                                                                    ).sum()
                                                                ).reset_index(
                                                                    level=0,
                                                                    drop=True
                                                                    )

In [47]:
df2['custo_media_movel_semanal_data'] = df2.groupby(
                                            ['maquina',
                                            'date'])['cost'].transform(
                                                        lambda x: x.rolling(
                                                            window=7,
                                                            min_periods=1
                                                            ).sum()
                                                        )

In [15]:
df2['custo_media_movel_mensal'] = df2.groupby(
                                    ['maquina']
                                    )['cost'].transform(
                                                lambda x: x.rolling(
                                                    window=30,
                                                    min_periods=1
                                                    ).sum()
                                                )

In [48]:
df2['custo_media_movel_mensal_date'] = df2.groupby(
                                            ['maquina',
                                            'date']
                                            )['cost'].transform(
                                                lambda x: x.rolling(
                                                    window=30,
                                                    min_periods=1
                                                    ).sum()
                                                )

In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56933 entries, 0 to 56272
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   subscription_name          56933 non-null  object        
 1   subscription_guid          56933 non-null  object        
 2   date                       56933 non-null  datetime64[ns]
 3   resource_guid              56933 non-null  object        
 4   service_name               56933 non-null  object        
 5   service_type               56933 non-null  object        
 6   service_region             56933 non-null  object        
 7   service_resource           56933 non-null  object        
 8   quantity                   56933 non-null  float64       
 9   cost                       56933 non-null  float64       
 10  maquina                    56933 non-null  object        
 11  year                       56933 non-null  int32         
 12  month    

In [20]:
aux= df2.loc[(df2['maquina']=='unicin') & (df2['date']=='2024-09-12'),
        ['date',
        'maquina',
        'cost',
        'custo_diario',
        'custo_media_movel_semanal',
        'custo_media_movel_semanal_teste']]
aux

Unnamed: 0,date,maquina,cost,custo_diario,custo_media_movel_semanal,custo_media_movel_semanal_teste
32106,2024-09-12,unicin,0.105192,5.485406,0.954624,0.170349
33048,2024-09-12,unicin,0.0,5.485406,0.714624,3.083728
35513,2024-09-12,unicin,0.010008,5.485406,0.575304,0.330571
36516,2024-09-12,unicin,0.281146,5.485406,0.69517,0.16332
37519,2024-09-12,unicin,0.18365,5.485406,0.58014,0.16332
38094,2024-09-12,unicin,0.0864,5.485406,0.666396,2.403238
39088,2024-09-12,unicin,0.0,5.485406,0.666396,0.877558
39995,2024-09-12,unicin,1.452,5.485406,2.013204,3.12925
41474,2024-09-12,unicin,0.098352,5.485406,2.111556,2.486642
43998,2024-09-12,unicin,0.0,5.485406,2.101548,0.002613


In [21]:
aux['cost'].sum()

5.485406

In [22]:
aux_df = df2.loc[(df2['maquina']=='unicin') & (df2['date']>='2024-09-12'),
        ['date',
        'maquina',
        'cost',
        'custo_media_movel_semanal',
        'custo_media_movel_mensal']
        ].groupby(['maquina','date']).sum().sort_values(by='date',ascending=False).reset_index()
aux_df

Unnamed: 0,maquina,date,cost,custo_media_movel_semanal,custo_media_movel_mensal
0,unicin,2024-10-11,4.242862,30.172204,154.598523
1,unicin,2024-10-10,5.6293,39.4051,169.039118
2,unicin,2024-10-09,5.64062,39.48434,168.661696
3,unicin,2024-10-08,5.600468,39.203276,164.710589
4,unicin,2024-10-07,5.365676,37.559732,160.256533
5,unicin,2024-10-06,5.314549,37.201843,159.765826
6,unicin,2024-10-05,5.361065,37.527455,158.739095
7,unicin,2024-10-04,5.23878,36.67146,158.476426
8,unicin,2024-10-03,5.283107,36.981749,240.680721
9,unicin,2024-10-02,11.152205,78.065435,252.981018


In [23]:
aux_df.loc[:,'cost'].sum()

167.62592

In [24]:
aux_df.loc[aux_df['date']>='2024-10-05',['cost']].sum()

cost   37.154540
dtype: float64

In [52]:
aux_df = df2.loc[(df2['maquina']=='unicin') & (df2['date']>='2024-09-12'),
        ['date',
        'maquina',
        'custo_diario',
        'custo_media_movel_semanal_data',
        'custo_media_movel_mensal_date']
        ].groupby(['maquina','date']).sum().sort_values(by='date',ascending=False).reset_index()

In [53]:
aux_df

Unnamed: 0,maquina,date,custo_diario,custo_media_movel_semanal_data,custo_media_movel_mensal_date
0,unicin,2024-10-11,80.614378,28.002556,39.546997
1,unicin,2024-10-10,106.9567,37.235452,53.827468
2,unicin,2024-10-09,107.17178,37.314692,54.006953
3,unicin,2024-10-08,106.408892,37.033628,53.3593
4,unicin,2024-10-07,101.947844,35.390084,49.618988
5,unicin,2024-10-06,100.976431,35.032195,48.798934
6,unicin,2024-10-05,101.860235,35.357807,49.547159
7,unicin,2024-10-04,99.53682,34.501812,48.289354
8,unicin,2024-10-03,100.379033,34.812101,48.306138
9,unicin,2024-10-02,211.891895,75.895787,142.191567


In [58]:
df2.sample(5)

Unnamed: 0,subscription_name,subscription_guid,date,resource_guid,service_name,service_type,service_region,service_resource,quantity,cost,maquina,year,month,day,week_of_year,year_week,year_month,dia_da_semana,custo_diario,custo_media_movel_semanal,custo_media_movel_mensal,fim_de_semana,custo_media_movel_semanal_data,custo_media_movel_mensal_date
22811,Microsoft Azure Sponsorship,0b1856f5-ffb1-4fb2-9b31-ebc3ddeacf68,2024-03-03,3f2b1e1c-c886-4ec6-ad6f-dd0ef38819c9,Storage,Tables,All,LRS Data Stored,0.000144,0.0,ceaec,2024,3,3,9,2024-09,2024-03,6,6.367541,0.0,0.0,1,1.980491,6.367517
17908,Microsoft Azure Sponsorship,0b1856f5-ffb1-4fb2-9b31-ebc3ddeacf68,2023-07-09,a97f31a2-0fb1-4d64-bb43-a2399dd31f0b,Storage,Standard SSD Managed Disks,US East,E10 LRS Disk,0.032256,0.309648,ceaec,2023,7,9,27,2023-27,2023-07,6,9.149612,2.167536,9.507168,1,2.347032,7.318307
18252,Microsoft Azure Sponsorship,0b1856f5-ffb1-4fb2-9b31-ebc3ddeacf68,2022-04-06,28339581-90e9-4bde-93d2-a61b4f2a0b5e,Storage,Standard SSD Managed Disks,BR South,E10 LRS Disk,0.100008,2.073744,ceaec,2022,4,6,14,2022-14,2022-04,2,11.592327,14.449032,60.683703,0,2.624848,11.494959
31258,Microsoft Azure Sponsorship,82375e17-0dda-4790-b6e8-c565d4f08e87,2024-06-11,cbe92596-7f61-5f48-ba3d-d624aa141eaf,Backup,All,US East,Azure VM Protected Instances,0.032256,0.322584,portal,2024,6,11,24,2024-24,2024-06,1,7.907837,2.258088,9.67752,0,5.235928,7.907837
8475,Microsoft Azure Sponsorship,0b1856f5-ffb1-4fb2-9b31-ebc3ddeacf68,2022-08-13,416a0a14-9d61-438f-a875-779e7b0e38c3,Log Analytics,All,BR South,Pay-as-you-go Data Ingestion,5.8e-05,0.000268,ceaec,2022,8,13,32,2022-32,2022-08,5,11.348514,0.001878,0.008034,1,2.964145,6.372193


# 3.9 Export PKL

In [None]:
with open("../data/interim/3_0_feature_engineering.pkl","wb") as arquivo:
    pickle.dump(df, arquivo)