# 0 Imports

In [1]:
import pickle
import datetime

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot

from IPython.display        import Image
from IPython.core.display   import HTML

In [2]:
# Supressão da notação científica.
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', '{:.6f}'.format)

## 0.1 Funções Suporte

In [3]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [12,6]
    plt.rcParams['font.size'] = 20
    
    display( HTML('<style>.container { width: 100% !important;} </style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False)
    
    sns.set()
    
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [4]:
def visualizacao_dados_categoricos(df):
    for i in df:
        print(f'Atributo: {i}') 
        print(f'Valores Únicos: {len(df[i].sort_values().unique())}')
        print(f'Valores Descritos: {df[i].sort_values().unique().tolist()}\n')

## 0.2 Load Data

In [5]:
with open("../data/interim/db_ajustado.pkl","rb") as arquivo:
    df = pickle.load(arquivo)
df.sample()

Unnamed: 0,subscription_name,subscription_guid,date,resource_guid,service_name,service_type,service_region,service_resource,quantity,cost,maquina,year,month,day,week_of_year,year_week,year_month
31435,Microsoft Azure Sponsorship,82375e17-0dda-4790-b6e8-c565d4f08e87,2023-11-21,42e2a22f-fabf-41ea-a21e-e86bb3ff5799,Storage,Standard Page Blob,All,Disk Read Operations,0.2884,9.6e-05,portal,2023,11,21,47,2023-47,2023-11


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56933 entries, 0 to 56932
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   subscription_name  56933 non-null  object        
 1   subscription_guid  56933 non-null  object        
 2   date               56933 non-null  datetime64[ns]
 3   resource_guid      56933 non-null  object        
 4   service_name       56933 non-null  object        
 5   service_type       56933 non-null  object        
 6   service_region     56933 non-null  object        
 7   service_resource   56933 non-null  object        
 8   quantity           56933 non-null  float64       
 9   cost               56933 non-null  float64       
 10  maquina            56933 non-null  object        
 11  year               56933 non-null  int32         
 12  month              56933 non-null  int32         
 13  day                56933 non-null  int32         
 14  week_o

## 0.3 Planejamento para criação de novas Features

# 3.0 F.E.

In [7]:
df.columns

Index(['subscription_name', 'subscription_guid', 'date', 'resource_guid',
       'service_name', 'service_type', 'service_region', 'service_resource',
       'quantity', 'cost', 'maquina', 'year', 'month', 'day', 'week_of_year',
       'year_week', 'year_month'],
      dtype='object')

In [8]:
colunas = ['service_name', 'service_type', 'service_region', 'service_resource','maquina']
colunas_categoricas = df.loc[:,colunas]

In [9]:
visualizacao_dados_categoricos(colunas_categoricas)

Atributo: service_name
Valores Únicos: 15
Valores Descritos: ['Automation', 'Azure App Service', 'Azure DNS', 'Azure Monitor', 'Backup', 'Bandwidth', 'Functions', 'Key Vault', 'Log Analytics', 'Logic Apps', 'Network Watcher', 'Storage', 'Virtual Machines', 'Virtual Machines Licenses', 'Virtual Network']

Atributo: service_type
Valores Únicos: 24
Valores Descritos: ['All', 'BS Series', 'BS Series Windows', 'Basv2 Series', 'Dv3/DSv3 Series', 'Files', 'Files v2', 'Free Plan', 'General Block Blob', 'IP Addresses', 'Inter-Region', 'Premium', 'Premium Page Blob', 'Premium SSD Managed Disks', 'Private Link', 'Process', 'Queues v2', 'SQL Server Azure Hybrid Benefit', 'SQL Server Developer Edition', 'Standard HDD Managed Disks', 'Standard Page Blob', 'Standard SSD Managed Disks', 'Tables', 'Tiered Block Blob']

Atributo: service_region
Valores Únicos: 7
Valores Descritos: ['All', 'BR South', 'Global', 'Intercontinental', 'North America', 'South America', 'US East']

Atributo: service_resource
V

In [10]:
df2 = df.copy()
df2 = df2.sort_values(by=['maquina', 'date']) #Garantir a sequencia ordenada de máquina e data

In [11]:
df2['dia_da_semana'] = df2['date'].dt.dayofweek.astype(int)

In [12]:
df2['fim_de_semana'] = df2['dia_da_semana'].apply(lambda x : 1 if x >= 5 else 0)

In [13]:
df2['custo_diario'] = df2.groupby(['maquina','date'])['cost'].transform('sum')

In [14]:
df2['custo_media_movel_semanal'] = df2.groupby(
                                    ['maquina']
                                    )['cost'].transform(
                                                lambda x: x.rolling(
                                                    window=7,
                                                    min_periods=1
                                                    ).sum()
                                                )

In [19]:
df2['custo_media_movel_semanal_teste'] = df2.groupby(
                                                    ['maquina']
                                                    )['cost'].transform(
                                                                lambda x: x.rolling(
                                                                    window=7,
                                                                    min_periods=1
                                                                    ).sum()
                                                                ).reset_index(
                                                                    level=0,
                                                                    drop=True
                                                                    )

In [47]:
df2['custo_media_movel_semanal_data'] = df2.groupby(
                                            ['maquina',
                                            'date'])['cost'].transform(
                                                        lambda x: x.rolling(
                                                            window=7,
                                                            min_periods=1
                                                            ).sum()
                                                        )

In [40]:
df2['custo_media_movel_mensal'] = df2.groupby(
                                    ['maquina']
                                    )['cost'].transform(
                                                lambda x: x.rolling(
                                                    window=30,
                                                    min_periods=1
                                                    ).sum()
                                                )

In [49]:
aux2 = df2.loc[(df2['date']>='2024-09-12') & (df2['maquina'] == 'ceaec'),:].groupby(['maquina','date'])['cost'].sum().reset_index()

In [50]:
aux2['cost'].sum()

208.966287

In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56933 entries, 0 to 56272
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   subscription_name          56933 non-null  object        
 1   subscription_guid          56933 non-null  object        
 2   date                       56933 non-null  datetime64[ns]
 3   resource_guid              56933 non-null  object        
 4   service_name               56933 non-null  object        
 5   service_type               56933 non-null  object        
 6   service_region             56933 non-null  object        
 7   service_resource           56933 non-null  object        
 8   quantity                   56933 non-null  float64       
 9   cost                       56933 non-null  float64       
 10  maquina                    56933 non-null  object        
 11  year                       56933 non-null  int32         
 12  month    

In [52]:
aux= df2.loc[(df2['maquina']=='unicin') & (df2['date']=='2024-10-11'),
        ['date',
        'maquina',
        'cost',
        'custo_diario',
        'custo_media_movel_semanal',
        'custo_media_movel_semanal_teste']]
aux

Unnamed: 0,date,maquina,cost,custo_diario,custo_media_movel_semanal,custo_media_movel_semanal_teste
32135,2024-10-11,unicin,0.080598,4.242862,0.915558,0.163302
33077,2024-10-11,unicin,0.0,4.242862,0.675558,1.266491
35542,2024-10-11,unicin,0.007657,4.242862,0.538687,0.163251
36545,2024-10-11,unicin,0.267403,4.242862,0.64481,2.152373
37548,2024-10-11,unicin,0.148602,4.242862,0.504404,2.219016
38123,2024-10-11,unicin,0.0684,4.242862,0.57266,2.330395
39117,2024-10-11,unicin,0.0,4.242862,0.57266,0.182758
40024,2024-10-11,unicin,1.089,4.242862,1.581062,4.878868
41503,2024-10-11,unicin,0.073764,4.242862,1.654826,2.253994
44027,2024-10-11,unicin,0.0,4.242862,1.647169,0.002252


In [21]:
aux['cost'].sum()

5.485406

In [41]:
aux_df = df2.loc[(df2['maquina']=='unicin') & (df2['date']>='2024-09-12'),
        ['date',
        'maquina',
        'cost',
        'custo_media_movel_semanal',
        'custo_media_movel_mensal']
        ].groupby(['maquina','date']).sum().sort_values(by='date',ascending=False).reset_index()
aux_df

Unnamed: 0,maquina,date,cost,custo_media_movel_semanal,custo_media_movel_mensal
0,unicin,2024-10-11,4.242862,30.172204,5.153284
1,unicin,2024-10-10,5.6293,39.4051,5.634637
2,unicin,2024-10-09,5.64062,39.48434,5.622057
3,unicin,2024-10-08,5.600468,39.203276,5.490353
4,unicin,2024-10-07,5.365676,37.559732,5.341884
5,unicin,2024-10-06,5.314549,37.201843,5.325528
6,unicin,2024-10-05,5.361065,37.527455,5.291303
7,unicin,2024-10-04,5.23878,36.67146,5.282548
8,unicin,2024-10-03,5.283107,36.981749,8.022691
9,unicin,2024-10-02,11.152205,78.065435,8.432701


In [38]:
aux_df.loc[:,'cost'].sum()

167.62592

In [39]:
aux_df.loc[aux_df['date']>='2024-10-05',['cost']].sum()

cost   37.154540
dtype: float64

In [36]:
df2.sample(5)

Unnamed: 0,subscription_name,subscription_guid,date,resource_guid,service_name,service_type,service_region,service_resource,quantity,cost,maquina,year,month,day,week_of_year,year_week,year_month,dia_da_semana,fim_de_semana,custo_diario,custo_media_movel_semanal,custo_media_movel_mensal,custo_media_movel_semanal_teste
27198,Microsoft Azure Sponsorship,82375e17-0dda-4790-b6e8-c565d4f08e87,2023-10-03,c80a3636-2edb-4248-bcb1-04ef818a75ac,Storage,Standard Page Blob,All,Disk Write Operations,0.1441,4.8e-05,portal,2023,10,3,40,2023-40,2023-10,1,0,6.169693,4.847763,11.416634,0.172795
29075,Microsoft Azure Sponsorship,82375e17-0dda-4790-b6e8-c565d4f08e87,2024-08-07,fec1d8d1-cc81-4d54-a391-83e228df4928,Storage,Premium SSD Managed Disks,BR South,P4 LRS Disk,0.032256,0.294192,portal,2024,8,7,32,2024-32,2024-08,2,0,10.258578,6.149229,30.166377,0.599258
52377,Microsoft Azure Sponsorship,15dc64f3-696a-48fc-9169-8467e3f7bba0,2023-04-18,3f2b1e1c-c886-4ec6-ad6f-dd0ef38819c9,Storage,Tables,All,LRS Data Stored,0.00012,0.0,unicin,2023,4,18,16,2023-16,2023-04,1,0,5.119435,2.505862,7.639146,3.367156
39296,Microsoft Azure Sponsorship,15dc64f3-696a-48fc-9169-8467e3f7bba0,2022-04-07,39edab1a-b691-4088-9bb2-6ffe51d91da4,Storage,Premium SSD Managed Disks,US East,P6 LRS Disk,0.033336,0.340272,unicin,2022,4,7,14,2022-14,2022-04,3,0,8.576854,1.145734,8.574713,0.002046
10238,Microsoft Azure Sponsorship,0b1856f5-ffb1-4fb2-9b31-ebc3ddeacf68,2022-02-17,ae331802-83a5-4b9e-b287-85675f794e3d,Virtual Machines,Dv3/DSv3 Series,US East,D2 v3/D2s v3,24.0,2.304,ceaec,2022,2,17,7,2022-07,2022-02,3,0,11.980041,2.527999,14.328645,1.889398


# 3.9 Export PKL

In [None]:
with open("../data/interim/3_0_feature_engineering.pkl","wb") as arquivo:
    pickle.dump(df, arquivo)