# Imports

In [39]:
import os
import pickle
import datetime
import inflection

# import numpy    as np
import pandas   as pd
# import seaborn  as sns

# from matplotlib import pyplot as plt

# from IPython.display        import Image
# from IPython.core.display   import HTML

# Métodos de Suporte

In [40]:
def importar_arquivos()-> list:
    """
    List the CSV files available in the raw-data directory.

    Only the file names are returned (not full paths); the files themselves
    are not opened here. The directory is taken from the module-level
    ``__CAMINHO_RAW`` constant.

    Returns:
        The names ending in '.csv', sorted alphabetically. ``os.listdir``
        yields entries in arbitrary order, so sorting keeps any caller that
        processes the files in list order reproducible across runs and
        machines.
    """
    return sorted(
        arquivo
        for arquivo in os.listdir(__CAMINHO_RAW)
        if arquivo.endswith('.csv')
    )

In [41]:
def mundaca_tipo_date(df: pd.DataFrame)-> pd.DataFrame:
    """
    Convert the 'date' column to pandas datetime values.

    The converted column is written back into the DataFrame that was passed
    in, and that same object is returned.
    """
    datas_convertidas = pd.to_datetime(df['date'])
    df['date'] = datas_convertidas
    return df

In [42]:
def novas_colunas_date(df:pd.DataFrame)-> pd.DataFrame:
    """
    Add calendar columns derived from the 'date' column.

    The columns are appended to the received DataFrame (which is also the
    return value) in this order: year, month, day, week_of_year, year_week,
    year_month, data_br.
    """
    datas = df['date'].dt

    # NOTE(review): 'week_of_year' is the ISO week, whereas 'year_week' is
    # built with strftime '%W' (Monday-based, days before the first Monday
    # fall in week 00), so the two can disagree near year boundaries.
    derivadas = {
        'year': datas.year,
        'month': datas.month,
        'day': datas.day,
        'week_of_year': datas.isocalendar().week,
        'year_week': datas.strftime('%Y-%W'),
        'year_month': datas.strftime('%Y-%m'),
        # Day/month/year text, the usual Brazilian date layout.
        'data_br': datas.strftime('%d/%m/%Y'),
    }

    for coluna, valores in derivadas.items():
        df[coluna] = valores

    return df

In [43]:
def exportar_df(df: pd.DataFrame, caminho: str = '../data/external/db_ajustado.pkl') -> None:
    """
    Serialize the DataFrame to a pickle file.

    Args:
        df: DataFrame to persist.
        caminho: Destination file path. Defaults to the location that used
            to be hard-coded, so existing calls such as
            ``exportar_df(df=df)`` keep writing to the same file.
    """
    df.to_pickle(caminho)

In [44]:
def renomear_colunas(df: pd.DataFrame)-> pd.DataFrame:
    """
    Rename every column by passing its name through ``inflection.underscore``.

    The column labels of the received DataFrame are replaced in place and
    the same object is returned.
    """
    df.columns = [inflection.underscore(nome) for nome in df.columns]
    return df

In [45]:
def mesclar_arquivos(lista_arquivos: list,)-> pd.DataFrame:
    """
    Read each CSV in ``lista_arquivos`` and stack them into one DataFrame.

    Every file gets a 'maquina' column, taken from the second
    underscore-separated piece of its file name, so rows coming from
    different sources stay distinguishable during analysis. Column names are
    normalised with ``renomear_colunas`` and 'date' is converted with
    ``mundaca_tipo_date``.

    When a file belongs to a 'maquina' that is already present in the
    accumulated data, only its rows dated strictly after the latest date
    already stored for that 'maquina' are appended.

    Returns:
        The combined DataFrame; an empty DataFrame when no file is given.
    """
    acumulado = pd.DataFrame()

    for nome_arquivo in lista_arquivos:
        atual = pd.read_csv(os.path.join(__CAMINHO_RAW, nome_arquivo))
        maquina = nome_arquivo.split('_')[1]
        atual['maquina'] = maquina
        atual = mundaca_tipo_date(renomear_colunas(atual))

        # Nothing accumulated yet: this file becomes the starting point.
        if acumulado.empty:
            acumulado = atual
            continue

        mesma_maquina = acumulado['maquina'] == maquina
        if mesma_maquina.any():
            # Keep only the rows newer than what is already stored.
            ultima_data = acumulado.loc[mesma_maquina, 'date'].max()
            atual = atual.loc[atual['date'] > ultima_data]

        acumulado = pd.concat([acumulado, atual], ignore_index=True)

    return acumulado

# Load Data

In [61]:
# Directory holding the raw CSV files; read by importar_arquivos and mesclar_arquivos.
__CAMINHO_RAW = '../data/raw/'
# Alternative absolute location, kept disabled:
#__CAMINHO_RAW = 'E:/4_arquivos/1_projeto/modelo_unicin/src/data/raw'

# NOTE(review): not referenced anywhere in the visible code; exportar_df
# writes under '../data/external/' instead — confirm which directory is intended.
__CAMINHO_INTERIM = '../data/processed'


In [62]:
# Collect the names of the CSV files found in the raw-data directory.
lista_arquivos = importar_arquivos()

In [63]:
# Read every listed CSV and combine them into a single DataFrame tagged by 'maquina'.
df = mesclar_arquivos(lista_arquivos=lista_arquivos)

In [64]:
# Show one randomly chosen row as a quick sanity check of the merged data.
df.sample()

Unnamed: 0,subscription_name,subscription_guid,date,resource_guid,service_name,service_type,service_region,service_resource,quantity,cost,maquina
35380,Microsoft Azure Sponsorship,15dc64f3-696a-48fc-9169-8467e3f7bba0,2024-05-02,ed8a651a-e0a3-4de6-a8ae-3b4ce8cb72cf,Storage,Files,All,LRS Data Stored,0.16128,0.009672,unicin


In [65]:
# Print the column names, non-null counts and dtypes of the merged data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56933 entries, 0 to 56932
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   subscription_name  56933 non-null  object        
 1   subscription_guid  56933 non-null  object        
 2   date               56933 non-null  datetime64[ns]
 3   resource_guid      56933 non-null  object        
 4   service_name       56933 non-null  object        
 5   service_type       56933 non-null  object        
 6   service_region     56933 non-null  object        
 7   service_resource   56933 non-null  object        
 8   quantity           56933 non-null  float64       
 9   cost               56933 non-null  float64       
 10  maquina            56933 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(8)
memory usage: 4.8+ MB


# Tratamento

In [66]:
# Count the missing values in each column.
df.isnull().sum()

subscription_name    0
subscription_guid    0
date                 0
resource_guid        0
service_name         0
service_type         0
service_region       0
service_resource     0
quantity             0
cost                 0
maquina              0
dtype: int64

In [67]:
# Add the calendar columns (year, month, day, week, ...) derived from 'date'.
df = novas_colunas_date(df)

In [68]:
# Inspect the schema again now that the derived date columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56933 entries, 0 to 56932
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   subscription_name  56933 non-null  object        
 1   subscription_guid  56933 non-null  object        
 2   date               56933 non-null  datetime64[ns]
 3   resource_guid      56933 non-null  object        
 4   service_name       56933 non-null  object        
 5   service_type       56933 non-null  object        
 6   service_region     56933 non-null  object        
 7   service_resource   56933 non-null  object        
 8   quantity           56933 non-null  float64       
 9   cost               56933 non-null  float64       
 10  maquina            56933 non-null  object        
 11  year               56933 non-null  int32         
 12  month              56933 non-null  int32         
 13  day                56933 non-null  int32         
 14  week_o

# Export Data

In [69]:
# Persist the final DataFrame as a pickle file.
exportar_df(df=df)