# **IMPORTS/CONFIGURAÇÕES**

In [1]:
# Bibliotecas de manipulação
import numpy as np  # Manipulação de matrizes
import pandas as pd  # Manipulação de dados tabulares
# Bibliotecas de visualização gráfica
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Bibliotecas próprias
from PROJECT_LIBRARY.Data_extraction import *  # Biblioteca própria para extração dos dados
from PROJECT_LIBRARY.Data_transformation import *  # Biblioteca própria para transformação dos dados
# Biblioteca de filtro de notificações
import warnings



# Notebook-wide settings: silence every warning, and make pandas display all
# columns, untruncated cell text, and floats with thousands separators and
# two decimal places.
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:,.2f}'.format)

In [2]:
# Call the project helper (not defined in this notebook, so it must come from
# one of the PROJECT_LIBRARY star imports) with the raw-data folder.
# NOTE(review): presumably refreshes the datasets from the files under
# ./RAW_DATAS — its behavior lives in PROJECT_LIBRARY; confirm there.
upgrade_data(fold='./RAW_DATAS')

# **ANÁLISE DESCRITIVA**

In [3]:
# Read the dataset from its parquet file and preview the first five rows.
parquet_path = './DATASETS/finally_data.parquet'
df = pd.read_parquet(parquet_path)
df.head()

Unnamed: 0,COMPETÊNCIA,UF,AJUSTES,ESFERA,EC123,COUN,VAAF,VAAR,VAAT,FPE,FPM,ICMS,IPI,IPVA,ITCMD,ITR,LC8796,Ajuste,TOTAL
0,2007-01-01,AC,False,ESTADUAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2007-01-01,AC,False,ESTADUAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3548.46,0.0,0.0,0.0,0.0,0.0,3548.46
5,2007-01-01,AC,False,ESTADUAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3147424.17,0.0,0.0,0.0,0.0,0.0,0.0,3147424.17
6,2007-01-01,AC,False,ESTADUAL,0.0,0.0,0.0,0.0,0.0,0.0,1582026.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1582026.94
7,2007-01-01,AC,False,ESTADUAL,0.0,0.0,0.0,0.0,0.0,9679990.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9679990.36


In [4]:
# Print the frame's structure: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110854 entries, 0 to 239098
Data columns (total 19 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   COMPETÊNCIA  110854 non-null  datetime64[ns]
 1   UF           110854 non-null  object        
 2   AJUSTES      110854 non-null  bool          
 3   ESFERA       110854 non-null  object        
 4   EC123        110854 non-null  float64       
 5   COUN         110854 non-null  float64       
 6   VAAF         110854 non-null  float64       
 7   VAAR         110854 non-null  float64       
 8   VAAT         110854 non-null  float64       
 9   FPE          110854 non-null  float64       
 10  FPM          110854 non-null  float64       
 11  ICMS         110854 non-null  float64       
 12  IPI          110854 non-null  float64       
 13  IPVA         110854 non-null  float64       
 14  ITCMD        110854 non-null  float64       
 15  ITR          110854 non-null  float64  

In [5]:
# Memory footprint in bytes of the index and of each column.
# NOTE: called without deep=True, so the object columns (UF, ESFERA) are
# measured shallowly — the string contents are not part of these figures.
df.memory_usage()

Index          886832
COMPETÊNCIA    886832
UF             886832
AJUSTES        110854
ESFERA         886832
EC123          886832
COUN           886832
VAAF           886832
VAAR           886832
VAAT           886832
FPE            886832
FPM            886832
ICMS           886832
IPI            886832
IPVA           886832
ITCMD          886832
ITR            886832
LC8796         886832
Ajuste         886832
TOTAL          886832
dtype: int64

## VARIÁVEIS NUMÉRICAS

In [6]:
# Names of every numeric column.
# 'number' selects all numeric dtypes; the earlier ['int', 'float'] pair only
# matched the platform-default integer and float64, so columns stored with any
# other width would have been silently left out.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
numeric_columns

['EC123',
 'COUN',
 'VAAF',
 'VAAR',
 'VAAT',
 'FPE',
 'FPM',
 'ICMS',
 'IPI',
 'IPVA',
 'ITCMD',
 'ITR',
 'LC8796',
 'Ajuste',
 'TOTAL']

In [7]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric
# columns and the COMPETÊNCIA datetime column.
df.describe()

Unnamed: 0,COMPETÊNCIA,EC123,COUN,VAAF,VAAR,VAAT,FPE,FPM,ICMS,IPI,IPVA,ITCMD,ITR,LC8796,Ajuste,TOTAL
count,110854,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0,110854.0
mean,2015-09-02 06:49:36.152417024,6527.04,1263089.4,713479.57,17131.01,279341.67,2556545.49,2675453.43,12731139.41,145079.99,1182879.24,203124.22,33538.59,41264.2,0.0,21848593.4
min,2007-01-01 00:00:00,0.0,-213329018.65,-177096290.9,0.0,-4017062.82,-13294613.27,-13760903.53,-33113879.07,-941954.25,-11514016.68,-692836.03,-40388.14,0.0,-54682223.97,-213329017.65
25%,2011-05-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4826.85
50%,2015-09-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,810777.05
75%,2020-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13642477.45
max,2024-12-01 00:00:00,121893595.92,777405463.2,891910015.1,41131672.03,590990738.15,250779967.56,283833187.85,2431147109.5,19202111.15,1177690607.31,89383539.05,31457569.15,29611036.12,54682223.94,2431147109.5
std,,571934.63,13839316.74,12976997.52,392404.07,6430150.62,10743007.31,12914324.79,81462957.65,862845.31,14219152.29,1812087.56,516229.81,365372.28,328981.7,85023803.94


### Tratamentos

In [20]:
# Constant-column check
# cons = df.nunique()
# cons.loc[cons.values==1].index.to_list()
# cons.loc[]
# - (No constant columns were found in the data)

# Duplicate check
# df.loc[df.duplicated()]
# - (No duplicated records were found in the data)

# Missing-value check
# df.loc[df.isnull().any(axis=1)]
# - (No records with missing values were found in the data)

# Inconsistent or dirty values
# - Negative values were found; they represent adjustments and are not irregularities
# df.loc[(df['TOTAL LIQUIDO'] < 0) & (df['CATEGORIA']!='Outros')]
# df.loc[(df['ESFERA']=='Estadual') & (df['UF']=='PB') & (df['COMPETÊNCIA']>='2023-01-01') & (df['COMPETÊNCIA']<='2023-12-31')]['TOTAL AJUSTE'].sum()
# - The reason for these adjustments is unknown; however, for the 'total de ajuste' and 'total liquido' amounts to be correct, they must be kept as negative values
# NOTE(review): the commented-out checks reference 'TOTAL LIQUIDO', 'CATEGORIA' and
# 'TOTAL AJUSTE', which are not among the columns df shows when loaded in this
# notebook — they look like leftovers from an earlier schema; confirm before re-enabling.

### Distribuição

In [None]:
# Distribution of the per-record total, one panel per ESFERA value.
# The frame loaded in this notebook stores the total in 'TOTAL'; it has no
# 'TOTAL LIQUIDO' column, so the previous x= could not be resolved.
sns.displot(
    data=df,
    x='TOTAL',
    col='ESFERA',
)

## VARIÁVEIS CATEGÓRICAS

In [None]:
cat_vars = df.select_dtypes(include=['category']).columns.to_list()
cat_vars

In [None]:
df.describe(include='category')

In [None]:
for cat in cat_vars:
    print(cat, sorted(df[cat].unique().to_list()))

x - Distribuição dos valores mensais (boxplot)

In [None]:
# Box plot of the monthly 'TOTAL LIQUIDO' sums, with every month overlaid as a point.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
# numeric_only=True keeps text columns out of the aggregation, as the other
# resample cells already do. 'MS' (month-start bins) is the rule the 2023 cells
# use; the lowercase 'm' alias is deprecated in recent pandas releases.
df_temp = df_temp.resample('MS').sum(numeric_only=True)

fig = px.box(data_frame=df_temp, x=['TOTAL LIQUIDO'], points='all', hover_name=df_temp.index)
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='y', xaxis_tickformat=',.2f')
fig.show()

x - Valor total dos repasses por ano

In [None]:
# Line chart of the yearly 'TOTAL LIQUIDO' sums.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
# 'YS' (year-start bins) replaces the lowercase 'y' alias, which is deprecated
# in recent pandas releases; only index.year is plotted, so the chart is unchanged.
df_temp = df_temp.resample(rule='YS').sum(numeric_only=True)

fig = px.line(
    data_frame=df_temp,
    y='TOTAL LIQUIDO',
    x=df_temp.index.year,
    title='Total de repasses efetuados aos estados por ano de 2007 a 09-2023',
    markers=True,  # px.line expects a boolean here; the old '.' only worked because it is truthy
)
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='x unified', yaxis_tickformat=',.2s')
fig.show()

x - Valor total dos repasses por mês (tendência, velocidade e aceleração)

In [None]:
# Monthly 'TOTAL LIQUIDO' for a single UF together with its 12-month moving average.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
uf_alvo = 'AP'
df_temp = df_e.copy()
# 'MS' (month-start bins) is the rule the 2023 cells use; the 'M' alias is
# deprecated in recent pandas releases.
df_temp = df_temp[df_temp.UF == uf_alvo].resample(rule='MS').sum(numeric_only=True)
df_temp['MEDIA MOVEL'] = df_temp['TOTAL LIQUIDO'].rolling(window=12).mean()

# The data is restricted to one UF, so the title names it instead of claiming
# to cover every state.
fig = px.line(
    data_frame=df_temp,
    y=['TOTAL LIQUIDO', 'MEDIA MOVEL'],
    x=df_temp.index,
    title=f'Total de repasses efetuados ao estado {uf_alvo} por mês de 01-2007 a 09-2023',
)
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='x unified', yaxis_tickformat=',.2f')
fig.show()

x - Estacionariedade e ruído

In [None]:
# Bar chart of the month-over-month change (first difference) of the monthly
# 'TOTAL LIQUIDO' sums.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
# 'MS' (month-start bins) is the rule the 2023 cells use; the lowercase 'm'
# alias is deprecated in recent pandas releases.
df_temp = df_temp.resample(rule='MS').sum(numeric_only=True)
df_temp['DIFERENCIAÇÃO'] = df_temp['TOTAL LIQUIDO'].diff(1)

fig = px.bar(data_frame=df_temp, x=df_temp.index, y=['DIFERENCIAÇÃO'])
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='x unified', yaxis_tickformat=',.2f')
fig.show()

x - Sazonalidade

In [None]:
# Average month-over-month change of 'TOTAL LIQUIDO' for each calendar month (1-12).
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
# 'MS' (month-start bins) is the rule the 2023 cells use; the lowercase 'm'
# alias is deprecated in recent pandas releases.
df_temp = df_temp.resample(rule='MS').sum(numeric_only=True)
# The 7-month moving average that used to be computed here was dropped: it was
# discarded by the reassignment below and never plotted.
df_temp['SAZONALIDADE'] = df_temp['TOTAL LIQUIDO'].diff(1)
# to_frame() hands plotly a real DataFrame with a 'SAZONALIDADE' column instead
# of relying on its implicit Series conversion.
df_temp = df_temp['SAZONALIDADE'].groupby(df_temp.index.month).mean().to_frame()

fig = px.bar(data_frame=df_temp, x=df_temp.index, y=['SAZONALIDADE'])
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='x unified', yaxis_tickformat=',.2f')
fig.show()

x - Média anual dos repasses por estados

In [None]:
# Mean of the yearly 'TOTAL LIQUIDO' sums for each UF, as a horizontal bar chart
# sorted ascending.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
# Step 1: yearly sums per UF. 'YS' (year-start bins) replaces the 'Y' alias,
# which is deprecated in recent pandas releases; the bin labels are not used below.
df_temp = (
    df_temp.groupby('UF')
    .resample(rule='YS')
    .sum(numeric_only=True)
    .reset_index(level=0)
)
# Step 2: average those yearly sums per UF.
df_temp = df_temp.groupby('UF').mean().sort_values(by='TOTAL LIQUIDO')

fig = px.bar(
    data_frame=df_temp,
    x='TOTAL LIQUIDO',
    y=df_temp.index,
    title='Média do total de repasses recebidos por ano para cada estado de 2007 a 10-2023',
)
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='y unified', xaxis_tickformat=',.2f')
fig.show()

x - Média anual dos repasses por fonte

In [None]:
# Mean of the yearly 'TOTAL LIQUIDO' sums for each REPASSE value, as a donut chart.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
# Step 1: yearly sums per REPASSE. 'YS' (year-start bins) replaces the 'Y' alias,
# which is deprecated in recent pandas releases; the bin labels are not used below.
df_temp = (
    df_temp.groupby('REPASSE')
    .resample(rule='YS')
    .sum(numeric_only=True)
    .reset_index(level=0)
)
# Step 2: average those yearly sums per REPASSE.
df_temp = df_temp.groupby('REPASSE').mean().sort_values(by='TOTAL LIQUIDO')


fig = px.pie(
    data_frame=df_temp,
    names=df_temp.index,
    values='TOTAL LIQUIDO',
    opacity=0.8,
    hole=0.5,
    title='Média do total de repasses recebidos por ano para cada fonte de 2007 a 09-2023',
)
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='y unified', xaxis_tickformat=',.2f')
fig.show()

## QUADRO DO PERÍODO ATUAL


x - Total de repasses por mês de 2023

In [None]:
# Monthly 'TOTAL LIQUIDO' sums for January-October 2023.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
# The year test is an equality: combined with month <= 10, the previous
# `>= 2023` would also pull in Jan-Oct of any later year present in the data.
# The mask is no longer named `filter`, which shadowed the Python builtin.
mask_2023 = (df_temp.index.year == 2023) & (df_temp.index.month <= 10)
df_temp = df_temp[mask_2023].resample(rule='MS').sum(numeric_only=True)
display(df_temp.head())

fig = px.line(data_frame=df_temp, x=df_temp.index, y='TOTAL LIQUIDO')
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='y unified', yaxis_tickformat=',.2f')
fig.show()

In [None]:
# Running (cumulative) monthly 'TOTAL LIQUIDO' for a single UF, January-October 2023.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
uf_alvo = 'AP'
df_temp = df_e.copy()
# The year test is an equality: combined with month <= 10, the previous
# `>= 2023` would also pull in Jan-Oct of any later year present in the data.
# The mask is no longer named `filter`, which shadowed the Python builtin.
mask_2023 = (df_temp.UF == uf_alvo) & (df_temp.index.year == 2023) & (df_temp.index.month <= 10)
df_temp = df_temp[mask_2023].resample(rule='MS').sum(numeric_only=True)
df_temp['SOMA CUMULATIVA'] = df_temp['TOTAL LIQUIDO'].cumsum()
# A bare `df_temp.head()` in the middle of a cell shows nothing; display() it.
display(df_temp.head())

# The title now describes what is plotted (cumulative sum, one UF, 2023); the
# old text was copied from the 2007-2023 monthly chart.
fig = px.bar(
    data_frame=df_temp,
    y=['SOMA CUMULATIVA'],
    x=df_temp.index,
    title=f'Soma cumulativa dos repasses efetuados ao estado {uf_alvo} por mês de 01-2023 a 10-2023',
)
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='x unified', yaxis_tickformat=',.2f')
fig.show()

x - Ranking dos estados com maior valor total de repasses no período

In [None]:
# Ranking of UFs by the total of state-level records for January-October 2023.
df_temp = df.copy().set_index(keys='COMPETÊNCIA')
# Equality on the year: with month <= 10, `>= 2023` would also include Jan-Oct
# of later years (the describe() output above shows data through 2024-12).
df_temp = df_temp.loc[(df_temp.index.year == 2023) & (df_temp.index.month <= 10)]
# ESFERA values are upper-case in the loaded frame ('ESTADUAL'), so the former
# 'Estadual' comparison selected no rows.
df_temp = df_temp[df_temp.ESFERA == 'ESTADUAL']
# The frame's total column is 'TOTAL'; it has no 'TOTAL LIQUIDO' column.
# 'YS' replaces the 'Y' alias, which is deprecated in recent pandas releases.
df_temp = (
    df_temp.groupby('UF')
    .resample(rule='YS')
    .sum(numeric_only=True)
    .reset_index(level=0)
    .sort_values(by='TOTAL')
)
display(df_temp.head())

fig = px.bar(data_frame=df_temp, y='UF', x='TOTAL')
fig.update_traces(hovertemplate=None)
# Horizontal bars: the numeric axis is x, so the number format goes on the
# x axis and hover is unified along y (as in the yearly-mean-per-UF chart).
fig.update_layout(hovermode='y unified', xaxis_tickformat=',.2f')
fig.show()

x - Ranking do valor total de repasses por fonte

In [None]:
# Ranking of the listed REPASSE values by their 'TOTAL LIQUIDO' sum for
# January-October 2023, largest first.
# NOTE(review): df_e is not created in any cell visible in this notebook —
# define it (or confirm where it comes from) before running this cell.
df_temp = df_e.copy()
sources = ['COUN_VAAF', 'COUN_VAAR', 'COUN_VAAT', 'FPE', 'FPM', 'ICMS', 'IPI', 'IPVA', 'ITCMD', 'ITR']
in_period = (df_temp.index.year == 2023) & (df_temp.index.month <= 10)
totals = df_temp[in_period].groupby('REPASSE').sum(numeric_only=True)
df_temp = totals.loc[sources, :].sort_values(by='TOTAL LIQUIDO', ascending=False)

fig = px.bar(data_frame=df_temp, x=df_temp.index.get_level_values(0), y='TOTAL LIQUIDO')
fig.update_traces(hovertemplate=None)
fig.update_layout(hovermode='x unified', yaxis_tickformat=',.2f')
fig.show()