In [1]:
import pandas as pd
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
file_name = "data_2024_prefudi.csv"

# The export is semicolon-delimited and Latin-1 encoded; rows the parser
# cannot handle are skipped rather than aborting the load.
read_options = dict(sep=';', encoding='latin-1', on_bad_lines='skip')
df = pd.read_csv(file_name, **read_options)

print("File loaded successfully. First 5 rows:")
print(df.head())


File loaded successfully. First 5 rows:
   ANOEMPENHO           DTLANCAMENTO  EMPENHO    FONTE  \
0        2024  2024-01-26 00:00:00.0    10000  1500000   
1        2024  2024-01-26 00:00:00.0    10002  1500000   
2        2024  2024-01-26 00:00:00.0    10004  1500000   
3        2024  2024-01-26 00:00:00.0    10005  1500000   
4        2024  2024-01-26 00:00:00.0    10006  1500000   

                FORNECEDOR       FUNCIONALPROGRAMATICA LICITACAO  \
0  MUNICIPIO DE UBERLANDIA  4 122 7001 999 2230 319011       NaN   
1  MUNICIPIO DE UBERLANDIA  4 122 7001 999 2230 319011       NaN   
2  MUNICIPIO DE UBERLANDIA  4 122 7001 999 2230 319011       NaN   
3  MUNICIPIO DE UBERLANDIA  4 122 7001 999 2669 319011       NaN   
4  MUNICIPIO DE UBERLANDIA  4 122 7001 999 2669 339049       NaN   

                            NOMEORGAO OBJETOSERVICO  \
0  PREFEITURA MUNICIPAL DE UBERLÂNDIA           NaN   
1  PREFEITURA MUNICIPAL DE UBERLÂNDIA           NaN   
2  PREFEITURA MUNICIPAL DE UBERLÂNDIA

In [3]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be passed to print(): doing so emits the report first and then a
# misleading "DataFrame info: None" line. Print the label, then call info().
print("DataFrame info:")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32323 entries, 0 to 32322
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ANOEMPENHO             32323 non-null  int64 
 1   DTLANCAMENTO           32323 non-null  object
 2   EMPENHO                32323 non-null  int64 
 3   FONTE                  32323 non-null  int64 
 4   FORNECEDOR             32323 non-null  object
 5   FUNCIONALPROGRAMATICA  32323 non-null  object
 6   LICITACAO              22321 non-null  object
 7   NOMEORGAO              32323 non-null  object
 8   OBJETOSERVICO          22693 non-null  object
 9   SECRETARIA             32323 non-null  object
 10  VALORANULADO           32323 non-null  object
 11  VALOREMPENHADO         32323 non-null  object
 12  VALORLIQUIDADO         32323 non-null  object
 13  VALORPAGO              32323 non-null  object
 14  VALORRETORNADO         32323 non-null  object
dtypes: int64(3), object

In [4]:
# List each distinct value of the SECRETARIA column, in order of first appearance.
secretarias = df['SECRETARIA'].unique()
print(secretarias)

['SECRETARIA MUNICIPAL DE ADMINISTRAÇÃO'
 'SECRETARIA MUNICIPAL DE FINANÇAS' 'SECRETARIA MUNICIPAL DE EDUCAÇÃO'
 'SECRETARIA MUNICIPAL DE CULTURA E TURISMO'
 'SECRETARIA MUNICIPAL DE SAÚDE'
 'SECRETARIA MUN DE DESENVOLVIMENTO SOCIAL'
 'SECRETARIA MUN DE AGRONEGÓCIO, ECONOMIA E INOVAÇÃO'
 'SECRETARIA MUNICIPAL DE OBRAS'
 'SECRETARIA MUNICIPAL DE TRÂNSITO E TRANSPORTES'
 'SECRETARIA MUNICIPAL DE SERVIÇOS URBANOS'
 'SECRETARIA MUNICIPAL DA JUVENTUDE'
 'SECRETARIA MUNICIPAL DE GESTÃO ESTRATÉGICA'
 'SECRETARIA MUN DE SEGURANÇA INTEGRADA'
 'SECRETARIA MUN DE MEIO AMBIENTE E SUSTENTABILIDADE'
 'PROCURADORIA GERAL DO MUNICIPIO'
 'SECRETARIA MUNICIPAL DE PLANEJAMENTO URBANO'
 'CONTROLADORIA-GERAL DO MUNICÍPIO' 'SECRETARIA MUNICIPAL DE HABITAÇÃO'
 'SECRETARIA MUNICIPAL DE GOVERNO E COMUNICAÇÃO']


In [5]:
# Keep only the rows that belong to the education department.
# .copy() makes the subset an independent frame: later cells add and overwrite
# columns on df_educacao, and doing that on a boolean-mask slice of df is the
# chained-assignment pattern pandas warns about (SettingWithCopyWarning).
df_educacao = df[df['SECRETARIA'] == 'SECRETARIA MUNICIPAL DE EDUCAÇÃO'].copy()
print(f"Dataframe para secretária da edução com {len(df_educacao)} linhas")

Dataframe para secretária da edução com 10194 linhas


In [6]:
# DataFrame.info() prints its own report and returns None; passing it to print()
# made the report appear before the label and added a stray "None" line.
# Emit the label, the report, and the preview in that order instead.
print("DataFrame info:")
df_educacao.info()
print(df_educacao.head())

<class 'pandas.core.frame.DataFrame'>
Index: 10194 entries, 8 to 32302
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ANOEMPENHO             10194 non-null  int64 
 1   DTLANCAMENTO           10194 non-null  object
 2   EMPENHO                10194 non-null  int64 
 3   FONTE                  10194 non-null  int64 
 4   FORNECEDOR             10194 non-null  object
 5   FUNCIONALPROGRAMATICA  10194 non-null  object
 6   LICITACAO              4903 non-null   object
 7   NOMEORGAO              10194 non-null  object
 8   OBJETOSERVICO          4933 non-null   object
 9   SECRETARIA             10194 non-null  object
 10  VALORANULADO           10194 non-null  object
 11  VALOREMPENHADO         10194 non-null  object
 12  VALORLIQUIDADO         10194 non-null  object
 13  VALORPAGO              10194 non-null  object
 14  VALORRETORNADO         10194 non-null  object
dtypes: int64(3), object(12)


In [7]:
# Replace the text timestamps in DTLANCAMENTO with parsed datetime values
# (format is inferred by pandas; none is specified here).
df_educacao['DTLANCAMENTO'] = pd.to_datetime(
    arg=df_educacao['DTLANCAMENTO'],
)

In [8]:
# Split the parsed timestamp into separate calendar-date and time-of-day columns.
lancamento_dt = df_educacao['DTLANCAMENTO'].dt
df_educacao['DATA_LANCAMENTO'] = lancamento_dt.date
df_educacao['HORA_LANCAMENTO'] = lancamento_dt.time

In [9]:
# Non-null count of every column per launch date, busiest days first
# (ordered by the DTLANCAMENTO count, which has no missing values).
(
    df_educacao
    .groupby('DATA_LANCAMENTO')
    .count()
    .reset_index()
    .sort_values(by='DTLANCAMENTO', ascending=False)
)

Unnamed: 0,DATA_LANCAMENTO,ANOEMPENHO,DTLANCAMENTO,EMPENHO,FONTE,FORNECEDOR,FUNCIONALPROGRAMATICA,LICITACAO,NOMEORGAO,OBJETOSERVICO,SECRETARIA,VALORANULADO,VALOREMPENHADO,VALORLIQUIDADO,VALORPAGO,VALORRETORNADO,HORA_LANCAMENTO
4,2024-01-15,1502,1502,1502,1502,1502,1502,20,1502,20,1502,1502,1502,1502,1502,1502,1502
8,2024-01-19,1016,1016,1016,1016,1016,1016,200,1016,201,1016,1016,1016,1016,1016,1016,1016
157,2024-09-02,263,263,263,263,263,263,263,263,263,263,263,263,263,263,263,263
74,2024-04-26,203,203,203,203,203,203,96,203,96,203,203,203,203,203,203,203
9,2024-01-22,185,185,185,185,185,185,184,185,184,185,185,185,185,185,185,185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,2024-07-25,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
118,2024-07-02,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
200,2024-11-04,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
228,2024-12-18,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [10]:
# Preview the first five rows, including the two derived date/time columns.
df_educacao.head(n=5)

Unnamed: 0,ANOEMPENHO,DTLANCAMENTO,EMPENHO,FONTE,FORNECEDOR,FUNCIONALPROGRAMATICA,LICITACAO,NOMEORGAO,OBJETOSERVICO,SECRETARIA,VALORANULADO,VALOREMPENHADO,VALORLIQUIDADO,VALORPAGO,VALORRETORNADO,DATA_LANCAMENTO,HORA_LANCAMENTO
8,2024,2024-01-26,10015,1500000,MUNICIPIO DE UBERLANDIA,12 122 2005 999 2302 319011,,PREFEITURA MUNICIPAL DE UBERLÂNDIA,,SECRETARIA MUNICIPAL DE EDUCAÇÃO,"R$ 0,0","R$ 5.620,76","R$ 5.620,76","R$ 5.620,76","R$ 0,0",2024-01-26,00:00:00
9,2024,2024-01-26,10016,1500000,MUNICIPIO DE UBERLANDIA,12 122 2005 999 2302 319011,,PREFEITURA MUNICIPAL DE UBERLÂNDIA,,SECRETARIA MUNICIPAL DE EDUCAÇÃO,"R$ 0,0","R$ 14.535,12","R$ 14.535,12","R$ 14.535,12","R$ 0,0",2024-01-26,00:00:00
10,2024,2024-01-26,10017,1500000,MUNICIPIO DE UBERLANDIA,12 361 2001 999 2295 319011,,PREFEITURA MUNICIPAL DE UBERLÂNDIA,,SECRETARIA MUNICIPAL DE EDUCAÇÃO,"R$ 0,0","R$ 16.833,09","R$ 16.833,09","R$ 16.833,09","R$ 0,0",2024-01-26,00:00:00
11,2024,2024-01-26,10023,1500000,MUNICIPIO DE UBERLANDIA,12 361 2001 999 2295 319011,,PREFEITURA MUNICIPAL DE UBERLÂNDIA,,SECRETARIA MUNICIPAL DE EDUCAÇÃO,"R$ 0,0","R$ 256.688,09","R$ 256.688,09","R$ 256.688,09","R$ 0,0",2024-01-26,00:00:00
12,2024,2024-01-26,10026,1500000,MUNICIPIO DE UBERLANDIA,12 365 2002 999 2301 319011,,PREFEITURA MUNICIPAL DE UBERLÂNDIA,,SECRETARIA MUNICIPAL DE EDUCAÇÃO,"R$ 0,0","R$ 2.709,52","R$ 2.709,52","R$ 2.709,52","R$ 0,0",2024-01-26,00:00:00


In [11]:
# Work on a separate copy so df_educacao keeps the raw currency strings.
df_educacao_trat = df_educacao.copy()

# Monetary columns stored as text such as "R$ 5.620,76".
# NOTE(review): VALORANULADO and VALORRETORNADO use the same text format in the
# preview but are not converted here -- confirm whether that is intentional.
column_mapping_convert_float = ['VALOREMPENHADO', 'VALORLIQUIDADO', 'VALORPAGO']

for nome_coluna in column_mapping_convert_float:
    df_educacao_trat[nome_coluna] = (
        df_educacao_trat[nome_coluna]
        .astype(str)
        .str.replace(r'R\$ ?', '', regex=True)   # drop the currency prefix
        .str.strip()
        .str.replace(r'\.', '', regex=True)      # remove thousands separators
        .str.replace(r',', '.', regex=True)      # decimal comma -> decimal point
        .astype(float)
    )

df_educacao_trat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10194 entries, 8 to 32302
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ANOEMPENHO             10194 non-null  int64         
 1   DTLANCAMENTO           10194 non-null  datetime64[ns]
 2   EMPENHO                10194 non-null  int64         
 3   FONTE                  10194 non-null  int64         
 4   FORNECEDOR             10194 non-null  object        
 5   FUNCIONALPROGRAMATICA  10194 non-null  object        
 6   LICITACAO              4903 non-null   object        
 7   NOMEORGAO              10194 non-null  object        
 8   OBJETOSERVICO          4933 non-null   object        
 9   SECRETARIA             10194 non-null  object        
 10  VALORANULADO           10194 non-null  object        
 11  VALOREMPENHADO         10194 non-null  float64       
 12  VALORLIQUIDADO         10194 non-null  float64       
 13  VALORP

In [None]:
from skimpy import skim


In [13]:
import pandas as pd
from skimpy import skim

# Tidy the frame before profiling it with skimpy:
#   1. discard text columns that contain no values at all;
#   2. renumber the rows from zero (the filtered frame still carries the row
#      labels of the full dataset).
string_cols = df_educacao_trat.select_dtypes(include=['object', 'string']).columns
all_nan_cols = list(
    filter(lambda name: df_educacao_trat[name].isna().all(), string_cols)
)

if len(all_nan_cols) > 0:
    print(f"Dropping string columns that are completely empty: {all_nan_cols}")
    df_educacao_trat = df_educacao_trat.drop(columns=all_nan_cols)

# drop=True discards the old index instead of keeping it as an extra column.
df_educacao_trat_cleaned = df_educacao_trat.reset_index(drop=True)

# Profile the tidied frame.
print("Running skimpy on the cleaned DataFrame...")
skim(df_educacao_trat_cleaned)



Running skimpy on the cleaned DataFrame...


In [14]:
# Number of distinct suppliers among the education-department rows.
fornecedores_distintos = df_educacao_trat['FORNECEDOR'].unique()
len(fornecedores_distintos)

3766