In [88]:
import calendar
import os
import re
from itertools import groupby
from os import listdir, path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import figure

In [41]:
# Configuration variables
path_inmet = 'datasets/inmet' # path to the raw INMET files
path_output = 'datasets/agregados' # directory where the per-station spreadsheets are written
padronizar_dados = False # set to True to run the standardisation step (requires the INMET files)

In [43]:
## Helper functions that merge and standardise the datasets
# Positional indices of the CSV columns kept when a raw file is read (passed as `usecols` in concat_years).
colunas_importantes = [0, 1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 18]
# Labels for the metadata header rows; not referenced by any other code visible in this notebook.
index_metadados = ['REGIÃO', 'UF', 'ESTAÇÃO', 'CODIGO (WMO)', 'LATITUDE', 'LONGITUDE', 'ALTITUDE', 'DATA DE FUNDAÇÃO']

 #troca / por - e adiciona o 20 na frente
def formata_data(dt):
    """Rewrite a ``d/m/a`` date string as ``20a-m-d``.

    The three ``/``-separated parts are reordered and ``20`` is prepended to
    the last one. A string without any ``/`` is returned unchanged.
    """
    if '/' in dt:
        dia, mes, ano = dt.split('/')
        return '20' + ano + '-' + mes + '-' + dia
    return dt


def formata_data_hora(mi):
    """Normalise one ``(date, hour)`` index entry.

    ``/`` in the date becomes ``-``; the `` UTC`` suffix is removed from the
    hour, and a four-character hour such as ``1300`` is rewritten as ``13:00``.
    Returns the pair ``(date, hour)``.
    """
    data = mi[0].replace('/', '-')
    hora = mi[1].replace(' UTC', '')
    if len(hora) == 4:
        hora = f'{hora[:2]}:{hora[2:]}'
    return data, hora


def concat_years(code, file_list, output_dir):
    """Merge all the raw files of one station into a single CSV.

    Parameters
    ----------
    code : str
        Station code of the group. Not used by the body; kept so existing
        callers keep working.
    file_list : list of str
        Paths of the raw CSV files of the station. The name of the LAST entry
        provides the name of the output file.
    output_dir : str
        Directory that receives the merged file, named after fields 1-4 of the
        ``_``-separated file name (``<f1>_<f2>_<f3>_<f4>.csv``).

    The output file starts with a three-row table (LATITUDE, LONGITUDE,
    ALTITUDE) holding one column per input file, labelled with the first four
    characters of that file's first date (presumably the year). The
    concatenated sensor readings follow, sorted by (date, hour).
    """
    geo_frames = []     # one single-column frame of geographic data per file
    sensor_frames = []  # one frame of sensor readings per file
    for file in file_list:
        # Sensor readings: the table header comes after 8 skipped lines.
        df = pd.read_csv(
            file, skiprows=8, encoding='latin_1', sep=';', decimal=',',
            usecols=colunas_importantes, index_col=[0, 1], na_values=[-9999],
        )
        # The fifth kept data column gets a fixed, ASCII-friendly label.
        df = df.rename_axis(['Data', 'Hora']).rename(
            columns={df.columns[4]: 'RADIACAO GLOBAL (KJ/m²)'}
        )
        sensor_frames.append(df)

        # Geographic data: second field of the three lines after the first four.
        md = pd.read_csv(
            file, encoding='latin_1', sep=';', decimal=',', skiprows=4, nrows=3,
            header=None, usecols=[1], na_values=['F'], names=[df.index[0][0][:4]],
        )
        geo_frames.append(md)

    # One column per input file, rows labelled by what they hold.
    md = pd.concat(geo_frames, axis=1, copy=False)
    md.index = ['LATITUDE', 'LONGITUDE', 'ALTITUDE']

    # Stack the readings, mark the -9999 sentinel as missing and tidy the index.
    df = pd.concat(sensor_frames, copy=False).replace(-9999, np.nan)
    df.index = df.index.map(formata_data_hora)

    # Build the output name from the file name only: splitting the whole path
    # on '_' would also split directory names that contain an underscore.
    station = '_'.join(os.path.basename(file_list[-1]).split('_')[1:5])
    out_path = f'{output_dir}/{station}.csv'
    md.to_csv(out_path, sep=';')
    df.sort_index().to_csv(out_path, sep=';', mode='a')

    
def unify_data(inmet_dir, output_dir):
    """Group the raw spreadsheets by station code and merge each group.

    Parameters
    ----------
    inmet_dir : str
        Directory whose sub-folders hold the raw ``.CSV`` files. A sub-folder
        may wrap its files in an inner folder of the same name.
    output_dir : str
        Directory handed to ``concat_years`` for the merged files.

    The station code is the first ``_<letter><3 digits>_`` fragment of the
    FILE NAME. Files ending in ``.CSV`` without such a fragment are reported
    and skipped. ``'<code> OK'`` is printed after each station is merged.
    """
    code_pattern = re.compile('_([A-Z][0-9]{3})_')

    def station_code(file_path):
        # Match on the file name only, so a directory name can never supply a
        # false station code.
        match = code_pattern.search(os.path.basename(file_path))
        return match.group(1) if match else None

    # Collect the path of every usable spreadsheet.
    # os.path is referenced explicitly so the function keeps working even if
    # the notebook-level name `path` is rebound to something else.
    arquivos = []
    for folder in listdir(inmet_dir):
        c = f'{inmet_dir}/{folder}'
        if not os.path.isdir(c):
            continue
        if os.path.isdir(f'{c}/{folder}'):
            c = f'{c}/{folder}'
        for a in listdir(c):
            if not a.endswith('.CSV'):
                continue
            if station_code(a) is None:
                print(f'{c}/{a} SKIPPED (no station code in file name)')
                continue
            arquivos.append(f'{c}/{a}')

    # groupby needs its input sorted by the grouping key; inside each group the
    # paths are sorted so that concat_years receives them in a stable order.
    for k, grupo in groupby(sorted(arquivos, key=station_code), station_code):
        concat_years(k, sorted(grupo), output_dir)
        print(f'{k} OK')

In [154]:
 #retorna maximos e mínimos para cada coluna em todos os arquivos
def checking_bounds(path):
    """Return the minimum and maximum of every column for each station file.

    Parameters
    ----------
    path : str
        Directory holding the aggregated station files. A trailing separator
        is optional. Entries that do not end in ``.csv`` are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per file (file name without extension), sorted by row label,
        with the columns interleaved as ``MIN - <col>``, ``MAX - <col>``.
        The first four lines of each file are skipped before reading.
    """
    minimos, maximos = [], []
    for file in listdir(path):
        # Only station CSVs; other entries (e.g. checkpoint folders) would
        # make read_csv fail.
        if not file.lower().endswith('.csv'):
            continue
        nome = os.path.splitext(file)[0]
        data = pd.read_csv(os.path.join(path, file), sep=';', index_col=[0, 1], skiprows=4)
        minimos.append(data.min().rename(nome))
        maximos.append(data.max().rename(nome))

    # One frame per statistic, columns prefixed with the statistic's name.
    tabelas = {}
    for k, v in (('min', minimos), ('max', maximos)):
        df = pd.DataFrame(v)
        df.columns = [f'{k.upper()} - {c}' for c in df.columns]
        tabelas[k] = df

    # Interleave the columns so each MIN sits next to its MAX.
    colunas = [c for par in zip(tabelas['min'].columns, tabelas['max'].columns) for c in par]
    return pd.concat(list(tabelas.values()), axis=1).reindex(columns=colunas).sort_index()

###dados.apply(lambda x: x.groupby(x.notna().cumsum()).cumcount().max()).rename(local)

In [134]:
def days_off(file):
    """Flag, for every date in a station file, whether the station was off.

    Parameters
    ----------
    file : str
        Path of an aggregated station CSV (its first four lines are skipped).

    Returns
    -------
    pandas.Series
        Boolean series indexed by date (first index level). A date is True
        when exactly 24 of its rows have every column missing. The series is
        named after the file, without directory and extension.
    """
    leituras = pd.read_csv(file, sep=';', index_col=[0, 1], skiprows=4)
    # Vectorised row check: True where every column of the row is missing.
    sem_dados = leituras.isna().all(axis=1)
    # Count the fully-missing rows of each date and compare with 24.
    data = sem_dados.groupby(level=0).sum() == 24
    data.name = os.path.splitext(os.path.basename(file))[0]
    return data


def percentage_off_per_year(path):
    """Return, per station and year, the share of days the station was off.

    Parameters
    ----------
    path : str
        Directory holding the aggregated station files. A trailing separator
        is optional. Entries that do not end in ``.csv`` are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per station (sorted by label) and one column per year, the
        year being the text before the first ``-`` of each date. Each value is
        the number of off days divided by the number of dates of that year in
        the combined table, i.e. a fraction in [0, 1], not a 0-100 percentage.
        It is missing when the station has no data at all for that year.
    """
    arquivos = [file for file in listdir(path) if file.lower().endswith('.csv')]
    stations = pd.concat([days_off(os.path.join(path, file)) for file in arquivos], axis=1)
    por_ano = stations.groupby(lambda x: x.split('-')[0])
    # min_count=1 keeps an all-missing year as missing instead of turning it into 0.
    return por_ano.apply(lambda x: x.sum(min_count=1) / len(x)).T.sort_index()

In [44]:
# Rebuild the per-station files only when enabled through `padronizar_dados`.
if padronizar_dados:
    unify_data(path_inmet, path_output)

A001 OK
A002 OK
A003 OK
A005 OK
A009 OK
A010 OK
A011 OK
A012 OK
A013 OK
A014 OK
A015 OK
A016 OK
A017 OK
A018 OK
A019 OK
A020 OK
A021 OK
A022 OK
A023 OK
A024 OK
A025 OK
A026 OK
A027 OK
A028 OK
A029 OK
A031 OK
A032 OK
A033 OK
A034 OK
A035 OK
A036 OK
A037 OK
A038 OK
A039 OK
A040 OK
A041 OK
A042 OK
A043 OK
A044 OK
A045 OK
A046 OK
A047 OK
A048 OK
A049 OK
A050 OK
A051 OK
A052 OK
A053 OK
A054 OK
A055 OK
A056 OK
A101 OK
A102 OK
A104 OK
A108 OK
A109 OK
A110 OK
A111 OK
A112 OK
A113 OK
A117 OK
A119 OK
A120 OK
A121 OK
A122 OK
A123 OK
A124 OK
A125 OK
A126 OK
A128 OK
A133 OK
A134 OK
A135 OK
A136 OK
A137 OK
A138 OK
A140 OK
A144 OK
A201 OK
A202 OK
A203 OK
A204 OK
A205 OK
A206 OK
A207 OK
A209 OK
A210 OK
A211 OK
A212 OK
A213 OK
A214 OK
A215 OK
A216 OK
A217 OK
A218 OK
A219 OK
A220 OK
A221 OK
A222 OK
A223 OK
A224 OK
A225 OK
A226 OK
A227 OK
A228 OK
A229 OK
A230 OK
A231 OK
A232 OK
A233 OK
A234 OK
A235 OK
A236 OK
A237 OK
A238 OK
A239 OK
A240 OK
A241 OK
A242 OK
A244 OK
A246 OK
A247 OK
A248 OK
A249 OK
A250 OK


In [155]:
# Per-station minimum and maximum of every column, also saved to disk.
# The folder is taken from `path_output` (configuration cell) instead of a
# second hard-coded copy, so changing the configuration keeps this cell in sync.
bounds = checking_bounds(f'{path_output}/')
bounds.to_csv('datasets/extremos.csv', sep=';')
bounds

Unnamed: 0,"MIN - PRECIPITAÇÃO TOTAL, HORÁRIO (mm)","MAX - PRECIPITAÇÃO TOTAL, HORÁRIO (mm)","MIN - PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)","MAX - PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)",MIN - PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB),MAX - PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB),MIN - PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB),MAX - PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB),MIN - RADIACAO GLOBAL (KJ/m²),MAX - RADIACAO GLOBAL (KJ/m²),MIN - TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C),MAX - TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C),MIN - TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C),MAX - TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C),MIN - UMIDADE REL. MAX. NA HORA ANT. (AUT) (%),MAX - UMIDADE REL. MAX. NA HORA ANT. (AUT) (%),MIN - UMIDADE REL. MIN. NA HORA ANT. (AUT) (%),MAX - UMIDADE REL. MIN. NA HORA ANT. (AUT) (%),"MIN - VENTO, VELOCIDADE HORARIA (m/s)","MAX - VENTO, VELOCIDADE HORARIA (m/s)"
CO_DF_A001_BRASILIA,0.0,70.8,863.4,1023.5,865.3,1008.1,862.8,1007.7,0.0,43969.0,8.6,36.5,7.6,34.4,11.0,100.0,10.0,100.0,0.0,10.4
CO_DF_A042_BRAZLANDIA,0.0,96.0,880.3,897.5,880.8,897.6,880.3,897.3,0.0,4602.3,8.6,36.0,7.9,34.3,12.0,98.0,10.0,98.0,0.0,9.7
CO_DF_A045_AGUAS EMENDADAS,0.0,68.8,889.4,910.4,889.7,910.5,889.4,910.4,0.0,4446.3,5.9,37.8,5.2,35.7,10.0,100.0,9.0,100.0,0.1,8.0
CO_DF_A046_GAMA (PONTE ALTA),0.0,55.2,896.2,914.7,896.6,914.8,896.1,914.5,0.0,4293.2,6.4,37.3,4.9,35.7,10.0,96.0,8.0,96.0,0.0,12.7
CO_DF_A047_PARANOA (COOPA-DF),0.0,70.2,890.9,908.9,890.9,909.0,890.5,908.7,0.0,4292.9,8.1,37.1,7.6,35.3,13.0,100.0,10.0,100.0,0.1,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S_SC_A867_ARARANGUA,0.0,73.0,995.5,1035.2,996.5,1035.5,994.7,1035.1,0.0,5303.6,0.6,38.6,-0.4,36.7,7.0,100.0,12.0,100.0,0.0,17.0
S_SC_A868_ITAJAI,0.0,61.4,997.8,1033.8,998.4,1033.9,997.7,1033.1,0.0,4610.0,2.0,39.0,1.3,37.8,23.0,100.0,18.0,100.0,0.0,11.4
S_SC_A870_RANCHO QUEIMADO,0.0,44.4,901.6,932.3,902.0,932.5,900.9,932.3,0.0,4501.8,-2.7,44.5,-3.6,33.3,22.0,100.0,9.0,100.0,0.1,9.0
S_SC_A895_CHAPECO,0.0,38.0,924.0,951.0,925.7,951.0,923.7,950.8,0.0,4109.2,-0.6,36.8,-1.5,35.1,18.0,98.0,11.0,98.0,0.1,16.1


In [135]:
# Share of days each station spent off, per year, also saved to disk.
# The folder is taken from `path_output` (configuration cell) instead of a
# second hard-coded copy; the output name is a plain string (it has no placeholders).
df = percentage_off_per_year(f'{path_output}/')
df.to_csv('datasets/%_dias_off.csv', sep=';')
df

datasets/agregados/NE_CE_A315_BARBALHA.csv
datasets/agregados/N_PA_A213_TOME ACU.csv
datasets/agregados/CO_MS_S705_BRASILANDIA.csv
datasets/agregados/N_AC_A137_MARECHAL THAUMATURGO.csv
datasets/agregados/NE_CE_A360_ACARAU.csv
datasets/agregados/NE_AL_A371_PIRANHAS.csv
datasets/agregados/NE_RN_A340_APODI.csv
datasets/agregados/CO_DF_A042_BRAZLANDIA.csv
datasets/agregados/N_RO_A938_VILHENA.csv
datasets/agregados/NE_MA_A225_IMPERATRIZ.csv
datasets/agregados/CO_MT_A902_TANGARA DA SERRA.csv
datasets/agregados/SE_SP_A727_LINS.csv
datasets/agregados/S_PR_A824_ICARAIMA.csv
datasets/agregados/SE_ES_A631_ECOPORANGA.csv
datasets/agregados/S_RS_A804_SANTANA DO LIVRAMENTO.csv
datasets/agregados/N_TO_A049_COLINAS DO TOCANTINS.csv
datasets/agregados/NE_PB_A310_AREIA.csv
datasets/agregados/S_SC_A814_URUSSANGA.csv
datasets/agregados/S_RS_A879_CANELA.csv
datasets/agregados/N_AM_A110_BOCA DO ACRE.csv
datasets/agregados/NE_BA_A442_EUCLIDES DA CUNHA.csv
datasets/agregados/NE_MA_A206_CHAPADINHA.csv
datasets

datasets/agregados/SE_MG_A520_CONCEICAO DAS ALAGOAS.csv
datasets/agregados/SE_MG_A564_DIVINOPOLIS.csv
datasets/agregados/NE_CE_A305_FORTALEZA.csv
datasets/agregados/CO_MS_S703_BANDEIRANTES.csv
datasets/agregados/NE_MA_A237_CAXIAS.csv
datasets/agregados/S_PR_B806_COLOMBO.csv
datasets/agregados/CO_MS_S708_FATIMA DO SUL.csv
datasets/agregados/S_RS_A803_SANTA MARIA.csv
datasets/agregados/S_RS_A838_CAMAQUA.csv
datasets/agregados/N_TO_A040_MATEIROS.csv
datasets/agregados/SE_MG_A562_PATOS DE MINAS.csv
datasets/agregados/CO_MS_A710_PARANAIBA.csv
datasets/agregados/SE_MG_A516_PASSOS.csv
datasets/agregados/SE_MG_A566_ARACUAI.csv
datasets/agregados/S_SC_A868_ITAJAI.csv
datasets/agregados/N_PA_A241_CONCEICAO DO ARAGUAIA.csv
datasets/agregados/NE_PI_A338_ESPERANTINA.csv
datasets/agregados/NE_PB_A373_ITAPORANGA.csv
datasets/agregados/SE_SP_A716_OURINHOS.csv
datasets/agregados/CO_MT_A936_SALTO DO CEU.csv
datasets/agregados/S_RS_A889_SAO VICENTE DO SUL.csv
datasets/agregados/NE_PE_A307_PETROLINA.csv
d

datasets/agregados/SE_MG_A518_JUIZ DE FORA.csv
datasets/agregados/CO_MS_A760_COSTA RICA.csv
datasets/agregados/SE_MG_A523_PATROCINIO.csv
datasets/agregados/N_AM_A120_AUTAZES.csv
datasets/agregados/N_PA_A240_MARABA.csv
datasets/agregados/NE_MA_A218_PREGUICAS.csv
datasets/agregados/NE_PE_A322_GARANHUNS.csv
datasets/agregados/CO_GO_A056_CRISTALINA (FAZENDA SANTA MONICA).csv
datasets/agregados/N_PA_A233_SANTANA DO ARAGUAIA.csv
datasets/agregados/S_RS_A812_CACAPAVA DO SUL.csv
datasets/agregados/CO_MT_A943_SERRA NOVA DOURADA.csv
datasets/agregados/S_RS_A882_TEUTONIA.csv
datasets/agregados/N_TO_A019_GURUPI.csv
datasets/agregados/S_PR_A825_GOIOERE.csv
datasets/agregados/NE_PI_A308_PARNAIBA.csv
datasets/agregados/CO_GO_A028_IPORA.csv
datasets/agregados/NE_RN_A344_CALCANHAR.csv
datasets/agregados/SE_RJ_A603_DUQUE DE CAXIAS - XEREM.csv
datasets/agregados/CO_MT_A934_ALTO TAQUARI.csv
datasets/agregados/SE_SP_A708_FRANCA.csv
datasets/agregados/SE_MG_A527_TEOFILO OTONI.csv
datasets/agregados/N_RO_A94

datasets/agregados/NE_PB_A334_MONTEIRO.csv
datasets/agregados/S_RS_A887_CAPAO DO LEAO (PELOTAS).csv
datasets/agregados/NE_PI_A336_ALVORADA DO GURGUEIA.csv
datasets/agregados/S_RS_A802_RIO GRANDE.csv
datasets/agregados/NE_BA_A410_ILHEUS.csv
datasets/agregados/SE_SP_A744_BRAGANCA PAULISTA.csv
datasets/agregados/SE_SP_A725_AVARE.csv
datasets/agregados/N_PA_A235_NOVO REPARTIMENTO.csv
datasets/agregados/S_SC_A863_ITUPORANGA.csv
datasets/agregados/N_AM_S121_SANTA ISABEL DO RIO NEGRO.csv
datasets/agregados/S_RS_A837_SOLEDADE.csv
datasets/agregados/SE_MG_A533_GUANHAES.csv
datasets/agregados/CO_MT_A932_GUIRATINGA.csv
datasets/agregados/S_RS_A826_ALEGRETE.csv
datasets/agregados/S_PR_A819_CASTRO.csv
datasets/agregados/S_RS_A883_IBIRUBA.csv
datasets/agregados/CO_GO_A036_CRISTALINA.csv
datasets/agregados/CO_MT_A944_ROSARIO OESTE.csv
datasets/agregados/N_PA_A252_DOM ELISEU.csv
datasets/agregados/NE_SE_A453_NOSSA SENHORA DA GLORIA.csv
datasets/agregados/SE_MG_A552_SALINAS.csv
datasets/agregados/SE_SP

Data,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
NE_CE_A315_BARBALHA,,,,,,,,0.002740,0.000000,0.000000,...,0.000000,0.000000,0.060274,0.000000,0.000000,0.452055,0.000000,0.000000,0.000000,0.446575
N_PA_A213_TOME ACU,,,,,,,,0.021918,0.016393,0.049315,...,0.000000,0.000000,0.115068,0.449315,0.008197,0.000000,0.000000,0.000000,0.000000,0.049315
CO_MS_S705_BRASILANDIA,,,,,,,,,,,...,,,,,,,0.043836,0.421918,1.000000,1.000000
N_AC_A137_MARECHAL THAUMATURGO,,,,,,,,,,0.002740,...,0.120219,0.139726,0.887671,0.000000,0.046448,0.520548,0.490411,0.095890,0.617486,0.969863
NE_CE_A360_ACARAU,,,,,,,,,,0.000000,...,0.000000,0.000000,0.106849,0.000000,0.166667,0.000000,0.000000,0.191781,0.860656,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NE_PI_A330_PAULISTANA,,,,,,,,0.008219,0.000000,0.000000,...,0.000000,0.000000,0.013699,0.068493,0.000000,0.063014,0.000000,0.000000,0.000000,0.000000
SE_MG_A534_AIMORES,,,,,,,,0.010959,0.000000,0.000000,...,0.472678,0.000000,0.000000,0.000000,0.000000,0.005479,0.169863,0.000000,0.661202,0.000000
SE_MG_A537_DIAMANTINA,,,,,,,,0.008219,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005479,0.000000,0.000000,0.000000
S_SC_A806_FLORIANOPOLIS,,,,0.167123,0.491803,0.380822,0.052055,0.071233,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [197]:
# Build one row per station: its coordinates plus the identification encoded
# in the file name.
# The directory gets its own variable: rebinding `path` would hide the `path`
# module imported from `os` at the top of the notebook.
agregados_dir = f'{path_output}/'
todos = []
for file in listdir(agregados_dir):
    if not file.lower().endswith('.csv'):
        continue  # ignore anything that is not an aggregated station file
    # Top table of the file (header + 3 rows); keep only its last column.
    data = pd.read_csv(agregados_dir + file, sep=';', index_col=[0], nrows=3).iloc[:, -1].copy()
    # The file name is split on '_' into region, state, code and name.
    loc = file[:-4].split('_')
    data['REGIÃO'] = loc[0]
    data['ESTADO'] = loc[1]
    data['NOME'] = '_'.join(loc[3:])  # keeps names that themselves contain '_'
    data['CODIGO'] = loc[2]
    todos.append(data)
coords = pd.concat(todos, axis=1).T
coords

# Map of Brazil: density map of the station coordinates held in `coords`,
# showing name, code and state on hover.
import plotly.express as px
fig = px.density_mapbox(coords, 
                        lat='LATITUDE', 
                        lon='LONGITUDE', 
                        radius=5,
                        center={'lat':-15, 'lon':-60}, 
                        zoom=3,
                        hover_data=["NOME", "CODIGO", "ESTADO"],
                        height=800,
                        mapbox_style="carto-darkmatter"
                       )

# NOTE(review): `geo_scope` configures the `geo` layout; confirm it has any
# effect on a figure created with density_mapbox.
fig.update_layout(
    title_text = 'Estações INMET',
    geo_scope = 'south america'                 # original note: split into regions
)
fig.show()