# Alex irá documentar

In [None]:
#Importação de pacotes
import pandas as pd
import numpy as np
import re
from itertools import groupby
from os import listdir, path
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import calendar

In [13]:
#definicões de variáveis
path_inmet = 'datasets/inmet' #caminho para os arquivos do inmet
path_output = 'datasets/agregados' #caminho para onde deseja que as planilhas por estação sejam inseridas
padronizar_dados = True #defina true se deseja que a padronização seja feita (precisa dos arquivos do inmet)

In [14]:
## Conjunto de funções para unificar e padronizar os datasets
colunas_importantes = [0, 1, 2, 6, 9, 10, 13, 14, 18]
index_metadados = ['REGIÃO', 'UF', 'ESTAÇÃO', 'CODIGO (WMO)', 'LATITUDE', 'LONGITUDE', 'ALTITUDE', 'DATA DE FUNDAÇÃO']

 #troca / por - e adiciona o 20 na frente
def formata_data(dt):
    if '/' not in dt: return dt
    d, m, a = dt.split('/')
    return f'20{a}-{m}-{d}'


#troca / por - e remove 'UTC' das strings de hora
def formata_data_hora(mi):
    data, hora = mi[0].replace('/', '-'), mi[1].replace(' UTC', '')
    if len(hora) == 4:  hora = hora[:2] + ':' + hora[2:]
    return data, hora


#faz a leitura de cada arquivo e concatena
def concat_years(code, file_list, output_dir):
    lla_data = [] #geographic data
    sensor_data = [] #sensor data
    for file in file_list:
        #leitura dos sensores
        df = pd.read_csv(file, skiprows=8, encoding='latin_1', sep=';', decimal=',', usecols=colunas_importantes, index_col=[0, 1], na_values=[-9999])
        df = df.rename_axis(['Data', 'Hora']).rename(columns={ df.columns[1]: 'RADIACAO GLOBAL (KJ/m²)'})
        sensor_data.append(df)
        
        #leitura dos dados geograficos
        md = pd.read_csv(file, encoding='latin_1', sep=';', decimal=',', skiprows=4, nrows=3, header=None, usecols=[1], na_values=['F'], names=[df.index[0][0][:4]])
        lla_data.append(md)
    
    #concatena os dados geograficos
    md = pd.concat(lla_data, axis=1, copy=False)
    md.index = ['LATITUDE', 'LONGITUDE', 'ALTITUDE']
    
    #concatena, arruma os index e escreve os dados para um arquivo
    df = pd.concat(sensor_data, copy=False).replace(-9999, np.nan)
    df.index = df.index.map(formata_data_hora)
    
    #salva todas as leituras em um arquivo por sensor
    last = '_'.join(file_list[-1].split('_')[1:5])
    path = f'{output_dir}/{last}.csv'
    md.to_csv(path, sep=';')
    df.sort_index().to_csv(path, sep=';', mode='a')

    
def unify_data(inmet_dir, output_dir):
    #salvando o nome de todas as planilhas
    arquivos = []
    for folder in listdir(inmet_dir):
        c = f'{inmet_dir}/{folder}'
        if not path.isdir(c): continue
        if path.isdir(f'{c}/{folder}'): c = f'{c}/{folder}'
        arquivos += [f'{c}/{a}' for a in listdir(c) if a.endswith('.CSV')]
    
    #para cada codigo, concatena os anos e salva os metadados
    metadatas = []
    search_groups = lambda s: re.search('_([A-Z][0-9]{3})_', s).group(1)
    for k, grupo in groupby(sorted(arquivos, key=search_groups), search_groups):
        concat_years(k, sorted(grupo), output_dir)
        print(f'{k} OK')

In [15]:
 #retorna maximos e mínimos para cada coluna em todos os arquivos
def checking_bounds(path):
    tabelas = {
        'min': [], #minimo de cada coluna
        'max': [], #maximo de cada coluna
    }
      
    for file in listdir(path):
        data = pd.read_csv(path+file, sep=';', index_col = [0, 1], skiprows=4)
        tabelas['min'].append(data.min().rename(file[:-4]))
        tabelas['max'].append(data.max().rename(file[:-4]))
    
    for k, v in tabelas.items():
        df = pd.DataFrame(v)
        df.columns = [f'{k.upper()} - {c}' for c in df.columns]
        tabelas[k] = df
        
    colunas = [v.columns for k, v in tabelas.items()]
    colunas = [j for i in zip(*colunas) for j in i]
    return pd.concat(tabelas.values(), axis=1).reindex(columns=colunas).sort_index()

###dados.apply(lambda x: x.groupby(x.notna().cumsum()).cumcount().max()).rename(local)

In [16]:
#retorna os dias marcados com True se estavam off ou False caso contrário
def days_off(file):
    data = pd.read_csv(file, sep=';', index_col = [0, 1], skiprows=4) \
                        .isna().apply(lambda row: all(row), axis = 1) \
                        .groupby(level=0).apply(lambda group: sum(group) == 24)
    data.name = file[:-4].split('/')[-1]
    return data


# retorna porcentagem de dias que a estação ficou off no ano
def percentage_off_per_year(path):
    stations = pd.concat([days_off(path+file) for file in listdir(path)], axis=1)
    return stations.groupby(lambda x: x.split('-')[0] ).apply(lambda x: x.sum(min_count=1)/ len(x) ).T.sort_index()

In [17]:
if padronizar_dados:
    unify_data(path_inmet, path_output)

A001 OK
A002 OK
A003 OK
A005 OK
A011 OK
A012 OK
A013 OK
A014 OK
A015 OK
A016 OK
A017 OK
A022 OK
A023 OK
A024 OK
A025 OK
A026 OK
A027 OK
A028 OK
A029 OK
A031 OK
A032 OK
A033 OK
A034 OK
A035 OK
A036 OK
A037 OK
A042 OK
A045 OK
A046 OK
A047 OK
A056 OK
A702 OK
A703 OK
A704 OK
A709 OK
A710 OK
A717 OK
A719 OK
A720 OK
A721 OK
A722 OK
A723 OK
A724 OK
A730 OK
A731 OK
A732 OK
A742 OK
A743 OK
A749 OK
A750 OK
A751 OK
A752 OK
A754 OK
A756 OK
A757 OK
A758 OK
A759 OK
A760 OK
A761 OK
A901 OK
A902 OK
A903 OK
A904 OK
A905 OK
A906 OK
A907 OK
A908 OK
A909 OK
A910 OK
A911 OK
A912 OK
A913 OK
A914 OK
A915 OK
A917 OK
A919 OK
A920 OK
A921 OK
A922 OK
A923 OK
A924 OK
A926 OK
A927 OK
A928 OK
A929 OK
A930 OK
A931 OK
A932 OK
A933 OK
A934 OK
A935 OK
A936 OK
A937 OK
A941 OK
A942 OK
A943 OK
A944 OK
S701 OK
S702 OK
S703 OK
S704 OK
S705 OK
S706 OK
S707 OK
S708 OK
S709 OK
S710 OK
S711 OK
S712 OK
S713 OK
S714 OK
S715 OK
S716 OK
S717 OK


In [18]:
bounds = checking_bounds('datasets/agregados/')
bounds.to_csv('datasets/extremos.csv', sep=';')
bounds

Unnamed: 0,"MIN - PRECIPITAÇÃO TOTAL, HORÁRIO (mm)","MAX - PRECIPITAÇÃO TOTAL, HORÁRIO (mm)",MIN - RADIACAO GLOBAL (KJ/m²),MAX - RADIACAO GLOBAL (KJ/m²),MIN - TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C),MAX - TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C),MIN - TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C),MAX - TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C),MIN - UMIDADE REL. MAX. NA HORA ANT. (AUT) (%),MAX - UMIDADE REL. MAX. NA HORA ANT. (AUT) (%),MIN - UMIDADE REL. MIN. NA HORA ANT. (AUT) (%),MAX - UMIDADE REL. MIN. NA HORA ANT. (AUT) (%),"MIN - VENTO, VELOCIDADE HORARIA (m/s)","MAX - VENTO, VELOCIDADE HORARIA (m/s)"
CO_DF_A001_BRASILIA,0.0,37.6,0.0,4205.3,9.4,36.5,8.5,34.4,12.0,97.0,10.0,97.0,0.1,7.1
CO_DF_A042_BRAZLANDIA,0.0,96.0,0.0,4168.7,11.6,35.8,10.5,34.3,12.0,97.0,10.0,96.0,0.0,9.7
CO_DF_A045_AGUAS EMENDADAS,0.0,42.4,0.0,4265.0,6.9,37.8,6.0,35.6,11.0,96.0,9.0,96.0,0.1,5.9
CO_DF_A046_GAMA (PONTE ALTA),0.0,41.2,0.0,4017.2,7.7,37.3,6.5,35.7,12.0,95.0,9.0,95.0,0.0,9.1
CO_DF_A047_PARANOA (COOPADF),0.0,70.2,0.0,4155.9,10.0,37.1,9.1,35.3,17.0,100.0,14.0,100.0,0.1,8.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CO_MT_A937_PONTES E LACERDA,0.0,60.0,0.0,3950.4,8.1,40.5,7.8,38.3,13.0,95.0,11.0,95.0,0.0,8.2
CO_MT_A941_CACERES,0.0,35.2,0.0,4279.1,10.1,42.2,9.7,40.9,11.0,100.0,9.0,100.0,0.1,7.0
CO_MT_A942_SAO JOSE DO XINGU,0.0,53.6,0.0,4269.8,14.6,39.5,13.4,38.7,15.0,99.0,13.0,99.0,0.1,10.1
CO_MT_A943_SERRA NOVA DOURADA,0.0,67.2,0.0,4150.9,18.6,40.1,18.2,38.1,,,,,0.1,14.5


In [19]:
df = percentage_off_per_year('datasets/agregados/')
df.index = df.index.str.split('_', expand=True).set_names(['REGIÃO', 'ESTADO', 'CODIGO', 'NOME'])
df = df.reorder_levels(['REGIÃO', 'ESTADO', 'NOME', 'CODIGO'])
df.to_csv(f'datasets/%_dias_off.csv', sep=';')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,01,02,03,04,05,06,07,08,09,10,...,22,23,24,25,26,27,28,29,30,31
REGIÃO,ESTADO,NOME,CODIGO,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
CO,DF,BRASILIA,A001,,,,,,,,,,,...,,,,,,,,,,
CO,DF,BRAZLANDIA,A042,,,,,,,,,,,...,,,,,,,,,,
CO,DF,AGUAS EMENDADAS,A045,,,,,,,,,,,...,,,,,,,,,,
CO,DF,GAMA (PONTE ALTA),A046,,,,,,,,,,,...,,,,,,,,,,
CO,DF,PARANOA (COOPADF),A047,,,,,,,,,,,...,,,,,,,,,,
CO,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CO,MT,PONTES E LACERDA,A937,,,,,,,,,,,...,,,,,,,,,,
CO,MT,CACERES,A941,,,,,,,,,,,...,,,,,,,,,,
CO,MT,SAO JOSE DO XINGU,A942,,,,,,,,,,,...,,,,,,,,,,
CO,MT,SERRA NOVA DOURADA,A943,,,,,,,,,,,...,,,,,,,,,,


In [21]:
path = 'datasets/agregados/'
todos = []
for file in listdir(path):
    data = pd.read_csv(path+file, sep=';', index_col = [0], nrows=3).iloc[:, -1]
    loc = file[:-4].split('_')
    data['REGIÃO'] = loc[0]
    data['ESTADO'] = loc[1]
    data['NOME'] = loc[3]
    data['CODIGO'] = loc[2]
    #data = data.rename(loc[2])
    todos.append(data)
coords = pd.concat(todos, axis=1).T.set_index(['REGIÃO', 'ESTADO', 'NOME', 'CODIGO'])
coords

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,LATITUDE,LONGITUDE,ALTITUDE
REGIÃO,ESTADO,NOME,CODIGO,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CO,DF,BRASILIA,A001,-15.789343,-47.925756,1160.96
CO,DF,BRAZLANDIA,A042,-15.599722,-48.131111,1143.0
CO,DF,AGUAS EMENDADAS,A045,-15.596491,-47.625801,1030.36
CO,DF,GAMA (PONTE ALTA),A046,-15.935278,-48.1375,990.0
CO,DF,PARANOA (COOPADF),A047,-16.012222,-47.557417,1043.0
CO,...,...,...,...,...,...
CO,MT,PONTES E LACERDA,A937,-15.234582,-59.346215,272.53
CO,MT,CACERES,A941,-16.074722,-57.693056,123.53
CO,MT,SAO JOSE DO XINGU,A942,-10.484167,-52.3725,300.0
CO,MT,SERRA NOVA DOURADA,A943,-11.987778,-51.426111,441.0


In [22]:
df2 = pd.concat([coords, df], axis=1)
df2.to_csv('')

FileNotFoundError: [Errno 2] No such file or directory: ''

In [24]:
funcs = {
    'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)': lambda x: x.sum(min_count=1),
    'RADIACAO GLOBAL (KJ/m²)': 'mean',
    'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)': 'max' ,
    'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)': 'min' ,
    'UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)': 'max' ,
    'UMIDADE REL. MIN. NA HORA ANT. (AUT) (%)': 'min' ,
    'VENTO, VELOCIDADE HORARIA (m/s)': 'mean'
}

i = 0
for file in listdir(path_output):
    print(i)
    i += 1
    path_in = f'{path_output}/{file}'
    path_out = f'datasets/diarios/{file}'
    
    pd.read_csv(path_in, sep=';', nrows=3, index_col=[0]) \
            .to_csv(path_out, sep=';')
    
    pd.read_csv( path_in, sep=';', index_col=[0, 1], skiprows=4) \
            .groupby(level=0).apply(lambda group: group.agg(funcs)) \
            .to_csv(path_out, sep=';', mode='a')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113


In [25]:
#unir arquivos por região
path = 'datasets/diarios/'
todos = []
for file in listdir(path):
    path_file = f'{path}{file}'
    regiao, estado, codigo, nome = file[:-4].split('_')

    loc = pd.read_csv(path_file, sep=';', nrows=3, index_col=[0])
    df = pd.read_csv( path_file, sep=';', index_col=[0], skiprows=4)
    
    for ano in loc:
        indexes = df.index.str.startswith(ano)
        for j in ['LATITUDE', 'LONGITUDE', 'ALTITUDE']:
            df.loc[indexes, j] = loc[ano][j]
        
    df['REGIAO'] = regiao
    df['ESTADO'] = estado
    df['CODIGO'] = codigo
    df['NOME'] = nome

    todos.append( df.set_index(['REGIAO', 'ESTADO', 'CODIGO', 'NOME', 'LATITUDE', 'LONGITUDE', 'ALTITUDE'], append=True) )
pd.concat(todos).to_csv('datasets/consolidado.csv', sep=';')