In [1]:
import json
import re
from csv import writer
from io import StringIO
from urllib.parse import urlencode

import requests
from lxml import etree

In [3]:
# Base endpoint of the SENAMHI station-data page. The query string built by
# queryString() is appended directly after the trailing "?".
url = "https://www.senamhi.gob.pe/mapas/mapa-estaciones-2/_dato_esta_tipo02.php?"

# Stations to download, one record per department. "departamento" is written
# as the first CSV column; "id", "oldId", "cate_esta" and "alt" are sent as
# request parameters. All values are kept as strings.
# "alt" is presumably the station altitude in metres -- TODO confirm.
estaciones = [
    {
        "departamento": "LIMA",#
        "id": "111290",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "553",
    },
    {
        "departamento": "JUNIN",#
        "id": "472DF5D6",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "4648",
    },
    {
        "departamento": "HUANUCO",#
        "id": "109094",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EAMA",
        "alt": "3574",
    },
    {
        "departamento": "PASCO",  # some months are missing
        "id": "110137",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EAMA",
        "alt": "3848",
    },
    {
        "departamento": "AYACUCHO",  # some months are missing
        "id": "114121",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "4203",
    },
    {
        # NOTE(review): "id" and "alt" are identical to the AYACUCHO record
        # above -- this looks like a copy-paste leftover. As written, both
        # departments issue the same requests, and any dict keyed by station
        # id keeps only one entry for the pair. Confirm the real APURIMAC
        # station id and altitude.
        "departamento": "APURIMAC",#
        "id": "114121",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "4203",
    },
    {
        "departamento": "AREQUIPA",#
        "id": "4729E39A",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "1498",
    },
    {
        "departamento": "HUANCAVELICA",#
        "id": "4720CDF8",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EAMA",
        "alt": "3886",
    },
    {
        "departamento": "ICA",#
        "id": "47255188",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EAMA",
        "alt": "324",
    },
    {
        "departamento": "LA LIBERTAD",#
        "id": "4727319A",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "4047",
    },
    {
        "departamento": "ANCASH",#
        "id": "47259496",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "3431",
    },
    {
        "departamento": "LAMBAYEQUE",#
        "id": "47E2F720",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "181",
    },
    {
        "departamento": "CUSCO",#
        "id": "472976F8",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "2921",
    },
    {
        "departamento": "CAJAMARCA",#
        "id": "4726A602",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "2622",
    },
    {
        "departamento": "MOQUEGUA",#
        "id": "4723F1BE",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "1420",
    },
    {
        "departamento": "PIURA",#
        "id": "472F7636",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "128",
    },
    {
        "departamento": "TACNA",#
        "id": "47E2D1CC",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "3100",
    },
    {
        "departamento": "TUMBES",#
        "id": "472F264A",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "113",
    },
    {
        "departamento": "AMAZONAS",  # data starts at 2018-04
        "id": "472C57D4",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EAMA",
        "alt": "400",
    },
    {
        "departamento": "SAN MARTIN",
        "id": "4724851A",
        "oldId": "",
        "estado": "AUTOMATICA",
        "cate_esta": "EMA",
        "alt": "882",
    },
]


In [8]:
def queryString(estacion, year, month):
    """Build the query string that requests one month of data for a station.

    Parameters
    ----------
    estacion : dict
        Station record. The keys "id", "oldId", "cate_esta" and "alt" are
        required. "estado" is used when present and defaults to
        "AUTOMATICA" otherwise.
    year : int
        Year of the requested period.
    month : int
        Month of the requested period (zero-padded to two digits).

    Returns
    -------
    str
        URL-encoded "key=value" pairs joined by "&", with no leading "?".

    Raises
    ------
    KeyError
        If one of the required station keys is missing.
    """
    params = [
        # The period filter is the year followed by the two-digit month.
        ("CBOFiltro", "{}{:02d}".format(year, month)),
        ("estaciones", estacion["id"]),
        ("t_e", "M"),
        ("estado", estacion.get("estado", "AUTOMATICA")),
        ("cod_old", estacion["oldId"]),
        ("cate_esta", estacion["cate_esta"]),
        ("soloAlt", estacion["alt"]),
    ]
    # urlencode escapes reserved characters, so a value containing "&", "="
    # or a space cannot corrupt the query string. A list of pairs keeps the
    # parameter order stable.
    return urlencode(params)


In [48]:
# Captures everything between the first '<table width="100' and the last
# 'table>' of the response body. DOTALL is passed as a compile flag: an inline
# "(?s)" placed in the middle of a pattern is an error on Python 3.11+.
reg = re.compile('(?<=(<table width="100)).*(?=(table>))', re.DOTALL)

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 60

# All scraped rows are accumulated in memory as CSV text.
output = StringIO()
csv_writer = writer(output)

csv_writer.writerow(
    [
        "DEPARTAMENTO",
        "FECHA",
        "HORA",
        "TEMPERATURA",
        "PRECIPITACIÓN(mm/hora)",
        "HUMEDAD(%)",
        "DIRECCION DEL VIENTO (°)",
        "VELOCIDAD DEL VIENTO (m/s)",
    ]
)

# Diagnostics keyed by station id:
#   no_data       -> "FECHA - HORA" stamps of rows whose third value is "S/D"
#   month_missing -> "year/month" periods that produced no data rows
no_data = {}
month_missing = {}
for estacion in estaciones:
    no_data[estacion["id"]] = []
    month_missing[estacion["id"]] = []
    for year in range(2018, 2023):
        for month in range(1, 13):
            # The download window ends with August 2022.
            if year == 2022 and month > 8:
                break

            getUrl = url + queryString(estacion, year, month)
            # NOTE(review): the HTTP status is not checked; an error page
            # without the data table is counted as a missing month below.
            response = requests.get(
                getUrl,
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=REQUEST_TIMEOUT,
            )
            body = response.text

            match = reg.search(body)
            if match is None:
                # No data table in the response: record the period and move on
                # instead of failing on `None.group(0)`.
                month_missing[estacion["id"]].append(str(year) + '/' + str(month))
                continue

            # The lookbehind excludes the opening tag from the match, so it is
            # prepended again before parsing.
            table = etree.HTML('<table width="100' + match.group(0)).find("body/table")
            # Strip comment nodes from the tree before walking it.
            etree.strip_tags(table, etree.Comment)

            filtered_rows = []
            new_row = []
            for row in table.getiterator():
                # Text of every direct child, with spaces and newlines removed.
                # An element without text yields "" rather than raising.
                values = [
                    (col.text or "").replace(" ", "").replace("\n", "")
                    for col in row
                ]
                # Elements with exactly one child contribute one cell value.
                if len(values) == 1:
                    new_row.append(values[0])

                # Seven collected values make up one output record.
                if len(new_row) == 7:
                    if new_row[2] == "S/D":
                        no_data[estacion["id"]].append(new_row[0] + ' - ' + new_row[1])
                    filtered_rows.append([estacion["departamento"], *new_row])
                    new_row = []

            # The first three groups are discarded (presumably the table's
            # header cells -- TODO confirm against a live response).
            filtered_rows = filtered_rows[3:]
            if len(filtered_rows) == 0:
                month_missing[estacion["id"]].append(str(year) + '/' + str(month))

            csv_writer.writerows(filtered_rows)


In [49]:
# Show, per station id, the "FECHA - HORA" stamps of the scraped rows whose
# third value was "S/D".
no_data

{'111290': ['2018/08/13 - 12:00',
  '2018/08/16 - 13:00',
  '2018/08/21 - 23:00',
  '2018/08/22 - 06:00',
  '2018/10/14 - 22:00',
  '2018/12/02 - 11:00',
  '2019/02/22 - 14:00',
  '2019/02/22 - 15:00',
  '2019/04/27 - 14:00',
  '2019/05/04 - 15:00',
  '2019/05/05 - 15:00',
  '2019/05/10 - 11:00',
  '2019/05/19 - 08:00',
  '2019/05/20 - 04:00',
  '2019/05/24 - 19:00',
  '2019/05/30 - 00:00',
  '2019/06/15 - 07:00',
  '2019/06/16 - 01:00',
  '2019/06/18 - 05:00',
  '2019/06/18 - 13:00',
  '2019/06/18 - 19:00',
  '2019/06/19 - 12:00',
  '2019/06/19 - 14:00',
  '2019/06/20 - 18:00',
  '2020/03/24 - 22:00',
  '2020/04/02 - 11:00',
  '2020/04/03 - 10:00',
  '2020/04/07 - 10:00',
  '2020/04/08 - 10:00',
  '2020/04/09 - 10:00',
  '2020/04/14 - 11:00',
  '2020/04/15 - 09:00',
  '2020/04/16 - 09:00',
  '2020/04/18 - 09:00',
  '2020/04/23 - 10:00',
  '2020/04/25 - 09:00',
  '2020/04/28 - 10:00',
  '2020/05/03 - 10:00',
  '2020/05/04 - 11:00',
  '2020/05/05 - 11:00',
  '2020/05/07 - 10:00',
  '202

In [50]:
# Show, per station id, the "year/month" periods for which no data rows were
# parsed.
month_missing

{'111290': [],
 '472DF5D6': [],
 '109094': [],
 '110137': ['2018/1',
  '2018/2',
  '2018/3',
  '2019/6',
  '2019/7',
  '2020/3',
  '2020/4',
  '2020/5',
  '2020/6',
  '2020/7',
  '2020/8'],
 '114121': ['2018/6',
  '2018/7',
  '2018/8',
  '2018/9',
  '2018/10',
  '2018/11',
  '2018/12',
  '2019/1',
  '2019/2',
  '2019/3',
  '2019/4',
  '2019/5',
  '2019/6',
  '2019/7',
  '2019/8',
  '2020/5',
  '2020/6',
  '2020/7',
  '2020/8',
  '2020/9',
  '2020/10',
  '2020/11'],
 '4729E39A': [],
 '4720CDF8': [],
 '47255188': [],
 '4727319A': [],
 '47259496': [],
 '47E2F720': [],
 '472976F8': [],
 '4726A602': [],
 '4723F1BE': [],
 '472F7636': [],
 '47E2D1CC': [],
 '472F264A': [],
 '472C57D4': ['2018/1', '2018/2'],
 '4724851A': []}

In [44]:
# Persist the collected rows. The csv writer already terminates every record
# with "\r\n", so the buffer is written verbatim: newline='' stops the file
# object from translating those line endings a second time, and UTF-8 keeps
# the non-ASCII header names intact whatever the platform default encoding is.
with open('test.csv', mode='w', newline='', encoding='utf-8') as f:
    f.write(output.getvalue())