In [100]:
import requests
import re
import json
from csv import writer 
from io import StringIO
from lxml import etree

In [87]:
url = "https://www.senamhi.gob.pe/mapas/mapa-estaciones-2/_dato_esta_tipo02.php?"
estaciones = [
    {
        "departamento": "LIMA",
        "id": "113140",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "PE",
        "soloAlt": "302",
    },
    {
        "departamento": "JUNIN",
        "id": "112037",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "3186",
    },
    {
        "departamento": "HUANUCO",
        "id": "109021",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "3032",
    },
    {
        "departamento": "PASCO",
        "id": "110037",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "4357",
    },
    {
        "departamento": "AYACUCHO",
        "id": "100127",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "3247",
    },
    {
        "departamento": "APURIMAC",
        "id": "113225",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "MAP",
        "soloAlt": "2772",
    },
    {
        "departamento": "AREQUIPA",
        "id": "116017",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "MAP",
        "soloAlt": "2326",
    },
    {
        "departamento": "HUANCAVELICA",
        "id": "112065",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "3303",
    },
    {
        "departamento": "ICA",
        "id": "114008",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "MAP",
        "soloAlt": "407",
    },
    {
        "departamento": "LA LIBERTAD",
        "id": "108045",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "2892",
    },
    {
        "departamento": "ANCASH",
        "id": "109018",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "2466",
    },
    {
        "departamento": "LAMBAYEQUE",
        "id": "106047",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "78",
    },
    {
        "departamento": "CUSCO",
        "id": "100044",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "MAP",
        "soloAlt": "3214",
    },
    {
        "departamento": "CAJAMARCA",
        "id": "107028",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "MAP",
        "soloAlt": "2673",
    },
    {
        "departamento": "MOQUEGUA",
        "id": "116021",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "3109",
    },
    {
        "departamento": "PIURA",
        "id": "105064",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "2232",
    },
    {
        "departamento": "TACNA",
        "id": "117003",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CP",
        "soloAlt": "785",
    },
    {
        "departamento": "TUMBES",
        "id": "103040",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "50",
    },
    {
        "departamento": "AMAZONAS",
        "id": "106011",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "2442",
    },
    {
        "departamento": "SAN MARTIN",
        "id": "106014",
        "oldId": "",
        "estado": "REAL",
        "cate_esta": "CO",
        "soloAlt": "879",
    },
]


In [88]:
def queryString(estacion, year, month):
    dataDate = str(year) + str(month) if month > 9 else str(year) + "0" + str(month)
    return "CBOFiltro={date}&estaciones={estacionId}&t_e=M&estado=REAL&cod_old={estacionOldId}&cate_esta={cate_esta}&soloAlt={soloAlt}".format(
        date=dataDate,
        estacionId=str(estacion["id"]),
        estacionOldId=str(estacion["oldId"]),
        soloAlt=str(estacion["soloAlt"]),
        cate_esta=str(estacion["cate_esta"]),
    )


In [98]:
from types import NoneType

reg = re.compile('(?<=(<table width="100))(?s).*(?=(table>))')

output = StringIO()
csv_writer = writer(output)

csv_writer.writerow(
    [
        "DEPARTAMENTO",
        "FECHA",
        "TEMPERATURA MAX",
        "TEMPERATURA MIN",
        "HUMEDADRELATIVA(%)",
        "PRECIPITACIÓN(mm/día)",
    ]
)
no_data = {}
for estacion in estaciones:
    no_data[estacion["id"]] = []
    for year in range(2017, 2023):
        for month in range(1, 13):
            if year == 2022 and month > 8:
                break

            getUrl = url + queryString(estacion, 2022, 8)
            response = requests.get(getUrl)
            body = response.text

            table = etree.HTML(
                '<table width="100'
                + reg.search(body).group(0).replace(" ", "").replace("\n", "")
            ).find("body/table")
            etree.strip_tags(table, etree.Comment)

            filtered_rows = []
            new_row = []
            for row in table.getiterator():
                values = [col.text for col in row]
                if (
                    len(values) == 1
                    and type(values[0]) != NoneType
                    and values[0] != "TEMPERATURA(°C)"
                    and values[0] != "TOTAL"
                ):
                    new_row.append(values[0])
                    if values[0] == "S/D" and new_row[0] not in no_data[estacion["id"]]:
                        no_data[estacion["id"]].append(new_row[0])

                if len(new_row) == 5:
                    filtered_rows.append([estacion["departamento"], *new_row])
                    new_row = []

            # filtered_rows[0] = [
            #     "DEPARTAMENTO",
            #     "AÑO/MES/DÍA",
            #     "MAX",
            #     "MIN",
            #     "HUMEDADRELATIVA(%)",
            #     "PRECIPITACIÓN(mm/día)",
            # ]

            filtered_rows = filtered_rows[1:]

            for row in filtered_rows:
                csv_writer.writerow(row)


In [106]:
no_data

{'113140': [],
 '112037': [],
 '109021': ['2022-08-31'],
 '110037': [],
 '100127': [],
 '113225': [],
 '116017': [],
 '112065': ['2022-08-12'],
 '114008': [],
 '108045': [],
 '109018': [],
 '106047': ['2022-08-06'],
 '100044': [],
 '107028': [],
 '116021': [],
 '105064': [],
 '117003': [],
 '103040': [],
 '106011': [],
 '106014': []}

In [None]:
output.seek(0)
with open('test.csv', mode='w') as f:
    for line in output.getvalue().split('\n'):
        f.write(line)