In [1]:
import requests
import re
import json
from csv import writer 
from io import StringIO
from lxml import etree

In [2]:
# Base endpoint; it already ends in "?" so a query string can be appended directly.
url = "https://www.senamhi.gob.pe/mapas/mapa-estaciones-2/_dato_esta_tipo02.php?"

# One dict per station, expanded from compact (departamento, id, cate_esta, alt)
# tuples. "oldId" is empty and "estado" is "REAL" for every entry.
estaciones = [
    {
        "departamento": departamento,
        "id": id_estacion,
        "oldId": "",
        "estado": "REAL",
        "cate_esta": categoria,
        "alt": alt,
    }
    for departamento, id_estacion, categoria, alt in (
        ("LIMA", "113140", "PE", "302"),
        ("JUNIN", "112037", "CO", "3186"),
        ("HUANUCO", "109021", "CO", "3032"),
        ("PASCO", "110037", "CO", "4357"),
        ("AYACUCHO", "100127", "CO", "3247"),
        ("APURIMAC", "113225", "MAP", "2772"),
        ("AREQUIPA", "116017", "MAP", "2326"),
        ("HUANCAVELICA", "112065", "CO", "3303"),
        ("ICA", "114008", "MAP", "407"),
        ("LA LIBERTAD", "108045", "CO", "2892"),
        ("ANCASH", "109018", "CO", "2466"),
        ("LAMBAYEQUE", "106047", "CO", "78"),
        ("CUSCO", "100044", "MAP", "3214"),
        ("CAJAMARCA", "107028", "MAP", "2673"),
        ("MOQUEGUA", "116021", "CO", "3109"),
        ("PIURA", "105064", "CO", "2232"),
        ("TACNA", "117003", "CP", "785"),
        ("TUMBES", "103040", "CO", "50"),
        ("AMAZONAS", "106011", "CO", "2442"),
        ("SAN MARTIN", "106014", "CO", "879"),
    )
]

In [3]:
def queryString(estacion, year, month):
    """Build the query string for one station and one month.

    Args:
        estacion: Station dict. Must provide "id", "oldId" and "cate_esta",
            plus either "soloAlt" or "alt".
        year: Year as an integer (e.g. 2017).
        month: Month as an integer; formatted as two digits.

    Returns:
        The query string (without a leading "?"), with ``CBOFiltro`` set to
        the year and month concatenated as ``YYYYMM``.

    Raises:
        KeyError: If a required key is missing from ``estacion``.
    """
    # Zero-pad the month so that e.g. (2017, 3) becomes "201703".
    dataDate = f"{year}{month:02d}"

    # The station dicts in this notebook carry "alt" and no "soloAlt" key, so
    # reading estacion["soloAlt"] unconditionally raised KeyError. An explicit
    # "soloAlt" still wins when present.
    # NOTE(review): assumes "alt" is the value the endpoint expects as soloAlt - confirm.
    soloAlt = estacion["soloAlt"] if "soloAlt" in estacion else estacion["alt"]

    return "CBOFiltro={date}&estaciones={estacionId}&t_e=M&estado=REAL&cod_old={estacionOldId}&cate_esta={cate_esta}&soloAlt={soloAlt}".format(
        date=dataDate,
        estacionId=str(estacion["id"]),
        estacionOldId=str(estacion["oldId"]),
        soloAlt=str(soloAlt),
        cate_esta=str(estacion["cate_esta"]),
    )


In [14]:
# `from types import NoneType` only exists on Python >= 3.10. Defining the name
# locally keeps it available to any other cell that may reference it.
NoneType = type(None)

# Grab everything between the first `<table width="100` and the last `table>`.
# DOTALL is passed as a compile flag because an inline `(?s)` placed in the
# middle of a pattern is rejected by Python 3.11+.
reg = re.compile('(?<=(<table width="100)).*(?=(table>))', re.DOTALL)

# The CSV is accumulated in memory; a separate cell writes it to disk.
output = StringIO()
csv_writer = writer(output)

csv_writer.writerow(
    [
        "DEPARTAMENTO",
        "FECHA",
        "TEMPERATURA MAX",
        "TEMPERATURA MIN",
        "HUMEDADRELATIVA(%)",
        "PRECIPITACIÓN(mm/día)",
    ]
)

# station id -> list of dates (first scraped value of a row) flagged as "S/D".
no_data = {}
for estacion in estaciones:
    no_data[estacion["id"]] = []
    for year in range(2017, 2023):
        for month in range(1, 13):
            # Stop after August 2022.
            if year == 2022 and month > 8:
                break

            getUrl = url + queryString(estacion, year, month)
            # A timeout keeps one stalled request from hanging the whole run;
            # HTTP errors are raised instead of being parsed as data.
            response = requests.get(getUrl, timeout=60)
            response.raise_for_status()
            body = response.text

            match = reg.search(body)
            if match is None:
                # No table in the response: skip this month instead of
                # crashing on `None.group(0)`.
                print(f"No table found for station {estacion['id']} ({year}-{month:02d}); skipped")
                continue

            # Spaces and newlines are stripped from the markup before parsing;
            # the cell texts compared below ("TEMPERATURA(°C)", "S/D", ...)
            # are therefore whitespace-free as well.
            table = etree.HTML(
                '<table width="100'
                + match.group(0).replace(" ", "").replace("\n", "")
            ).find("body/table")
            if table is None:
                print(f"Unparseable table for station {estacion['id']} ({year}-{month:02d}); skipped")
                continue
            etree.strip_tags(table, etree.Comment)

            filtered_rows = []
            new_row = []
            # iter() replaces the deprecated getiterator(); both walk the
            # element and all of its descendants in document order.
            for row in table.iter():
                values = [col.text for col in row]
                # Keep the text of elements that have exactly one child with
                # text, ignoring the "TEMPERATURA(°C)" and "TOTAL" labels.
                if (
                    len(values) == 1
                    and values[0] is not None
                    and values[0] != "TEMPERATURA(°C)"
                    and values[0] != "TOTAL"
                ):
                    new_row.append(values[0])

                # Every five collected values form one output row.
                if len(new_row) == 5:
                    # NOTE(review): by the CSV header order, new_row[2] and
                    # new_row[3] are TEMPERATURA MIN and HUMEDADRELATIVA, not
                    # the two temperature columns - confirm this is intended.
                    if (new_row[2] == "S/D" or new_row[3] == "S/D") and new_row[0] not in no_data[estacion["id"]]:
                        no_data[estacion["id"]].append(new_row[0])
                    filtered_rows.append([estacion["departamento"], *new_row])
                    new_row = []

            # Drop the first collected row of each table (presumably the
            # table's own column titles); the CSV header was written once above.
            filtered_rows = filtered_rows[1:]

            for row in filtered_rows:
                csv_writer.writerow(row)


In [15]:
# Display, per station id, the dates that the scraping loop flagged as "S/D".
no_data

{'113140': ['2017-02-19',
  '2017-03-25',
  '2018-09-29',
  '2019-11-03',
  '2019-12-30',
  '2020-01-22',
  '2020-02-13',
  '2020-03-08',
  '2020-03-11',
  '2020-03-15',
  '2020-03-18',
  '2020-03-31',
  '2020-04-01',
  '2020-04-02',
  '2020-04-04',
  '2020-04-05',
  '2020-04-06',
  '2020-04-07',
  '2020-04-14',
  '2020-04-15',
  '2020-04-16',
  '2020-04-17',
  '2020-04-18',
  '2020-04-19',
  '2020-04-20',
  '2020-04-21',
  '2020-04-22',
  '2020-04-23',
  '2020-04-24',
  '2020-04-25',
  '2020-04-26',
  '2020-04-27',
  '2020-04-28',
  '2020-04-29',
  '2020-04-30',
  '2020-05-01',
  '2020-05-02',
  '2020-05-03',
  '2020-05-04',
  '2020-05-05',
  '2020-05-06',
  '2020-05-07',
  '2020-05-08',
  '2020-05-09',
  '2020-05-10',
  '2020-05-11',
  '2020-05-12',
  '2020-05-13',
  '2020-05-14',
  '2020-05-15',
  '2020-05-16',
  '2020-05-17',
  '2020-05-18',
  '2020-05-19',
  '2020-05-20',
  '2020-05-21',
  '2020-05-22',
  '2020-05-23',
  '2020-05-24',
  '2020-05-25',
  '2020-05-26',
  '2020-05-28'

In [13]:
# Dump the in-memory CSV to disk verbatim.
# The previous version split the buffer on '\n' and wrote each piece without a
# terminator, so rows ended up separated only by the '\r' left over from the
# csv writer's '\r\n' line endings.
# newline='' stops the file object from translating those '\r\n' terminators;
# utf-8 is set explicitly because the header contains non-ASCII characters.
output.seek(0)
with open('test.csv', mode='w', newline='', encoding='utf-8') as f:
    f.write(output.getvalue())