In [1]:
import requests
import datetime as dt
import base64
import time as tm
from zipfile import ZipFile
import logging
import boto3
from botocore.exceptions import ClientError
from azure.storage.blob import BlobServiceClient
import os
from dotenv import load_dotenv, find_dotenv

In [2]:
# Locate the .env file once, show which file was picked up, and load it.
# The path found by find_dotenv() is reused instead of searching a second time;
# as the last expression of the cell, load_dotenv's return value is displayed.
env_file = find_dotenv()
print(env_file)
load_dotenv(env_file)

/home/david/eafit/Trabajo1_Almdatos/.env


True

In [3]:
def get_current_day_data():
    """Return the current local date as a ``(day, month, year)`` tuple of ints."""
    now = dt.datetime.now()
    return now.day, now.month, now.year

In [4]:
def get_previous_day_formatted():
    """Return yesterday's date (by the local clock) as a ``dd-mm-yyyy`` string."""
    one_day = dt.timedelta(days=1)
    yesterday = dt.datetime.now() - one_day
    return yesterday.strftime("%d-%m-%Y")

In [5]:
def upload_file_to_s3(file_name: str, bucket: str, object_name=None) -> bool:
    """Upload a local file to an S3 bucket.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Key to store the file under. When omitted, the base name
            of ``file_name`` is used.

    Returns:
        True if the upload call completed, False if it raised an AWS error
        (the error is logged with ``logging.error``).
    """
    # Local import keeps this block self-contained; boto3 itself is already
    # imported at the top of the notebook.
    from boto3.exceptions import S3UploadFailedError

    if object_name is None:
        object_name = os.path.basename(file_name)

    s3_client = boto3.client("s3")
    try:
        s3_client.upload_file(
            file_name,
            bucket,
            object_name,
            ExtraArgs={"ACL": "bucket-owner-full-control"},
        )
    # The managed upload re-raises client errors as S3UploadFailedError, so
    # catching ClientError alone would let failed uploads escape as exceptions.
    except (ClientError, S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True

In [6]:
def upload_file_to_azblob(file_name: str, container_name: str, object_name=None) -> bool:
    """Upload a local file to an Azure Blob Storage container.

    The storage account is taken from the ``AZURE_STORAGE_CONNECTION_STRING``
    environment variable.

    Args:
        file_name: Path of the local file to upload.
        container_name: Name of the destination container.
        object_name: Blob name to store the file under. When omitted, the base
            name of ``file_name`` is used.

    Returns:
        True if the upload call completed, False if the connection string is
        not configured or the Azure SDK raised an error (both are logged).
    """
    if object_name is None:
        object_name = os.path.basename(file_name)

    connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
    if not connection_string:
        # Without this guard None would be handed to the client factory.
        logging.error(
            "AZURE_STORAGE_CONNECTION_STRING is not set; cannot upload %s", file_name
        )
        return False

    # Imported lazily, only once an upload is actually going to be attempted.
    from azure.core.exceptions import AzureError

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    blob_client = blob_service_client.get_blob_client(
        container=container_name, blob=object_name
    )
    try:
        with open(file_name, "rb") as data:
            # NOTE(review): no overwrite flag is passed, so uploading a blob
            # name that already exists is expected to surface as an error here
            # - confirm whether overwriting is wanted.
            blob_client.upload_blob(data=data)
    # Azure SDK failures derive from AzureError; botocore's ClientError (caught
    # previously) is never raised by this client.
    except AzureError as e:
        logging.error(e)
        return False
    return True

In [7]:
def extract_file(file_name: str, output_path: str, member: str = "excel.csv.csv"):
    """Extract a single member of a zip archive into a directory.

    Args:
        file_name: Path of the zip archive to read.
        output_path: Directory the member is extracted into.
        member: Name of the archive member to extract. Defaults to
            ``"excel.csv.csv"``.

    Raises:
        KeyError: If ``member`` is not present in the archive.
    """
    # The context manager closes the archive, so no explicit close() is needed.
    with ZipFile(file_name, "r") as zip_file:
        zip_file.extract(member, output_path)

In [8]:
def submit_job(
    field: str, label: str, initial_date=None, final_date=None, timeout: float = 60
):
    """Submit a download job to the DescargarArchivo GPServer endpoint.

    The request filters on the given parameter/label for three hard-coded
    station ids (26205080, 27015330 and 27010810) within the requested date
    range.

    Args:
        field: Value sent as ``IdParametro`` (e.g. ``"TEMPERATURA"``).
        label: Value sent as ``Etiqueta`` (e.g. ``"TMX_CON"``).
        initial_date: Start date as a ``"dd-mm-yyyy"`` string. Falls back to
            the current day when empty or None.
        final_date: End date as a ``"dd-mm-yyyy"`` string. Falls back to the
            current day when empty or None.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The ``jobId`` reported by the server.

    Raises:
        KeyError: If the server response contains no ``jobId``.
    """
    submit_job_url = "http://dhime.ideam.gov.co/server/rest/services/AtencionCiudadano/DescargarArchivo/GPServer/DescargarArchivo/submitJob"
    headers = {
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded",
        "Origin": "http://dhime.ideam.gov.co",
        "Referer": "http://dhime.ideam.gov.co/atencionciudadano/",
        "Sec-GPC": "1",
    }

    # Dates arrive as "dd-mm-yyyy"; a missing date means "today".
    if not initial_date:
        initial_day, initial_month, initial_year = get_current_day_data()
    else:
        initial_day, initial_month, initial_year = map(int, initial_date.split("-"))
    if not final_date:
        final_day, final_month, final_year = get_current_day_data()
    else:
        final_day, final_month, final_year = map(int, final_date.split("-"))

    # Pre-encoded query string; it is sent as-is, so it must not be re-encoded.
    data = f"f=json&Filtro=sort%3D%26filter%3D((IdParametro~eq~%27{field}%27~and~Etiqueta~eq~%27{label}%27~and~IdEstacion~eq~%2726205080%27)~or~(IdParametro~eq~%27{field}%27~and~Etiqueta~eq~%27{label}%27~and~IdEstacion~eq~%2727015330%27)~or~(IdParametro~eq~%27{field}%27~and~Etiqueta~eq~%27{label}%27~and~IdEstacion~eq~%2727010810%27))%26group%3D%26fechaInicio%3D{initial_year}-{initial_month}-{initial_day}T05%253A00%253A00.000Z%26fechaFin%3D{final_year}-{final_month}-{final_day}T05%253A00%253A00.000Z%26mostrarGrado%3Dtrue%26mostrarCalificador%3Dtrue%26mostrarNivelAprobacion%3Dtrue&Items=%5B%7B%22IdParametro%22%3A%22{field}%22%2C%22Etiqueta%22%3A%22{label}%22%2C%22EsEjeY1%22%3Afalse%2C%22EsEjeY2%22%3Afalse%2C%22EsTipoLinea%22%3Afalse%2C%22EsTipoBarra%22%3Afalse%2C%22TipoSerie%22%3A%22Estandard%22%2C%22Calculo%22%3A%22%22%7D%2C%7B%22IdParametro%22%3A%22{field}%22%2C%22Etiqueta%22%3A%22{label}%22%2C%22EsEjeY1%22%3Afalse%2C%22EsEjeY2%22%3Afalse%2C%22EsTipoLinea%22%3Afalse%2C%22EsTipoBarra%22%3Afalse%2C%22TipoSerie%22%3A%22Estandard%22%2C%22Calculo%22%3A%22%22%7D%2C%7B%22IdParametro%22%3A%22{field}%22%2C%22Etiqueta%22%3A%22{label}%22%2C%22EsEjeY1%22%3Afalse%2C%22EsEjeY2%22%3Afalse%2C%22EsTipoLinea%22%3Afalse%2C%22EsTipoBarra%22%3Afalse%2C%22TipoSerie%22%3A%22Estandard%22%2C%22Calculo%22%3A%22%22%7D%5D"

    print(
        f"Retrieving information for field {field} from {initial_day}-{initial_month}-{initial_year} to {final_day}-{final_month}-{final_year}..."
    )

    response = requests.get(
        f"{submit_job_url}?{data}",
        headers=headers,
        verify=False,
        timeout=timeout,
    )

    print(response.url)

    response_json = response.json()
    if "jobId" not in response_json:
        # Same exception type as a plain lookup, but with the server's reply
        # attached so the failure can be diagnosed.
        raise KeyError(f"submitJob response has no 'jobId': {response_json}")
    job_id = response_json["jobId"]
    job_status = response_json["jobStatus"]

    print(f"Job ID: {job_id}")
    print(f"Job Status: {job_status}")
    return job_id

In [9]:
def verify_status_job(job_id: str, timeout: float = 60):
    """Query the server for the current status of a submitted job.

    Args:
        job_id: Identifier returned when the job was submitted.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The ``jobStatus`` string from the server response (also printed).

    Raises:
        KeyError: If the server response contains no ``jobStatus``.
    """
    headers = {
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://dhime.ideam.gov.co/atencionciudadano/",
        "Sec-GPC": "1",
    }

    # The current epoch second is sent as the dojo.preventCache query value.
    timestamp = int(tm.time())

    response = requests.get(
        f"http://dhime.ideam.gov.co/server/rest/services/AtencionCiudadano/DescargarArchivo/GPServer/DescargarArchivo/jobs/{job_id}?f=json&dojo.preventCache={timestamp}",
        headers=headers,
        verify=False,
        timeout=timeout,
    )

    response_get_status_json = response.json()
    if "jobStatus" not in response_get_status_json:
        # Same exception type as a plain lookup, but with the server's reply
        # attached so the failure can be diagnosed.
        raise KeyError(
            f"Job status response has no 'jobStatus': {response_get_status_json}"
        )
    job_status = response_get_status_json["jobStatus"]
    print(f"Job status: {job_status}")
    return job_status

In [10]:
def download_file_from_api(job_id: str, zip_file_name="datos.zip", timeout: float = 60):
    """Download the ``Archivo`` result of a job and save it as a zip file.

    The ``value`` field of the response is base64-decoded and written to
    ``zip_file_name``.

    Args:
        job_id: Identifier of the job whose result is fetched.
        zip_file_name: Path the decoded content is written to.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        True if the file was written; False if the server reported no data,
        the response carried no ``value``, or decoding/writing failed.
    """
    headers = {
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://dhime.ideam.gov.co/atencionciudadano/",
        "Sec-GPC": "1",
    }

    # The current epoch second is sent as the dojo.preventCache query value.
    timestamp = int(tm.time())

    params = {"f": "json", "returnType": "data", "dojo.preventCache": timestamp}

    response = requests.get(
        f"http://dhime.ideam.gov.co/server/rest/services/AtencionCiudadano/DescargarArchivo/GPServer/DescargarArchivo/jobs/{job_id}/results/Archivo",
        params=params,
        headers=headers,
        verify=False,
        timeout=timeout,
    )

    print(response.url)

    response_json_get_file = response.json()
    if "value" not in response_json_get_file:
        # e.g. the result is not available; report it instead of crashing.
        print(f"Error: response has no 'value': {response_json_get_file}")
        return False
    file_content = response_json_get_file["value"]

    # Messages the service places in "value" when there is nothing to download.
    no_data_messages = (
        "Sequence contains no elements",
        "No hay información para el rango seleccionado",
    )
    if file_content in no_data_messages:
        print("No data available")
        return False

    try:
        decoded_file_content = base64.b64decode(file_content)
        with open(zip_file_name, "wb") as f:
            f.write(decoded_file_content)
        return True
    # ValueError/TypeError: content is not decodable base64;
    # OSError: the zip file could not be written.
    except (ValueError, TypeError, OSError) as e:
        print(f"Error: {e}")
        return False

In [11]:
def extract_and_upload_file(
    field: str,
    label: str,
    initial_date=None,
    final_date=None,
    poll_interval: float = 15,
    max_wait: float = 300,
):
    """Run one download job end to end and publish the resulting CSV.

    Submits the job, waits for it to finish, downloads and unzips the result,
    then uploads the extracted CSV to both the S3 bucket and the Azure blob
    container under the same object name.

    Args:
        field: Parameter id passed to ``submit_job``.
        label: Label passed to ``submit_job``.
        initial_date: Start date (``"dd-mm-yyyy"``) or None for today.
        final_date: End date (``"dd-mm-yyyy"``) or None for today.
        poll_interval: Seconds to sleep before each status check.
        max_wait: Upper bound, in seconds, on how long to keep polling a job
            that is still in progress.

    Returns:
        None. The function returns early when the job does not succeed or no
        file could be downloaded.
    """
    job_id = submit_job(field, label, initial_date, final_date)

    # Poll the job instead of assuming a single fixed sleep is long enough;
    # the status returned by verify_status_job decides whether to proceed.
    pending_statuses = {
        "esriJobNew",
        "esriJobSubmitted",
        "esriJobWaiting",
        "esriJobExecuting",
    }
    deadline = tm.monotonic() + max_wait
    while True:
        tm.sleep(poll_interval)
        job_status = verify_status_job(job_id)
        if job_status not in pending_statuses or tm.monotonic() >= deadline:
            break

    if job_status != "esriJobSucceeded":
        print(f"Job {job_id} did not succeed (last status: {job_status}); skipping download")
        return

    s3_bucket_name = "climaticchange-datalake"
    container_name = "filesystemclimaticchange"
    zip_file_name = "datos.zip"
    output_path = "datos"

    # Pass the zip name explicitly so download and extraction cannot drift apart.
    if not download_file_from_api(job_id, zip_file_name):
        return

    # The object name is stamped with the current date, not the requested range.
    day, month, year = get_current_day_data()
    object_name = (
        f"datasets_ideam/raw/hist_data/{year}_{month}_{day}_{field}_{label}_datos.csv"
    )
    extracted_csv = f"{output_path}/excel.csv.csv"
    extract_file(zip_file_name, output_path)
    upload_file_to_s3(extracted_csv, s3_bucket_name, object_name)
    upload_file_to_azblob(extracted_csv, container_name, object_name)

In [12]:
# Request data from yesterday up to today (a final date of None falls back to
# the current day) for every (parameter, label) pair of interest.
initial_date = get_previous_day_formatted()
fields_and_labels = [
    (field, label, initial_date, None)
    for field, label in (
        ("TEMPERATURA", "TMX_CON"),
        ("TEMPERATURA", "TMN_CON"),
        ("HUM+RELATIVA", "HR_CAL_MN_D"),
        ("HUM+RELATIVA", "HR_CAL_MX_D"),
        ("PRECIPITACION", "PT_10_TT_D"),
    )
]
for fields in fields_and_labels:
    extract_and_upload_file(*fields)

Retrieving information for field TEMPERATURA from 2-9-2023 to 3-9-2023...


http://dhime.ideam.gov.co/server/rest/services/AtencionCiudadano/DescargarArchivo/GPServer/DescargarArchivo/submitJob?f=json&Filtro=sort%3D%26filter%3D((IdParametro~eq~%27TEMPERATURA%27~and~Etiqueta~eq~%27TMX_CON%27~and~IdEstacion~eq~%2726205080%27)~or~(IdParametro~eq~%27TEMPERATURA%27~and~Etiqueta~eq~%27TMX_CON%27~and~IdEstacion~eq~%2727015330%27)~or~(IdParametro~eq~%27TEMPERATURA%27~and~Etiqueta~eq~%27TMX_CON%27~and~IdEstacion~eq~%2727010810%27))%26group%3D%26fechaInicio%3D2023-9-2T05%253A00%253A00.000Z%26fechaFin%3D2023-9-3T05%253A00%253A00.000Z%26mostrarGrado%3Dtrue%26mostrarCalificador%3Dtrue%26mostrarNivelAprobacion%3Dtrue&Items=%5B%7B%22IdParametro%22%3A%22TEMPERATURA%22%2C%22Etiqueta%22%3A%22TMX_CON%22%2C%22EsEjeY1%22%3Afalse%2C%22EsEjeY2%22%3Afalse%2C%22EsTipoLinea%22%3Afalse%2C%22EsTipoBarra%22%3Afalse%2C%22TipoSerie%22%3A%22Estandard%22%2C%22Calculo%22%3A%22%22%7D%2C%7B%22IdParametro%22%3A%22TEMPERATURA%22%2C%22Etiqueta%22%3A%22TMX_CON%22%2C%22EsEjeY1%22%3Afalse%2C%22EsEjeY2