# Web Scrapping

In [79]:
import sys
import os
from functools import reduce

project_path = os.path.abspath('..')
sys.path.insert(1, project_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from requests.auth import HTTPBasicAuth
import json
import datetime
import re

from dotenv import load_dotenv, find_dotenv

In [5]:
dotenv_path = find_dotenv()

load_dotenv(dotenv_path)

True

In [7]:
def unix_to_datetime(unix_date):
    return pd.to_datetime(unix_date, unit='s')

def datetime_to_unix(datetime):
    return int(datetime.timestamp())

## Extracción de datos desde API

In [34]:
auth_url = os.environ.get('AUTH_URL')
db_url = os.environ.get('DATABASE_URL')
username = os.environ.get("USERNAME")
password = os.environ.get("PASSWORD")

class DatabaseMMA:
    def __init__(self):
        self.s = requests.Session()
        self.s.post(auth_url, data=f'username={username}&password={password}')

        self.parameters = {'SO2':'0001', 'NO':'0002', 'NO2':'0003', 'CO':'0004', 'O3':'0008',
                            'Cu':'00Cu', 'Pb':'00Pb', 'CH4':'0CH4', 'NOX':'0NOX', 'As':'0HCM',
                            'CH6':'0CH6', 'CH7':'0CH7', 'MP10':'PM10', 'MP25':'PM25', 'tempdiff':'DTMP',
                            'rad':'GLOB', 'pres':'PRES', 'precip':'RAIN', 'relhum':'RHUM', 'temp':'TEMP',
                            'dirviento':'WDIR', 'velviento':'WSPD'}

        self.stations = {'Alto Hospicio':'117', 'Antofagasta':'237', 'Copiapo sivica':'332', 'Huasco Sivica':'333', 'Andacollo':'420',
                        'Cuncumen SIVICA':'424', 'La Serena':'425', 'Coquimbo':'426', 'La Greda':'503', 'Los Maitenes':'504', 'Puchuncaví':'505',
                        'Sur':'506', 'Valle Alegre':'507', 'Met Principal':'508', 'Concon':'509', 'Colmo':'511', 'Las Gaviotas':'512',
                        'Vina del mar':'529', 'Los Andes':'532', 'Junta de Vecinos':'535', 'Centro Quintero':'539', 'Quintero':'540',
                        'Central Quintero':'546', 'Loncura':'547', 'Ventanas':'548', 'Valparaiso':'550', 'Concon MMA':'560', 'Rancagua':'609',
                        'Rengo': '611', 'San Fernando':'612', 'Rancagua II':'615', 'La Florida_Talca':'703', 'Curico':'709', 'U.C. Maule':'710',
                        'Universidad de Talca':'711', 'Linares':'713', 'Consultorio - San Vicente':'802', 'Inia-Chillan':'810', 'Kingston College':'827',
                        'Liceo Polivalente':'830', 'Cerro Merquin':'831', 'Balneario Curanilahue':'832', 'Meteorológico, Hualqui':'834', 'Hualqui':'841',
                        'Puntera':'854', 'Puren':'873', 'Los Angeles Oriente':'874', '21 de Mayo':'875', 'Las Encinas':'901', 'Padre Las Casas II':'902',
                        'Ferroviario':'904', 'Nielol':'905', 'Osorno':'A01', 'MIRASOL_sivica':'A07', 'Alerce':'A08', 'Coyhaique':'B03', 'Coyhaique 2':'B04',
                        'Vialidad':'B05', 'Punta Arenas':'C05', 'Independencia':'D11', 'La Florida':'D12', 'Las Condes':'D13', "Parque O'Higgins":'D14',
                        'Pudahuel':'D15', 'Cerrillos':'D16', 'El Bosque':'D17', 'Cerro Navia':'D18', 'Puente Alto':'D27', 'Talagante':'D28', 'Quilicura':'D29',
                        'Quilicura II':'D30', 'Cerrillos Movil':'D31', 'Cerrillos Movil2':'D35', 'Valdivia':'E03', 'La Union':'E04', 'Valdivia 2':'E08', 'Arica':'F01'}

    def get_resolution(self):
        response = self.s.get(db_url + '/domain/SMA/resolution')
        df = pd.DataFrame(response.json()['data']).drop('links',axis=1)
        display(df)

    def get_available_timeseries(self, resolution):
        response = self.s.get(db_url + f'/domain/SMA/resolution/{resolution}/timeserie')
        df = pd.DataFrame(response.json()['data']).drop('links',axis=1)
        return df

    def get_stations(self):
        response = self.s.get(db_url + f'/domain/SMA/station')
        df = pd.DataFrame(response.json()['data']).drop('links',axis=1)
        return df.sort_values(by='name')

    def station_from_id(self, id):
        return list(self.stations.keys())[list(self.stations.values()).index(id)]

    def parameter_from_id(self, id):
        return list(self.stations.keys())[list(self.stations.values()).index(id)]    

    def get_station_details(self, station):
        response = self.s.get(db_url + f'/domain/SMA/station/{self.stations[station]}')
        df = pd.DataFrame(response.json()['data'])
        return df

    def get_timeserie(self, station, par, from_date, to_date, res='+', date_format='default'):
        from_date = datetime_to_unix(from_date)
        to_date = datetime_to_unix(to_date)
        timeserie = f'{self.stations[station]}{res}M{self.parameters[par]}VAL/{from_date}/{to_date}'
        response = self.s.get(db_url + f'/domain/SMA/timeserie/{timeserie}')
        df = pd.DataFrame(response.json()['data']['timeserie']).drop('statusCode', axis=1)
        last_date = df.timestamp.values[-1]
        while (last_date != to_date) & (last_date < to_date):
            timeserie = f'{self.stations[station]}{res}M{self.parameters[par]}VAL/{last_date}/{to_date}'
            response = self.s.get(db_url + f'/domain/SMA/timeserie/{timeserie}')
            temp_df = pd.DataFrame(response.json()['data']['timeserie']).drop('statusCode', axis=1)
            df = pd.concat([df, temp_df])
            last_date = df.timestamp.values[-1] 

        if date_format == 'default':
            dates = df.timestamp.apply(unix_to_datetime)
            df['timestamp'] = dates
        df.rename({'value':par}, axis=1, inplace=True)
        return df

In [35]:
db = DatabaseMMA()

### Listado de estaciones

In [38]:
db.get_stations()

Unnamed: 0,name,id
47,21 de Mayo,875
54,Alerce,A08
0,Alto Hospicio,117
4,Andacollo,420
1,Antofagasta,237
...,...,...
12,Valle Alegre,507
25,Valparaiso,550
24,Ventanas,548
57,Vialidad,B05


### Series de tiempo disponibles

In [95]:
db.get_available_timeseries('+')

Unnamed: 0,id,begin,end
0,503+M0001VAL,1356498000,1479308400
1,503+M0002VAL,1356498000,1479308400
2,503+M0003VAL,1356498000,1479308400
3,503+M0008VAL,1356498000,1479308400
4,503+M0NOXVAL,1356498000,1479308400
...,...,...,...
74,560+MPM25VAL,1522684800,1664974800
75,560+MRHUMVAL,1556254800,1664974800
76,560+MTEMPVAL,1556254800,1664974800
77,560+MWDIRVAL,1556254800,1664974800


In [78]:
timeseries = db.get_available_timeseries('+')

In [83]:
timeseries['parameter'] = timeseries.id.apply(lambda x: re.findall('(?<=M).+(?=VAL)', x)[0])
timeseries['station'] = timeseries.id.apply(lambda x: re.findall('\d+(?=\+M)', x)[0])

In [97]:
db.stations['Quintero']

'540'

In [98]:
timeseries.loc[timeseries.station == 540]

Unnamed: 0,id,begin,end,station,parameter


In [96]:
timeseries

Unnamed: 0,id,begin,end,station,parameter
0,503+M0001VAL,1356498000,1479308400,503,0001
1,503+M0002VAL,1356498000,1479308400,503,0002
2,503+M0003VAL,1356498000,1479308400,503,0003
3,503+M0008VAL,1356498000,1479308400,503,0008
4,503+M0NOXVAL,1356498000,1479308400,503,0NOX
...,...,...,...,...,...
74,560+MPM25VAL,1522684800,1664974800,560,PM25
75,560+MRHUMVAL,1556254800,1664974800,560,RHUM
76,560+MTEMPVAL,1556254800,1664974800,560,TEMP
77,560+MWDIRVAL,1556254800,1664974800,560,WDIR


In [67]:
re.findall('(?<=M).+(?=VAL)', timeseries.id[0])

['0001']

### Query

In [14]:
db_query = db.get_timeserie('Quintero', 'MP25', pd.to_datetime('2020-01-01'), pd.to_datetime('2022-01-01'))

In [99]:
db_query

Unnamed: 0,timestamp,MP25
0,2020-01-01 01:00:00,
1,2020-01-01 02:00:00,
2,2020-01-01 03:00:00,
3,2020-01-01 04:00:00,
4,2020-01-01 05:00:00,
...,...,...
739,2021-12-31 20:00:00,
740,2021-12-31 21:00:00,
741,2021-12-31 22:00:00,
742,2021-12-31 23:00:00,


## Pruebas

In [20]:
url = os.environ.get('AUTH_URL')
username = os.environ.get("USERNAME")
password = os.environ.get("PASSWORD")
s = requests.Session()
s.post(url, data=f'username={username}&password={password}')

<Response [200]>

In [24]:
# response = s.get('https://sinca.mma.gob.cl/api/domain/SMA/resolution/+/timeserie') # Timeseries por resolucion
response = s.get('https://sinca.mma.gob.cl/api/domain/SMA/station/540') # Detalles de estación

In [28]:
pd.DataFrame(response.json()['data'])

Unnamed: 0,id,name,xcoord,ycoord,geometry,timeserie
type,540,Quintero,263273,6369387,Point,
coordinates,540,Quintero,263273,6369387,"[0, 0]",
links,540,Quintero,263273,6369387,,{'self': 'https://sinca.mma.gob.cl/api/domain/...


In [20]:
maitenes_df = get_minma_data(params, 'maitenes', from_last=from_last)
maitenes_df.head()

  maitenes_df = get_minma_data(params, 'maitenes', from_last=from_last)


Unnamed: 0,Registros validados_SO2,Registros preliminares_SO2,Registros no validados_SO2,Registros validados_NO2,Registros preliminares_NO2,Registros no validados_NO2,Registros validados_NO,Registros preliminares_NO,Registros no validados_NO,Registros validados_NOX,Registros preliminares_NOX,Registros no validados_NOX,Registros validados_O3,Registros preliminares_O3,Registros no validados_O3,Registros validados_CO,Registros preliminares_CO,Registros no validados_CO
2017-09-06 18:00:00,,,3.22,,,2.16,,,2.01888,,,4.18258,,,21.0,,,0.22
2017-09-06 19:00:00,,,3.58,,,3.25,,,2.20725,,,5.46078,,,18.0,,,0.25
2017-09-06 20:00:00,,,7.39,,,13.01,,,2.41338,,,15.4239,,,9.0,,,0.29
2017-09-06 21:00:00,,,6.4,,,12.33,,,2.38637,,,14.72,,,10.0,,,0.28
2017-09-06 22:00:00,,,4.77,,,7.22,,,2.31295,,,9.535,,,13.0,,,0.27
