# JalluPredix - Scraper Module
### Overview
JalluPredix is an automatic system for real-time rainfall monitoring and prediction in Bolivia. It comprises two main parts: 

*   An IoT rain gauge built with Arduino and 3D printing
*   A webapp developed with MeanJS.

Since the device was initially meant to work in remote Bolivian towns and villages, it has a GSM/GPRS component that sends the data every 24 hours. The collected data is processed with Python, Pandas, and Scikit-learn to make predictions using the ARIMA model for every weather station. 

Data from public rain gauges plays an important role, as the project aims to provide accurate predictions by correlating data from different sources. 

### This file 
This notebook contains the code to scrape public data from the National Service of Meteorology and Hydrology in Bolivia [SENAMHI](https://www.senamhi.gob.bo/boletinmensual.php). Why scraping? Mainly because the Bolivian government has still not released its environmental data in a friendly format. Currently, there are two ways to request this data: sending an email to SENAMHI, or using an old Flash app on their website that frequently breaks with new browsers. 

In [None]:
# Installs dependencies
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Hit:6 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Ign:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:11 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
chro

In [None]:
# Reading and scraping daily rainfall data per station
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
import pandas as pd
# Configure Chromium to run without a display; the sandbox and /dev/shm flags
# are the usual settings for running Chrome inside a container such as Colab.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Starts a chromium browser.
# `options=` replaces the deprecated `chrome_options=` keyword. The driver
# binary is resolved by its default name ("chromedriver") from PATH, where the
# install cell copied it (/usr/bin).
browser = webdriver.Chrome(options=chrome_options)

# Load the SENAMHI monthly bulletin page, which holds the dropdowns and
# "Submit" buttons queried below.
browser.get("https://www.senamhi.gob.bo/boletinmensual.php")

def find_options_by_name(dropdown, index):
  """Return the text of every <option> inside a named dropdown.

  Args:
    dropdown: value of the ``name`` attribute to look up on the page
      currently loaded in the module-level ``browser``.
    index: position of the wanted element among all elements that share
      that name (this notebook uses 0 and 1).

  Returns:
    A list with the text of each <option>, in the order Selenium
    returns them.

  Raises:
    IndexError: if fewer than ``index + 1`` elements carry that name.
  """
  # Locator constants; imported locally so the function does not depend on
  # an extra import elsewhere in the notebook.
  from selenium.webdriver.common.by import By

  # find_elements(By..., value) is available in both Selenium 3 and 4,
  # whereas the find_elements_by_* helpers no longer exist in Selenium 4.
  elem = browser.find_elements(By.NAME, dropdown)[index]
  return [option.text for option in elem.find_elements(By.TAG_NAME, 'option')]

def set_selection(dropdown, index, option):
    """Click the <option> of a named dropdown whose text equals ``option``.

    Args:
        dropdown: value of the ``name`` attribute to look up on the page
            currently loaded in the module-level ``browser``.
        index: position of the wanted element among all elements that
            share that name.
        option: exact option text to select.

    Returns:
        None. If no option text matches, nothing is clicked and no error
        is raised, so the dropdown keeps whatever was selected before.

    Raises:
        IndexError: if fewer than ``index + 1`` elements carry that name.
    """
    # Locator constants; imported locally so the function does not depend on
    # an extra import elsewhere in the notebook.
    from selenium.webdriver.common.by import By

    # find_elements(By..., value) is available in both Selenium 3 and 4,
    # whereas the find_elements_by_* helpers no longer exist in Selenium 4.
    elem = browser.find_elements(By.NAME, dropdown)[index]
    for choice in elem.find_elements(By.TAG_NAME, 'option'):
        if choice.text == option:
            choice.click()

# Scrape daily rainfall for every (station, year, month) combination offered
# by the first form on the page and store the rows in raining.csv.
import traceback

from selenium.webdriver.common.by import By

years = find_options_by_name('anio2', 0)
months = find_options_by_name('mes2', 0)
stations = find_options_by_name('estacion', 0)

records = []
failed = False

try:
  for station in stations:
    # This entry is skipped for every year/month; presumably it is the
    # dropdown's placeholder label rather than a real station.
    if station == "Estación meteorológica":
      continue

    for year in years:
      for month in months:
        set_selection('anio2', 0, year)
        set_selection('mes2', 0, month)
        set_selection('estacion', 0, station)

        browser.find_elements(By.NAME, "Submit")[0].click()

        table = browser.find_element(By.ID, "myTable")
        # The first two rows are never read (presumably headers).
        for row in table.find_elements(By.TAG_NAME, 'tr')[2:]:
          # "Datos no disponibles!!" = "Data not available!!"
          if row.text == "Datos no disponibles!!":
            continue

          columns = row.find_elements(By.TAG_NAME, 'td')
          # "Total acumulado:" = "Accumulated total:"; it is a summary
          # row, not a daily reading.
          if len(columns) >= 2 and columns[0].text != "Total acumulado:":
            records.append([station, columns[0].text, columns[1].text])

        # Return to the form page before issuing the next query.
        browser.back()
except Exception:
  failed = True
  print("An exception occurred")
  # Show what actually went wrong instead of hiding it.
  traceback.print_exc()
finally:
  # Write whatever was collected, even after a failure or a manual
  # interrupt, so a long scrape is not lost.
  df = pd.DataFrame(records, columns=["station","date","precipitation"])
  df.to_csv('raining.csv', index=False)

if failed:
  print(f"Saved {len(records)} partial record(s) to raining.csv")
else:
  print("Task finished")

  # Remove the CWD from sys.path while we load stuff.


An exception occurred


In [None]:
# Send the rainfall CSV written by the scraping cell to the user's machine
# through the Colab file helper.
from google.colab import files

rainfall_csv = 'raining.csv'
files.download(rainfall_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reading and scraping monthly temperature data per station
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
import pandas as pd
# Configure Chromium to run without a display; the sandbox and /dev/shm flags
# are the usual settings for running Chrome inside a container such as Colab.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Starts a chromium browser.
# `options=` replaces the deprecated `chrome_options=` keyword. The driver
# binary is resolved by its default name ("chromedriver") from PATH, where the
# install cell copied it (/usr/bin).
browser = webdriver.Chrome(options=chrome_options)

# Load the SENAMHI monthly bulletin page, which holds the dropdowns and
# "Submit" buttons queried below.
browser.get("https://www.senamhi.gob.bo/boletinmensual.php")

# Scrape temperature data for every (station, year, month) combination offered
# by the second form on the page and store the rows in temperature.csv.
import traceback

from selenium.webdriver.common.by import By

years = find_options_by_name('anio2', 1)
months = find_options_by_name('mes2', 1)
stations = find_options_by_name('estacion', 1)

records = []
failed = False

try:
  for station in stations:
    # This entry is skipped for every year/month; presumably it is the
    # dropdown's placeholder label rather than a real station.
    if station == "Estación meteorológica":
      continue

    for year in years:
      for month in months:
        set_selection('anio2', 1, year)
        set_selection('mes2', 1, month)
        set_selection('estacion', 1, station)

        browser.find_elements(By.NAME, "Submit")[1].click()

        table = browser.find_element(By.ID, "myTable")
        # The first two rows are never read (presumably headers).
        for row in table.find_elements(By.TAG_NAME, 'tr')[2:]:
          # "Datos no disponibles!!" = "Data not available!!"
          if row.text == "Datos no disponibles!!":
            continue

          columns = row.find_elements(By.TAG_NAME, 'td')
          # NOTE(review): the original author flagged the
          # "Total acumulado:" label as needing review for this table.
          if len(columns) >= 3 and columns[0].text != "Total acumulado:":
            date = columns[0].text
            # Named so the builtins min() and max() are not overwritten
            # in the notebook's global namespace.
            min_temp = columns[1].text
            max_temp = columns[2].text
            records.append([station, date, min_temp, max_temp])

        # Return to the form page before issuing the next query.
        browser.back()
except Exception:
  failed = True
  print("An exception occurred")
  # Show what actually went wrong instead of hiding it.
  traceback.print_exc()
finally:
  # Write whatever was collected, even after a failure or a manual
  # interrupt, so a long scrape is not lost.
  df = pd.DataFrame(records, columns=["station","date","min","max"])
  df.to_csv('temperature.csv', index=False)

if failed:
  print(f"Saved {len(records)} partial record(s) to temperature.csv")
else:
  print("Task finished")

In [None]:
# Send the temperature CSV written by the scraping cell to the user's machine
# through the Colab file helper.
from google.colab import files

temperature_csv = 'temperature.csv'
files.download(temperature_csv)