# Downloading DGS Covid-19 Daily Report

## Importing Libraries

In [1]:
# Importing Libraries
import requests
from scrapy import Selector
import pandas as pd
from os import path, makedirs

## Creating reports id and date dataframe

In [2]:
# Build a lookup table pairing each daily report id with its date

# One calendar day per report, starting on 3 March 2020 (paired with report "001")
dates = pd.date_range(start="03-03-2020", end="today")

# Report ids are sequential, 1-based and zero-padded to three digits
# ("001", "002", ..., "100", ...); the format spec replaces the manual padding
report = [f"{num:03d}" for num in range(1, len(dates) + 1)]

# Creating a dictionary of our two lists (column name -> values)
report_dates = {"Report": report, "Dates": dates}

# Creating a dataframe using the previously built dictionary
df_dates = pd.DataFrame(report_dates)

# Changing the Dates column values from timestamps to compact
# day-month-year strings without separators (e.g. "03032020")
df_dates["Dates"] = df_dates["Dates"].dt.strftime("%d%m%Y")

df_dates.head()

Unnamed: 0,Report,Dates
0,1,3032020
1,2,4032020
2,3,5032020
3,4,6032020
4,5,7032020


## Scraping DGS Daily Reports webpage

In [3]:
# Scraping DGS Daily Reports webpage
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from parsing an HTTP error page as if it were the report listing
response = requests.get('https://covid19.min-saude.pt/relatorio-de-situacao/', timeout=30)
response.raise_for_status()
dgs_page = response.content
sel = Selector(text=dgs_page)
links = sel.xpath("//div[@class='single_content']//a/@href")

# Reversing our selector list to match our Report and Dates Dataframe
links = links[::-1]

# Creating a list of URL addresses by using the extract method from scrapy selector
# We skip the first 5 links, because they are for Informational Reports
urls = [link.extract() for link in links[5:]]

# Removing an old link for the Report Nº 58 from our URL list.
# The membership check keeps the cell runnable when the page no longer lists
# that link (list.remove raises ValueError if the value is absent)
old_report_58_url = "https://covid19.min-saude.pt/wp-content/uploads/2020/04/58_DGS_boletim_20200429.pdf"
if old_report_58_url in urls:
    urls.remove(old_report_58_url)

## Function for downloading DGS PDFs

In [4]:
# Function for downloading DGS Report PDF
def dgs_pdf_downloader(index, report, date, urls):
    """Download every daily DGS Covid-19 PDF report that is not saved yet.

    Files are stored in the ``Dataset_Resources`` folder (created when
    missing) as ``dgs_covid19_report_<report>_<date>.pdf``. Reports whose
    file already exists are skipped, and the loop stops at the first missing
    report for which ``urls`` has no entry.

    Args:
        index: Iterable of integer positions; each ``i`` is used to look up
            ``report[i]``, ``date[i]`` and ``urls[i]``.
        report: Report ids, indexable by the values of ``index``.
        date: Report dates, indexable by the values of ``index``.
        urls: Sequence of PDF URLs, aligned with ``report`` and ``date``.

    Returns:
        None.

    Raises:
        requests.HTTPError: If a download answers with an HTTP error status.
            Nothing is written for that report, so a later run retries it
            instead of finding an error page saved under a ``.pdf`` name.
    """
    # our Dataset directory
    my_dir = "Dataset_Resources"

    # Make the directory if it does not exist yet
    makedirs(my_dir, exist_ok=True)

    # Check one by one whether we already have that daily report pdf;
    # if not, download it
    for i in index:
        pdf_path = path.join(my_dir, f"dgs_covid19_report_{report[i]}_{date[i]}.pdf")

        if path.exists(pdf_path):
            continue

        # No URL is available for this report (or any later position)
        if i >= len(urls):
            break

        response = requests.get(urls[i], timeout=30)
        # Validate before opening the file so a failed request leaves no file
        response.raise_for_status()

        # The context manager closes the file handle even if the write fails
        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(response.content)

    return None

## Downloading the DGS PDFs

In [5]:
# Run the downloader over every row of df_dates: each row's report id and date
# are matched with the URL at the same position in urls
dgs_pdf_downloader(
    index=df_dates.index,
    report=df_dates["Report"],
    date=df_dates["Dates"],
    urls=urls,
)