## Install Dependencies

In [1]:
!pip install pdfminer

Collecting pdfminer
[?25l  Downloading https://files.pythonhosted.org/packages/71/a3/155c5cde5f9c0b1069043b2946a93f54a41fd72cc19c6c100f6f2f5bdc15/pdfminer-20191125.tar.gz (4.2MB)
[K     |████████████████████████████████| 4.2MB 2.5MB/s 
[?25hCollecting pycryptodome
[?25l  Downloading https://files.pythonhosted.org/packages/af/16/da16a22d47bac9bf9db39f3b9af74e8eeed8855c0df96be20b580ef92fff/pycryptodome-3.9.7-cp36-cp36m-manylinux1_x86_64.whl (13.7MB)
[K     |████████████████████████████████| 13.7MB 46.5MB/s 
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-cp36-none-any.whl size=6140077 sha256=bda9db467b974352dd796d7d4bc18083f1e2def355865a9ef723a8e2b36b4077
  Stored in directory: /root/.cache/pip/wheels/e1/00/af/720a55d74ba3615bb4709a3ded6dd71dc5370a586a0ff6f326
Successfully built pdfminer
Installing collected packages: pycryptodome, pdfminer
Successfully insta

In [2]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=a82db949a262a065d3a25935ffb5a88aec22e6e6b1f9cf1a0782caa57070374a
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


## PDF Mining

In [0]:
# reference: https://dzone.com/articles/exporting-data-from-pdfs-with-python

import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import re


def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF, one string per page.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Yields:
        str: The text that the converter produced for each page, in the
        order the pages are returned by ``PDFPage.get_pages``.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Release the per-page handles *before* yielding: a generator
                # is suspended at `yield`, so cleanup placed after it would
                # not run until the consumer asks for the next page (or never,
                # if the consumer stops early or processing raised).
                converter.close()
                fake_file_handle.close()
            yield text
def extract_text(pdf_path):
    """Parse a Community Mobility Report PDF into region -> category values.

    The first two pages are treated as the country summary; later pages are
    scanned for sub-region blocks only when the country is "Brazil".

    Args:
        pdf_path: Path of the PDF, passed straight to extract_text_by_page.

    Returns:
        tuple: ``(country, date, data)``. ``country`` and ``date`` are the
        strings captured from the report header (``None`` if the header never
        matched). ``data`` maps a region name to a list of six captured
        strings ordered as: retail & recreation, grocery & pharmacy, parks,
        transit stations, workplaces, residential.
    """
    header_rx = r'COVID-19 Community Mobility Report(.*) (\S+ \d+, \d+)'
    upper_rx = r'Retail & recreation(\S+)(%.+Grocery & pharmacy)(\S+)(%.+Parks)(\S+)%'
    lower_rx = r'Transit stations(\S+)(%.+Workplaces)(\S+)(%.+Residential)(\S+)%'
    # First and second sub-region block on a page; the second is anchored on
    # the trailing "baseline" of the block before it.
    region_rxs = (
        r'(.{5,30})Retail & recreation(\S+)(%.{15,25}Grocery & pharmacy)(\S+)(%.{15,25}Parks)(\S+)(%.{15,25}Transit stations)(\S+)(%.{15,25}Workplace)(\S+)(%.{15,25}Residential)(\S+)%',
        r'baseline(.{5,30})Retail & recreation(\S+)(%.{15,25}Grocery & pharmacy)(\S+)(%.{15,25}Parks)(\S+)(%.{15,25}Transit stations)(\S+)(%.{15,25}Workplace)(\S+)(%.{15,25}Residential)(\S+)%',
    )

    country = None
    date = None
    data = {}
    # Country-level values collected so far (country pages only)
    pending = []

    for page_no, text in enumerate(extract_text_by_page(pdf_path)):
        # Pages 1 and 2 describe the country as a whole
        if page_no < 2:
            header = re.search(header_rx, text)
            if header:
                country, date = header.group(1), header.group(2)

            upper = re.search(upper_rx, text)
            if upper:
                pending.extend(upper.group(1, 3, 5))

            lower = re.search(lower_rx, text)
            if lower:
                pending.extend(lower.group(1, 3, 5))
                # The transit/workplaces/residential block completes the
                # country record: store a snapshot and reset the buffer.
                data[country] = list(pending)
                pending.clear()
            continue

        # Sub-regions of other countries can contain the string
        # "Not enough data for this date", which breaks the row patterns,
        # so only the states of Brazil are collected.
        if country != "Brazil":
            continue

        for region_rx in region_rxs:
            hit = re.search(region_rx, text)
            if hit:
                data[hit.group(1)] = list(hit.group(2, 4, 6, 8, 10, 12))

    return country, date, data

## Pre-Processing

In [0]:
# Accumulator for the report PDF links found by the scraping cell
all_links = list()

In [0]:
# import package
import requests
from bs4 import BeautifulSoup

# specify the url
url = 'https://www.google.com/covid19/mobility/'

# send the request and fail loudly on an HTTP error,
# instead of silently parsing an error page that contains no report links
response = requests.get(url)
response.raise_for_status()

# extract the content
content = response.content

parser = BeautifulSoup(content, 'html.parser')

# Get a list of all links.
a_tags = parser.find_all("a")

# Start from an empty list so that re-running this cell does not append
# every link a second time (which would download each report twice).
all_links.clear()

for link in a_tags:
  link_str = link.get('href')
  # Anchors without an href attribute give None, which re.search rejects
  if link_str is None:
    continue
  # Google lists American counties/states in separate files; requiring a
  # two-character segment right before "_Mobility" is meant to drop them.
  ispdf = re.search(r'(.*)_(.{2})_Mobility(.*)pdf',link_str)
  if ispdf:
    all_links.append(ispdf.group(0))

In [0]:
# Display the report links gathered by the scraping cell
all_links

In [65]:
import pandas as pd
import numpy as np
import time
import datetime

# Column names, in the same order as the six values extract_text stores per region
columns = ["Retail & recreation","Grocery & pharmacy","Parks","Transit stations","Workplace","Residential"]

# One dataframe per report; they are concatenated once after the loop
# (growing a dataframe with pd.concat on every iteration copies it each time)
frames = []

time1 = time.time()
print('Collect begins at {}'.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
for i,link in enumerate(all_links):
  # Download the report and save it locally as "<i>.pdf"
  pdfname = str(i)+".pdf"
  pdf_response = requests.get(link)
  # Stop on a failed download rather than writing an error page to disk
  pdf_response.raise_for_status()
  # The context manager closes (and flushes) the file before it is parsed
  with open(pdfname, 'wb') as pdf_file:
    pdf_file.write(pdf_response.content)

  # Extract information.
  # NOTE: country/date are overwritten on every iteration, so after the loop
  # `date` is the one parsed from the last report processed.
  country, date, data = extract_text(pdfname)

  # create dataframe: one row per region, indexed by the region name
  df = pd.DataFrame.from_dict(data, orient='index', columns=columns)

  # convert all data to int32 (the extracted values are strings such as "-38")
  df = df.astype(np.int32)

  # create column to identify the country
  df["Country"] = country

  frames.append(df)

# Final Dataframe (empty when no links were collected, since pd.concat rejects an empty list)
df_final = pd.concat(frames) if frames else pd.DataFrame()

print('Collect ends at {}'.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
print('Duration: {}s'.format(time.time() - time1))

Collect begins at 2020-04-04 17:42:05
Collect ends at 2020-04-04 17:44:10
Duration: 125.05038547515869s


In [71]:
# Preview the first five rows of the combined table (the index is the region name)
df_final.head(5)

Unnamed: 0,Retail & recreation,Grocery & pharmacy,Parks,Transit stations,Workplace,Residential,Country
Afghanistan,-38,-21,-13,-34,-33,10,Afghanistan
Angola,-61,-40,-39,-57,-11,22,Angola
Argentina,-86,-61,-89,-80,-57,27,Argentina
Aruba,-88,-66,-80,-88,-72,20,Aruba
Australia,-45,-19,-35,-58,-33,13,Australia


In [72]:
# `date` is the value returned by the last extract_text call in the collection loop
print(date)

March 29, 2020


## Generate the final dataset

In [0]:
# Write the combined table to a CSV file named after the report date
df_final.to_csv(path_or_buf=date + ".csv")