In [1]:
# Listing 7-1: creating a new database in Athena

import boto3
import numpy as np
import pandas as pd

def run_query(query, database, s3_output):
    """Submit a SQL statement to Athena and return its execution id.

    Args:
        query: SQL text to execute.
        database: Name passed as the query execution context database.
        s3_output: S3 URI passed as the result output location.

    Returns:
        The ``QueryExecutionId`` string from Athena's response. The id is
        also printed. Only the submission happens here; nothing in this
        function waits for the query to finish.
    """
    athena = boto3.client('athena', region_name='us-east-1')
    request = {
        'QueryString': query,
        'QueryExecutionContext': {'Database': database},
        'ResultConfiguration': {'OutputLocation': s3_output},
    }
    started = athena.start_query_execution(**request)
    print('Execution ID: ' + started['QueryExecutionId'])
    return started['QueryExecutionId']

In [7]:
# Listing 7-1: creating a new database in Athena (cont.)

# Submit the CREATE DATABASE statement. run_query only starts the execution and
# returns its id; it does not wait for completion.
# NOTE(review): the execution context names the very database being created —
# confirm Athena accepts a not-yet-existing database here.
query = '''Create database domainranks2'''

database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)

Execution ID: 5f030e8e-7fd7-4487-b388-ef61cb6b7502


In [8]:
# Listing 7-2: creating a new table
# DDL for an external table over the files under the S3 LOCATION below.
# Column names keep their leading '#', so later queries must quote them.
# NOTE(review): the serialization.format / field.delim values appear to be
# literal tab characters inside the string — keep them intact when editing.
query = '''CREATE EXTERNAL TABLE IF NOT EXISTS domainranks2.domain_ranks (
`#harmonicc_pos` bigint,
`#harmonicc_val` double,
`#pr_pos` bigint,
`#pr_val` double,
`#host_rev` string,
`#n_hosts` int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
 'serialization.format' = '	',
 'field.delim' = '	',
 'collection.delim' = '#',
 'mapkey.delim' = '#'
) LOCATION 's3://athena-us-east-1-testing/domain_ranks/'
TBLPROPERTIES ('has_encrypted_data'='false')'''


# Submit the DDL; only the execution id comes back.
database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)

Execution ID: 5d41677a-3592-42e6-b250-c995987e5770


In [15]:
# Listing 7-3: querying for ranks for theguardian.com
# Look up the row whose reversed host name is 'com.theguardian'. The column is
# double-quoted because its name starts with '#'.
query = '''SELECT * FROM domainranks2.domain_ranks where domain_ranks."#host_rev" = 'com.theguardian';'''
database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
# execution_id is consumed by the result-fetching cells that follow.
execution_id = run_query(query, database, s3_output)

Execution ID: caa1997d-3625-4598-920b-6f3058ecc742


In [11]:
# Listing 7-4: Fetching query results from Athena

def get_raw_response(execution_id):
    """Return the raw ``get_query_results`` response for an Athena execution.

    Args:
        execution_id: Query execution id, as returned by ``run_query``.

    Returns:
        The response of ``get_query_results``, requested with
        ``MaxResults=123``. No pagination is performed here.
    """
    athena = boto3.client('athena', region_name='us-east-1')
    return athena.get_query_results(QueryExecutionId=execution_id, MaxResults=123)

def results_to_df(results):
    """Convert a raw Athena result payload into a list of row dicts.

    Despite the name, this returns a list of dicts (one per row), which the
    callers then pass to ``pd.DataFrame``.

    Args:
        results: Mapping shaped like
            ``{'ResultSet': {'ResultSetMetadata': {'ColumnInfo': [{'Label': ...}]},
            'Rows': [{'Data': [{...}, ...]}, ...]}}``.

    Returns:
        A list of ``{column_label: value}`` dicts. The first entry of ``Rows``
        is always skipped (presumably the header row — confirm for non-SELECT
        statements). A cell with no value is represented by ``[' ']``, the
        placeholder this function has always produced.
    """
    result_set = results['ResultSet']
    columns = [
        col['Label'] for col in result_set['ResultSetMetadata']['ColumnInfo']]

    listed_results = []
    for row in result_set['Rows'][1:]:
        values = []
        for field in row['Data']:
            try:
                # Each cell is a single-entry mapping; take its only value.
                values.append(list(field.values())[0])
            except (AttributeError, IndexError):
                # Empty mapping ({}) or a non-mapping cell: keep the historical
                # placeholder instead of swallowing every possible exception.
                values.append(list(' '))

        listed_results.append(dict(zip(columns, values)))

    return listed_results

In [21]:
# Listing 7-4: Fetching query results from Athena


# Poll for the results of `execution_id`: up to 15 attempts, sleeping 10 s
# before each one (including the first), so at most about 150 s of waiting.
# NOTE(review): if every attempt fails the loop just ends; df_2 is then either
# undefined or still holds a previous query's data.
import time
for i in range(15):
    time.sleep(10)

    try:
        return_json = get_raw_response(execution_id)
        t = results_to_df(return_json)
        df_2 = pd.DataFrame(t)
        print("query successful")
        break
    except Exception as e:

        # Any failure (e.g. the query is still running) is printed and retried.
        print(e)
        
        pass

query successful


In [22]:
# Listing 7-4: Fetching query results from Athena


df_2.head()

Unnamed: 0,#harmonicc_pos,#harmonicc_val,#host_rev,#n_hosts,#pr_pos,#pr_val
0,92,20435550.0,com.theguardian,127,171,0.00014701598631846095


In [31]:
# Listing 7-5: Creating additional table with just inverted urls

df = pd.read_csv("inverted_urls_list.csv")
df.head()

Unnamed: 0,urls,inverted_urls
0,facebook.com,com.facebook
1,google.com,com.google
2,youtube.com,com.youtube
3,twitter.com,com.twitter
4,instagram.com,com.instagram


In [25]:
# Listing 7-6: Creating an additional table of inverted URLs

# DDL for a CSV-backed external table with two string columns. The table
# properties skip one header line per file.
# This is a non-raw Python string, so the four backslashes in escapeChar reach
# Athena as two.
query = '''CREATE EXTERNAL TABLE IF NOT EXISTS urltest (
 `urls` STRING,
 `inverted_urls` STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' 
WITH SERDEPROPERTIES ( 
  'escapeChar'='\\\\', 
  'separatorChar'=','
  ) LOCATION 's3://athena-us-east-1-testing/sample-folder/' 
TBLPROPERTIES ('has_encrypted_data'='false', 'skip.header.line.count' = '1');'''

# The table name is unqualified, so it is created in the context database.
database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)
# Reference for the DDL syntax error this cell originally hit:
# https://stackoverflow.com/questions/40912086/amazon-athena-no-viable-alternative-at-input

Execution ID: 354f96ce-8f10-4dcf-b8d7-fc5e8d0a94ff


In [26]:
# Listing 7-7: Testing new Athena table

# Read back every row of the newly created table to check that it loads.
query = '''select * from domainranks2.urltest'''
database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)

Execution ID: 6225c27f-a8e6-4468-83c5-5dbe5a56139a


In [27]:
# Listing 7-7: Testing new Athena table


# Fetch the results once, with no waiting or retrying.
# NOTE(review): this raises if the query behind execution_id has not finished yet.
return_json = get_raw_response(execution_id)
t = results_to_df(return_json)
# Overwrites the df_2 produced by the earlier theguardian query.
df_2 = pd.DataFrame(t)
df_2.head()

Unnamed: 0,inverted_urls,urls
0,com.facebook,facebook.com
1,com.google,google.com
2,com.youtube,youtube.com
3,com.twitter,twitter.com
4,com.instagram,instagram.com


In [28]:
# Listing 7-8: fetching harmonic centrality and pagerank rankings

# Join the URL list with the rank table: rows match when the reversed host in
# domain_ranks equals urltest.inverted_urls (implicit join via WHERE).
query = '''select urltest."inverted_urls",urltest."urls", domain_ranks."#harmonicc_pos", domain_ranks."#harmonicc_val", domain_ranks."#n_hosts", domain_ranks."#pr_pos", domain_ranks."#pr_val"
FROM urltest, domain_ranks
WHERE domain_ranks."#host_rev" = urltest."inverted_urls"'''
# Both table names are unqualified and resolve against the context database.
database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)

Execution ID: 675d2d2e-3597-4df8-a0b1-b2fee64748c1


In [29]:
# Listing 7-8: fetching harmonic centrality and pagerank rankings


# Poll for the join results: up to 15 attempts, sleeping 10 s before each one.
# NOTE(review): if all attempts fail, df_2 silently keeps the previous query's
# rows — check for "query successful" in the output before trusting df_2.
import time
for i in range(15):
    time.sleep(10)

    try:
        return_json = get_raw_response(execution_id)
        t = results_to_df(return_json)
        df_2 = pd.DataFrame(t)
        print("query successful")
        break
    except Exception as e:

        # Printed and retried; "Query has not yet finished" is the expected case.
        print(e)
        
        pass

An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query has not yet finished. Current state: RUNNING
An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query has not yet finished. Current state: RUNNING
An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query has not yet finished. Current state: RUNNING
An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query has not yet finished. Current state: RUNNING
An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query has not yet finished. Current state: RUNNING
An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query has not yet finished. Current state: RUNNING
An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query has not yet finished. Current state: RUNNING
An error occurred (InvalidR

In [30]:
# Listing 7-8: fetching harmonic centrality and pagerank rankings


df_2.head()

Unnamed: 0,#harmonicc_pos,#harmonicc_val,#n_hosts,#pr_pos,#pr_val,inverted_urls,urls
0,24,21533280.0,100,31,0.0011040749131266,com.vimeo,vimeo.com
1,80,20488868.0,5116,373,6.71641116389417e-05,com.msn,msn.com
2,67,20609614.0,464,154,0.0001692129080835582,com.bing,bing.com
3,236490,16137525.0,17,116,0.000257561600845181,com.cpanel,cpanel.com
4,146,20222314.0,123,339,7.442682626049672e-05,com.time,time.com


In [32]:
# Listing 7-9: Setting up CC index Athena table

# DDL for an external Parquet table over the Common Crawl columnar index at the
# S3 LOCATION below, partitioned by `crawl` and `subset`.
query = '''CREATE EXTERNAL TABLE IF NOT EXISTS ccindex (
  `url_surtkey`                   STRING,
  `url`                           STRING,
  `url_host_name`                 STRING,
  `url_host_tld`                  STRING,
  `url_host_2nd_last_part`        STRING,
  `url_host_3rd_last_part`        STRING,
  `url_host_4th_last_part`        STRING,
  `url_host_5th_last_part`        STRING,
  `url_host_registry_suffix`      STRING,
  `url_host_registered_domain`    STRING,
  `url_host_private_suffix`       STRING,
  `url_host_private_domain`       STRING,
  `url_protocol`                  STRING,
  `url_port`                      INT,
  `url_path`                      STRING,
  `url_query`                     STRING,
  `fetch_time`                    TIMESTAMP,
  `fetch_status`                  SMALLINT,
  `fetch_redirect`                STRING,
  `content_digest`                STRING,
  `content_mime_type`             STRING,
  `content_mime_detected`         STRING,
  `content_charset`               STRING,
  `content_languages`             STRING,
  `content_truncated`             STRING,
  `warc_filename`                 STRING,
  `warc_record_offset`            INT,
  `warc_record_length`            INT,
  `warc_segment`                  STRING)
PARTITIONED BY (
  `crawl`                         STRING,
  `subset`                        STRING)
STORED AS parquet
LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/';'''

# The table name is unqualified, so it is created in the context database.
database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)

Execution ID: 624e854f-26fb-4209-b474-c101a829e5e8


In [33]:
# Listing 7-10: table repair query

# Run a table repair on ccindex.
# NOTE(review): presumably this registers the crawl/subset partitions declared
# in the DDL so that partition filters return rows — confirm.
query = '''MSCK REPAIR TABLE ccindex;'''

database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)

Execution ID: 3e93d268-7b8e-4eab-a6f8-1fe9503684ca


In [37]:
# Listing 7-11: sample query on cc-index

# Sample lookup: up to 5 captures of the cnn.com home page (http or https) from
# the CC-MAIN-2020-24 crawl, 'warc' subset. The selected columns include the
# WARC file name plus record offset and length.
query = '''SELECT url_surtkey, url, warc_filename, warc_record_offset, warc_record_length,
content_mime_detected, fetch_status, content_languages
FROM domainranks2.ccindex
WHERE crawl = 'CC-MAIN-2020-24'
AND url LIKE 'http%://www.cnn.com/'
AND subset = 'warc'
AND url_host_registered_domain = 'cnn.com'
LIMIT 5;'''

database = 'domainranks2'
s3_output = 's3://athena-us-east-1-testing/query-folder2/'
execution_id = run_query(query, database, s3_output)

Execution ID: 22d233a9-a2e5-4543-83da-81314dd9d256


In [38]:
# Listing 7-11: sample query on cc-index


# Poll for the cc-index query results: up to 15 attempts, 10 s apart.
# NOTE(review): on timeout, the df_2.head() below shows whatever df_2 held
# before this cell ran.
import time
for i in range(15):
    time.sleep(10)

    try:
        return_json = get_raw_response(execution_id)
        t = results_to_df(return_json)
        df_2 = pd.DataFrame(t)
        print("query successful")
        break
    except Exception as e:

        # Printed and retried on the next iteration.
        print(e)
        
        pass
df_2.head()

query successful


Unnamed: 0,content_languages,content_mime_detected,fetch_status,url,url_surtkey,warc_filename,warc_record_length,warc_record_offset
0,eng,text/html,200,https://www.cnn.com/,"com,cnn)/",crawl-data/CC-MAIN-2020-24/segments/1590347387...,143892,693352534
1,eng,text/html,200,https://www.cnn.com/,"com,cnn)/",crawl-data/CC-MAIN-2020-24/segments/1590347387...,144002,676690049
2,eng,text/html,200,https://www.cnn.com/,"com,cnn)/",crawl-data/CC-MAIN-2020-24/segments/1590347388...,144422,697305859
3,eng,text/html,200,https://www.cnn.com/,"com,cnn)/",crawl-data/CC-MAIN-2020-24/segments/1590347388...,144466,647676538
4,eng,text/html,200,https://www.cnn.com/,"com,cnn)/",crawl-data/CC-MAIN-2020-24/segments/1590347389...,144118,680989734


In [4]:
# extruct page demo (not shown in text)
import requests
import pprint

# Download one Guardian article with a browser-like User-Agent and dump every
# metadata syntax that extruct extracts from it.
url = 'https://www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms'
my_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + ' (KHTML, like Gecko) Chrome/61.0.3163.100Safari/537.36'
}


r = requests.get(url, headers = my_headers)


# Kept as a notebook global; the later json-ld / opengraph cells read it.
html_response = r.text

# This import also provides `extruct` to the later listing cells.
import extruct
data = extruct.extract(r.text)
data_keys = data.keys()
# Print one separator, the syntax name and its extracted items per key.
for key in data_keys:
    print("*"*10)
    print(key)
    pprint.pprint(data[key])
    

**********
microdata
[{'type': 'http://schema.org/WebPage',
  'value': 'Skip to main content\n'
           'The Guardian - Back to home\n'
           'Support The Guardian\n'
           'Available for everyone, funded by readers\n'
           'Contribute Subscribe Contribute\n'
           'Search jobs\n'
           'Sign in My account\n'
           '\n'
           'Account overview\n'
           'Profile\n'
           'Emails & marketing\n'
           'Settings\n'
           'Help\n'
           'Comments & replies\n'
           'Sign out\n'
           '\n'
           'Search\n'
           '\n'
           'switch to the International edition\n'
           'switch to the UK edition\n'
           'switch to the US edition\n'
           'switch to the Australia edition\n'
           '\n'
           'current edition: International edition\n'
           '\n'
           'News\n'
           'Opinion\n'
           'Sport\n'
           'Culture\n'
           'Lifestyle\n'
           '\n'
       

In [24]:
# Listing 7-12: Microdata example
import requests

# Re-fetch the article and print the properties of one microdata item.
# NOTE(review): `extruct` is not imported in this cell; it relies on the import
# made in the earlier demo cell.
url = 'https://www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms'
my_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + ' (KHTML, like Gecko) Chrome/61.0.3163.100Safari/537.36'
}

r = requests.get(url, headers = my_headers)
html_response = r.text

data = extruct.extract(html_response, syntaxes = ['microdata'])
# Index 1 is hard-coded: the second microdata item on the page is the one
# whose properties are printed. This breaks if the page layout changes.
data_keys = data["microdata"][1]["properties"].keys()
for key in data_keys:
    print("*"*10)
    print(key)
    print(data["microdata"][1]["properties"][key])

**********
mainEntityOfPage
https://www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms
**********
publisher
{'type': 'https://schema.org/Organization', 'properties': {'name': 'The Guardian', 'logo': {'type': 'https://schema.org/ImageObject', 'properties': {'url': 'https://uploads.guim.co.uk/2018/01/31/TheGuardian_AMP.png', 'width': '190', 'height': '60'}}}}
**********
headline
Waitrose to launch charm offensive as Ocado switches to M&S
**********
description
Supermarket will launch thousands of new and revamped products aiming to retain online customers
**********
author
{'type': 'http://schema.org/Person', 'properties': {'sameAs': 'https://www.theguardian.com/profile/zoewood', 'name': 'Zoe Wood'}}
**********
datePublished
2020-02-10T06:00:27+0000
**********
dateModified
['2020-02-12T13:42:07+0000', '2020-02-12T13:42:07+0000']
**********
associatedMedia
{'type': 'http://schema.org/ImageObject', 'properties': {'representativeOfPage': 'tru

In [27]:
# Listing 7-13: json-ld example

data = extruct.extract(html_response, syntaxes = ['json-ld'])
print(data)

{'json-ld': [{'@context': 'http://schema.org',
   '@type': 'Organization',
   'logo': {'@type': 'ImageObject',
    'height': 60,
    'url': 'https://uploads.guim.co.uk/2018/01/31/TheGuardian_AMP.png',
    'width': 190},
   'name': 'The Guardian',
   'sameAs': ['https://www.facebook.com/theguardian',
    'https://twitter.com/guardian',
    'https://www.youtube.com/user/TheGuardian'],
   'url': 'http://www.theguardian.com/'},
  {'@context': 'http://schema.org',
   '@id': 'https://www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms',
   '@type': 'WebPage',
   'potentialAction': {'@type': 'ViewAction',
    'target': 'android-app://com.guardian/https/www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms'}}]}

In [28]:
# Listing 7-14: opengraph example

data = extruct.extract(html_response, syntaxes = ['opengraph'])
print(data)

{'opengraph': [{'namespace': {'article': 'http://ogp.me/ns/article#',
    'og': 'http://ogp.me/ns#'},
   'properties': [('og:url',
     'http://www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms'),
    ('article:author', 'https://www.theguardian.com/profile/zoewood'),
    ('og:image:height', '720'),
    ('og:description',
     'Supermarket will launch thousands of new and revamped products aiming to retain online customers'),
    ('og:image:width', '1200'),
    ('og:image',
     'https://i.guim.co.uk/img/media/65d537a07a3493f18eef074ac0910e6c768d5f2c/0_58_3500_2100/master/3500.jpg?width=1200&height=630&quality=85&auto=format&fit=crop&overlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc&enable=upscale&s=9719e60266c3af3c231324b6969a0c84'),
    ('article:publisher', 'https://www.facebook.com/theguardian'),
    ('og:type', 'article'),
    ('article:section', 'Business'),
    ('article:

In [30]:
# Listing 7-15: Extracting dates from url
import re
from dateutil.parser import parse as date_parser

def extract_dates(url):
    """Find a date-like fragment in ``url`` and parse it.

    Args:
        url: URL string to search.

    Returns:
        Whatever ``date_parser`` returns for the first fragment matching the
        strict date regex, or the string ``'None'`` (not the ``None`` object)
        when nothing matches, parsing fails, or the parsed value is null.
    """

    def parse_date_str(date_str):
        # Falsy input falls through to an implicit None, as before.
        if not date_str:
            return None
        try:
            parsed = date_parser(date_str)
            is_missing = pd.isnull(parsed) is True
        except (ValueError, OverflowError, AttributeError, TypeError):
            return 'None'
        return 'None' if is_missing else parsed

    # The match must be preceded by a non-word character.
    _STRICT_DATE_REGEX_PREFIX = r'(?<=\W)'
    DATE_REGEX = r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?'
    STRICT_DATE_REGEX = _STRICT_DATE_REGEX_PREFIX + DATE_REGEX

    date_match = re.search(STRICT_DATE_REGEX, url)
    if date_match is None:
        return 'None'
    return parse_date_str(date_match.group(0))
url = 'https://www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms'

print(extract_dates(url))

2020-02-10 00:00:00


In [10]:
#Listing 7-16
import json
from newspaper import Article
import numpy as np
import pandas as pd
import requests


url = 'https://www.theguardian.com/business/2020/feb/10/waitrose-to-launch-charm-offensive-as-ocado-switches-to-ms'

my_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + ' (KHTML, like Gecko) Chrome/61.0.3163.100Safari/537.36'
}


r = requests.get(url, headers = my_headers)


html_response = r.text

def newspaper_parse(html):
    """Parse article fields out of an HTML string with newspaper's ``Article``.

    Args:
        html: HTML source of the page.

    Returns:
        A tuple ``(article_title, json_authors, article_text,
        article_publish_date)``. ``json_authors`` is the authors list encoded
        as a JSON string. Parsing is best-effort: fields not reached before a
        failure stay ``None``.
    """
    article = Article('')
    article.set_html(html)
    # NOTE(review): download() is called with an empty URL after set_html();
    # confirm it is still required by the installed newspaper version.
    article.download()

    article_title = None
    json_authors = None
    article_text = None
    article_publish_date = None

    try:
        article.parse()

        json_authors = json.dumps(article.authors)

        article_title = article.title
        article_text = article.text
        article_publish_date = article.publish_date

    except Exception:
        # Best-effort by design, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare `except:` did.
        pass

    return article_title, json_authors, article_text, article_publish_date

article_title, json_authors, article_text, article_publish_date = newspaper_parse(html_response)

In [12]:
#Listing 7-16

print(article_title)
print("*"*10)
print(article_publish_date)
print("*"*10)
print(json_authors)
print("*"*10)
print(article_text)

Waitrose to launch charm offensive as Ocado switches to M&S
**********
2020-02-10 06:00:27+00:00
**********
["Zoe Wood"]
**********
Waitrose is to launch thousands of new and revamped products in the coming months as the battle for the hearts and minds of Ocado shoppers moves up a gear.

The supermarket’s deal with the online grocer will finish at the end of August, when it will be replaced by Marks & Spencer. The switchover is high risk for all the brands involved: Ocado risks losing loyal Waitrose shoppers while the supermarket, which is part of the John Lewis Partnership, will have to persuade shoppers to use its own website instead.

Last year, Ocado fired the opening salvo stating its product range would be bigger, cheaper and better quality under the M&S deal. The online grocer will stock 6,000 M&S products, compared with the 4,000 it sells as part of its supply deal with Waitrose. The alternatives would be the “same price or lower, and of the same quality or better” than the Wai

In [2]:
# Listing 7-17: scraping from muckrack's sitemap
import numpy as np
import pandas as pd
import requests

url = 'https://muckrack.com/sitemap.xml'
my_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + ' (KHTML, like Gecko) Chrome/61.0.3163.100Safari/537.36'
}

r = requests.get(url=url, headers = my_headers)


In [41]:
# Listing 7-17: scraping from muckrack's sitemap (cont.)

# Parse the sitemap index fetched in the previous cell. `soup` used to be
# created only in a later cell, so this cell failed on a fresh kernel.
from bs4 import BeautifulSoup

soup = BeautifulSoup(r.text, 'xml')
sitemaps = soup.find_all('sitemap')
for sitemap in sitemaps:
    print(sitemap.find('loc').get_text())
    try:
        # <lastmod> and <changefreq> are optional; find() returns None when a
        # tag is absent, which surfaces here as AttributeError.
        print(sitemap.find('lastmod').get_text())
        print(sitemap.find('changefreq').get_text())
    except AttributeError:
        pass


https://muckrack.com/sitemaps/sitemap-pages-1.xml
https://muckrack.com/sitemaps/sitemap-mrdaily-1.xml
2020-08-18T12:01:04-04:00
https://muckrack.com/sitemaps/sitemap-mrdaily-2.xml
2016-09-01T17:01:26-04:00
https://muckrack.com/sitemaps/sitemap-mrdaily-3.xml
2014-02-19T10:28:15-05:00
https://muckrack.com/sitemaps/sitemap-blog-1.xml
2020-08-18T06:00:00-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-1.xml
2020-08-18T08:57:12-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-2.xml
2020-08-18T17:54:52-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-3.xml
2020-08-18T20:35:23-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-4.xml
2020-08-18T14:29:40-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-5.xml
2020-08-18T19:14:41-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-6.xml
2020-08-18T08:09:16-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-7.xml
2020-08-18T22:29:03-04:00
https://muckrack.com/sitemaps/sitemap-media_outlets-8

In [26]:
# Listing 7-18: loading person and media outlet sitemaps into separate lists
from bs4 import BeautifulSoup
# Split every <loc> URL of the sitemap index into three lists by substring.
soup = BeautifulSoup(r.text, 'xml')
sitemap_other = []
sitemap_media = []
sitemap_persons = []

sitemaps = soup.find_all('loc')
for sitemap in sitemaps:
    # From here on `sitemap` is the URL text, not the tag.
    sitemap = sitemap.get_text()
    # 'media' is tested first, so a URL containing both words counts as media.
    if 'media' in sitemap:
        sitemap_media.append(sitemap)
    elif 'person' in sitemap:
        sitemap_persons.append(sitemap)
    else:
        sitemap_other.append(sitemap)
print(len(sitemap_media))
print(len(sitemap_persons))

12
67


In [None]:
# Listing  7-19: fetching muckrack profiles url from sitemap list
import time

import numpy as np
import pandas as pd

# Download every media-outlet sitemap and collect one
# {"url", "last_modified"} record per <url> entry.
temp_list = []
for sitemap_media_url in sitemap_media:
    time.sleep(5)  # pause between requests to the site
    r = requests.get(url = sitemap_media_url, headers = my_headers)
    soup = BeautifulSoup(r.text, 'xml')
    sitemaps = soup.find_all('url')
    for sitemap in sitemaps:
        # <lastmod> is optional: an explicit None check replaces the former
        # bare `except:`, which also hid unrelated errors.
        lastmod_tag = sitemap.find('lastmod')
        temp_list.append({
            'url': sitemap.find('loc').get_text(),
            'last_modified': lastmod_tag.get_text() if lastmod_tag is not None else '',
        })

df = pd.DataFrame(temp_list)
df.to_csv("muckrack_media_fetchlist.csv")
# Preview last so the notebook actually displays it.
df.head()

In [124]:
# Listing 7-20: Parsing media profiles from muckrack.com
import json
import random
import time

def _muckrack_text(soup, tag, css_class):
    """Return the text of the first ``tag`` whose class matches ``css_class``, or ''."""
    node = soup.find(tag, {'class': css_class})
    return node.get_text() if node is not None else ''


def _muckrack_href(soup, css_class):
    """Return the href of the first ``<a>`` whose class matches ``css_class``, or ''."""
    node = soup.find('a', {'class': css_class})
    if node is None:
        return ''
    try:
        return node['href']
    except KeyError:
        return ''


def parse_muckrack_media(sitemap_df, number_of_pages):
    """Scrape a random sample of muckrack media-outlet profile pages.

    Args:
        sitemap_df: DataFrame with a ``url`` column of profile URLs.
        number_of_pages: Number of pages to fetch. Rows are drawn with
            replacement, so the same URL can be fetched more than once.

    Returns:
        A list with one dict per fetched page. Each dict holds the profile
        URL, name, description, media type, website, social links, every
        ``<th>``/``<td>`` pair found in table rows, and a JSON-encoded list of
        journalists under ``"journalists"``. Missing fields are ``''``.
        Returns ``[]`` when ``sitemap_df`` is empty.

    Relies on the notebook globals ``requests``, ``BeautifulSoup`` and
    ``my_headers``. Sleeps 5 seconds before each request.
    """
    final_list = []
    n_rows = len(sitemap_df)
    if n_rows == 0:
        return final_list

    # randint is inclusive on both ends, hence n_rows - 1. The sample is sized
    # from the argument rather than the notebook-global `df`.
    random_int_list = [
        random.randint(0, n_rows - 1) for _ in range(number_of_pages)]

    # (output key, icon-specific part of the link's class attribute)
    social_links = (
        ("twitter", "twitter"),
        ("linkedin", "linkedin"),
        ("facebook", "facebook"),
        ("youtube", "youtube-play"),
        ("Pinterest", "pinterest"),
        ("Instagram", "instagram"),
    )

    while random_int_list:
        url_index = random_int_list.pop()
        url = sitemap_df.url.iloc[url_index]
        time.sleep(5)
        r = requests.get(url = url, headers = my_headers)
        soup = BeautifulSoup(r.text, 'html.parser')

        temp_dict = {}
        temp_dict["muckrack_profile_url"] = url
        temp_dict["source_name"] = _muckrack_text(
            soup, 'h1', "mr-font-family-2 top-none bottom-xs")
        # These lookups used set literals ({'class', '...'}), which bs4 happened
        # to treat as a class search; dicts state the intent explicitly.
        temp_dict["description"] = _muckrack_text(soup, 'div', 'top-xs')
        temp_dict["media_type"] = _muckrack_text(
            soup, 'div', 'mr-font-weight-semibold')
        temp_dict["url"] = _muckrack_text(soup, 'div', 'mr-contact-item-inner ')

        for key, icon in social_links:
            temp_dict[key] = _muckrack_href(
                soup,
                'mr-contact break-word top-xs js-icon-' + icon
                + ' mr-contact-icon-only')

        # Every table row with both a header and a data cell becomes a field.
        for tr in soup.find_all('tr'):
            tds = tr.find_all('td')
            th = tr.find_all('th')
            if th and tds:
                temp_dict[th[0].get_text().strip()] = tds[0].get_text().strip()

        jr_list = []
        for row in soup.find_all('div', {'class': 'mr-directory-item'}):
            # A directory item without a link no longer aborts the whole run.
            link = row.find('a')
            href = link.get('href') if link is not None else None
            jr_list.append({
                "name": row.get_text().strip(),
                "profile_url": 'https://muckrack.com' + href if href else '',
            })
        temp_dict["journalists"] = json.dumps(jr_list)
        final_list.append(temp_dict)
    return final_list

In [125]:
# Listing 7-20: Parsing media profiles from muckrack.com

# Scrape 5 randomly chosen profiles from the fetch list built above.
sample_list = parse_muckrack_media(df, 5)
# Build the frame before writing it: to_csv used to run first and raised
# NameError on a fresh kernel.
df_sample = pd.DataFrame(sample_list)
df_sample.to_csv("muckrack_media.csv", index = False)
df_sample.head()

Unnamed: 0,Country,Days Published,Frequency,Instagram,Language,Pinterest,Scope,UVM Insights by,description,facebook,journalists,linkedin,media_type,muckrack_profile_url,source_name,twitter,url,youtube
0,United Kingdom,,,https://instagram.com/vivamagazines,English,,Local,Request pricing,VIVA Magazine is a exciting addition to the wo...,https://www.facebook.com/vivamagazines/,"[{""name"": ""Bones, Lauren"", ""profile_url"": ""htt...",https://www.linkedin.com/company/viva-lifestyl...,Magazine,https://muckrack.com/media-outlet/vivamanchester,Viva Magazine,https://twitter.com/vivamagazines,http://vivamanchester.co.uk/,https://www.youtube.com/user/vivamagazines
1,Spain,"Mon, \nTue, \nWed, \nThu, \nFri, \nSat, \nSun",Daily,,Spanish,,Local,Request pricing,"The news must be told as always, on paper, and...",https://www.facebook.com/latribunacuenca/?ref=...,[],,Newspaper,https://muckrack.com/media-outlet/latribunadec...,La Tribuna de Cuenca,https://twitter.com/tribunadecuenca,http://latribunadecuenca.es,
2,Mexico,,,https://instagram.com/must_magazine,Spanish,,National,Request pricing,,https://www.facebook.com/MustTechStyle,[],,Magazine,https://muckrack.com/media-outlet/must,MUST Tech & Style,https://twitter.com/MUST_MAGAZINE,http://must.com.mx/,
3,United States of America,,Biyearly,https://www.instagram.com/cakeboymag/,English,,"International, \n \n ...",Request pricing,Cakeboy magazine is a bi-annual print publicat...,https://www.facebook.com/CakeboyMag/,[],,\n \n ...,https://muckrack.com/media-outlet/cakeboymag,Cakeboy Magazine,https://twitter.com/cakeboymag,http://cakeboymag.com,
4,United States of America,Thu,Weekly,,English,,Consumer,Request pricing,We are such big fans of the Fixer Upper TV Sho...,https://www.facebook.com/groups/FixerUpperPodc...,[],,\n \n ...,https://muckrack.com/media-outlet/FixerUpperPo...,The Fixer Upper Podcast,https://twitter.com/FixerUpperPod,http://FixerUpperPodcast.com,


In [14]:
# Listing 7-21: creating a SQS queue 

import boto3
import json
import sys
import time

def CreateQueue(topic_name):
    """Create an SQS queue with a timestamped name and return its identifiers.

    Args:
        topic_name: Prefix of the queue name. The current epoch time in
            milliseconds is appended to it.

    Returns:
        A dict with keys ``"sqsQueueArn"`` and ``"sqsQueueUrl"``.
    """
    sqs = boto3.client('sqs',  region_name = 'us-east-1')

    # The millisecond timestamp suffix gives each call a new queue name.
    suffix = str(int(round(time.time() * 1000)))
    queue_name = topic_name + suffix

    sqs.create_queue(QueueName=queue_name)
    queue_url = sqs.get_queue_url(QueueName=queue_name)['QueueUrl']
    attributes = sqs.get_queue_attributes(
        QueueUrl=queue_url, AttributeNames=['QueueArn'])['Attributes']

    return {"sqsQueueArn": attributes['QueueArn'], "sqsQueueUrl": queue_url}

response_dict = CreateQueue("cc-news-daily")

In [15]:
response_dict

{'sqsQueueArn': 'arn:aws:sqs:us-east-1:896493407642:cc-news-daily1597659958131',
 'sqsQueueUrl': 'https://queue.amazonaws.com/896493407642/cc-news-daily1597659958131'}

In [1]:
# Listing 7-22: fetching json through cc-index api

import urllib

def get_index_url(query_url, crawl_id='CC-MAIN-2020-16'):
    """Build a Common Crawl index API URL that returns JSON lines.

    Args:
        query_url: URL pattern to look up, e.g. ``'theguardian.com/*'``. It is
            percent-encoded with ``quote_plus``.
        crawl_id: Crawl whose index is queried. Defaults to the crawl that was
            previously hard-coded.

    Returns:
        The full index URL ending in ``&output=json``.
    """
    # A bare `import urllib` does not guarantee the `parse` submodule is
    # loaded, so import the function explicitly.
    from urllib.parse import quote_plus

    query = quote_plus(query_url)
    base_url = 'https://index.commoncrawl.org/' + crawl_id + '-index?url='
    return base_url + query + '&output=json'
query_url = 'theguardian.com/*'
index_url = get_index_url(query_url)

import re
import time
import gzip
import json
import requests
try:
    from io import BytesIO
except:
    from StringIO import StringIO
def get_index_json(index_url):
    """Download index records from ``index_url`` and keep those with status 200.

    The URL is requested up to 4 times. The first response with HTTP 200 is
    parsed as one JSON document per line and no further request is made.

    Args:
        index_url: Index API URL, e.g. as built by ``get_index_url``.

    Returns:
        A list of the decoded JSON objects whose ``'status'`` field equals the
        string ``'200'``. Empty if no attempt returned HTTP 200. Blank or
        malformed lines are skipped instead of aborting the whole call.
    """
    pages_list = []

    for _attempt in range(4):
        resp = requests.get(index_url)
        print(resp.status_code)

        # Short pause after every request, as before.
        time.sleep(0.2)

        if resp.status_code != 200:
            continue

        for line in resp.content.strip().decode().split('\n'):
            if not line.strip():
                # An empty body yields a single '' entry; nothing to decode.
                continue
            try:
                page = json.loads(line)
            except ValueError:
                # json.JSONDecodeError subclasses ValueError.
                continue
            if isinstance(page, dict) and page.get('status') == '200':
                pages_list.append(page)

        break
    return pages_list


index_json = get_index_json(index_url)
print(len(index_json))

200
7107


In [17]:
index_json[:2]

[{'charset': 'UTF-8',
  'digest': 'QBTLGLXQTDGLQLVM7A4LU3DTRFG3R2VI',
  'filename': 'crawl-data/CC-MAIN-2020-16/segments/1585370506673.7/warc/CC-MAIN-20200402045741-20200402075741-00316.warc.gz',
  'languages': 'eng',
  'length': '76786',
  'mime': 'text/html',
  'mime-detected': 'text/html',
  'offset': '1172149968',
  'status': '200',
  'timestamp': '20200402071759',
  'url': 'https://www.theguardian.com/about',
  'urlkey': 'com,theguardian)/about'},
 {'charset': 'UTF-8',
  'digest': 'NLFT46TQ2MZ6ATH2P7I4YG6PPKAU4YST',
  'filename': 'crawl-data/CC-MAIN-2020-16/segments/1585371700247.99/warc/CC-MAIN-20200407085717-20200407120217-00485.warc.gz',
  'languages': 'eng',
  'length': '105122',
  'mime': 'text/html',
  'mime-detected': 'text/html',
  'offset': '1173563705',
  'status': '200',
  'timestamp': '20200407112300',
  'url': 'https://www.theguardian.com/activate/video-interview-with-rose-shuman-founder-question-box',
  'urlkey': 'com,theguardian)/activate/video-interview-with-rose-s

In [None]:
# loading the SQS queue

In [13]:
# Listing 7-23: loading messages on sqs

response_dict = {'sqsQueueArn': 'arn:aws:sqs:us-east-1:896493407642:cc-news-daily1597659958131',
 'sqsQueueUrl': 'https://queue.amazonaws.com/896493407642/cc-news-daily1597659958131'}

import boto3
import json
from datetime import datetime
def myconverter(o):
    """``default`` hook for ``json.dumps`` that renders datetimes as strings.

    Parameters
    ----------
    o : object
        A value the JSON encoder could not serialise on its own.

    Returns
    -------
    str
        ``str(o)`` when ``o`` is a ``datetime``.

    Raises
    ------
    TypeError
        For any other type. Returning ``None`` here would make ``json.dumps``
        silently write ``null`` in place of the unsupported value.
    """
    if isinstance(o, datetime):
        return str(o)
    raise TypeError(
        'Object of type {} is not JSON serializable'.format(type(o).__name__)
    )
# Push the first five index records onto the SQS queue, one message per record.
queue_url = response_dict["sqsQueueUrl"]
sqs = boto3.client('sqs', region_name='us-east-1')

for line in index_json[:5]:
    # serialise the record; myconverter is the fallback for values json cannot encode itself
    payload = json.dumps(line, default=myconverter)
    response = sqs.send_message(QueueUrl=queue_url, DelaySeconds=10,
                                MessageAttributes={}, MessageBody=payload)

    # echo the id returned for the message that was just sent
    print(response['MessageId'])

654633cc-a901-42a5-a42c-4700ef6091ce
9aafeb1b-c3ef-4bd8-982d-601cdf7c6f67
33cef889-4622-4b42-b7aa-d7503f558665
38a2c47e-b328-4edc-b860-47b898c9b1ef
57f5594f-b638-4f4e-a427-179c9d50e948


In [14]:
# Listing 7-24: Worker node script download and parse from S3 bucket
import json
from newspaper import Article
import numpy as np
import pandas as pd

def newspaper_parse(html):
    """Extract title, authors, body text and publish date from an HTML document.

    Parameters
    ----------
    html : str
        Markup handed to ``Article.set_html``.

    Returns
    -------
    tuple
        ``(article_title, json_authors, article_text, article_publish_date)``.
        ``json_authors`` is the author list encoded as a JSON string. If parsing
        raises, every field not yet extracted at that point stays ``None``.
    """
    article = Article('')
    article.set_html(html)
    # NOTE(review): download() is called on an Article created with an empty URL,
    # after the markup was already supplied through set_html(). Kept to preserve
    # the existing behaviour -- confirm whether this call is needed at all.
    article.download()

    article_title = None
    json_authors = None
    article_text = None
    article_publish_date = None

    try:
        article.parse()

        json_authors = json.dumps(article.authors)

        article_title = article.title
        article_text = article.text
        article_publish_date = article.publish_date

    except Exception:
        # Best effort: a page that cannot be parsed yields None fields instead of
        # stopping the worker. Only Exception is caught so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still get through.
        pass

    return article_title, json_authors, article_text, article_publish_date

def get_html_from_cc_index(page, timeout=60):
    """Download one capture described by an index record and parse it.

    The byte range ``offset .. offset + length - 1`` of ``page['filename']`` is
    requested, gunzipped, split on the first two blank lines (``\\r\\n\\r\\n``)
    into three parts, and the third part is passed to ``newspaper_parse``.

    Parameters
    ----------
    page : dict
        Index record; the keys ``'offset'``, ``'length'``, ``'filename'`` and
        ``'url'`` are read.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` so that a stalled
        connection cannot block the worker forever (default 60).

    Returns
    -------
    dict
        On success: ``article_title``, ``article_text``,
        ``article_publish_date``, ``url`` and ``author``. ``author`` is ``''``
        when no author was found and the last listed author otherwise (a single
        dict is returned, so only one author can be carried).
        ``{}`` when the download or decompression fails; a partially filled dict
        when a later step fails.

    Raises
    ------
    KeyError, ValueError
        If ``page`` lacks ``'offset'``/``'length'`` or they are not integers.
    """
    offset, length = int(page['offset']), int(page['length'])
    offset_end = offset + length - 1
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    temp_dict = {}

    try:
        resp2 = requests.get(
            prefix + page['filename'],
            headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
            timeout=timeout,
        )
        with gzip.GzipFile(fileobj=BytesIO(resp2.content)) as f:
            data = f.read()
    except Exception:
        print('some error in connection?')
        # nothing was downloaded, so there is nothing to parse
        return temp_dict

    try:
        warc, header, response = data.strip().decode().split('\r\n\r\n', 2)
        (temp_dict["article_title"],
         authors_list,
         temp_dict["article_text"],
         temp_dict["article_publish_date"]) = newspaper_parse(response)
        temp_dict["url"] = page["url"]

        # newspaper_parse returns None for the authors when it failed; json.loads
        # then raises and the record is returned without an 'author' key
        authors_list = json.loads(authors_list)
        temp_dict["author"] = authors_list[-1] if authors_list else ''

    except Exception as e:
        print(e)

    return temp_dict


In [15]:
# Listing 7-25: Iterating through the SQS queue

import pandas as pd
import numpy as np
import os
import uuid

def upload_to_s3(final_list, S3_bucket_name, max_attempts=5):
    """Write records to a uniquely named CSV file and upload it to S3.

    Parameters
    ----------
    final_list : list of dict
        Records to store; turned into a DataFrame and written without the index.
    S3_bucket_name : str
        Destination bucket. The object key is the generated file name.
    max_attempts : int, optional
        How many times the upload is tried before giving up (default 5).

    Returns
    -------
    bool
        ``True`` when the file was uploaded (the local copy is then removed).
        ``False`` when ``final_list`` is empty (nothing is written or uploaded)
        or when every attempt failed; in the latter case the local CSV is kept
        so the batch is not lost.
    """
    if not final_list:
        # avoid creating an empty object in the bucket
        return False

    local_filename = str(uuid.uuid4()) + '.csv'

    df = pd.DataFrame(final_list)
    df.to_csv(local_filename, index = False)

    s3 = boto3.client('s3',region_name = 'us-east-1')

    for attempt in range(1, max_attempts + 1):
        try:
            s3.upload_file(local_filename, S3_bucket_name, local_filename)

        except Exception as e:
            print(str(e))
        else:
            print("finished uploading to s3 in attempt ", attempt)
            # the local copy is only disposable once S3 has the data
            os.remove(local_filename)
            return True

    print("giving up on s3 upload; records kept in local file " + local_filename)
    return False

# Drain the SQS queue: parse the page behind every message, upload the results
# to S3 in batches of 1000 records, and flush the remainder when the loop ends.
final_list = []

# one client serves the whole run; it does not need rebuilding per message
sqs = boto3.client('sqs',  region_name = 'us-east-1')

while True:

    try:
        sqsResponse = sqs.receive_message(QueueUrl=response_dict['sqsQueueUrl'], MessageAttributeNames=['All'],
                                          MaxNumberOfMessages=1, WaitTimeSeconds = 10)

        # no "Messages" entry in the response means nothing was received
        # within the wait time
        messages = sqsResponse.get("Messages", [])
        if not messages:
            print('no more messages to fetch')
            break

        page = json.loads(messages[0]["Body"])
        final_list.append(get_html_from_cc_index(page))

        # delete only after the page has been processed, so a worker that dies
        # mid-download does not take the message with it
        receipt_handle = messages[0]["ReceiptHandle"]
        sqs.delete_message(QueueUrl=response_dict['sqsQueueUrl'], ReceiptHandle=receipt_handle)

        if len(final_list) == 1000:
            upload_to_s3(final_list, 'ec2-testing-for-s3-permissions')
            final_list = []

    except Exception as E:
        # report the real cause instead of claiming the queue is empty
        print('stopping after unexpected error: ' + str(E))
        break

# flush whatever has been collected since the last full batch
upload_to_s3(final_list, 'ec2-testing-for-s3-permissions')

no more messages to fetch
finished uploading to s3 in attempt  1


In [16]:
# Listing 7-26: Parsed content from theguardian.com
# final_list only holds the records gathered since the last 1000-record
# flush in the loop above, so this frame shows the final (partial) batch.
df_responses = pd.DataFrame(final_list)
df_responses.head()

Unnamed: 0,article_publish_date,article_text,article_title,author,url
0,NaT,Katharine Viner is editor-in-chief of the Guar...,About the Guardian,,https://www.theguardian.com/about
1,2010-06-15 09:39:34+00:00,What term do you want to search? Search with g...,"Video interview with Rose Shuman, founder, Ope...",,https://www.theguardian.com/activate/video-int...
2,2011-05-27 09:24:00+00:00,What term do you want to search? Search with g...,Activate New York: Rose Shuman - video,,https://www.theguardian.com/activate/video/act...
3,NaT,"US Climate Alliance, Climate Mayors, We Are St...",Advertiser content hosted by the Guardian: The...,,https://www.theguardian.com/advertiser-content...
4,2010-06-15 09:39:34+00:00,What term do you want to search? Search with g...,"Video interview with Rose Shuman, founder, Ope...",,https://www.theguardian.com/activate/video-int...


In [None]:
# Listing 7-27
# save it in a file named multiprocessing_testing_main.py

from multiprocessing import Pool                                                
import multiprocessing       
import os
def run_process(process):
    """Run a Python script in a child interpreter and wait for it to finish.

    Parameters
    ----------
    process : str
        Path of the script, optionally followed by command-line arguments
        (split with shell-like quoting rules).

    Returns
    -------
    int
        Exit status of the child process.
    """
    # imported locally because the script's import block does not provide them
    import shlex
    import subprocess
    import sys

    # sys.executable runs the child with the same interpreter as the parent
    # rather than whatever `python` resolves to on PATH, and the argument list
    # keeps the string away from a shell.
    return subprocess.call([sys.executable] + shlex.split(process))
if __name__ == '__main__':
    # Launch one copy of the worker script per CPU core and wait for all of
    # them to finish.

    sample_file_path = 'multiprocessing_testing_processes.py'

    pool_count = multiprocessing.cpu_count()
    print("cpu pool count is " + " " + str(pool_count))

    processes = tuple([str(sample_file_path)] * pool_count)

    pool = Pool(pool_count)
    try:
        # map() blocks until every child script has exited
        pool.map(run_process, processes)
    finally:
        # release the pool's worker processes even if map() raised
        pool.close()
        pool.join()
    # add code here to shut off EC2


cpu pool count is  8


In [128]:
# fetching person sitemaps (not shown in text)
import time
# Collect the <loc> and <lastmod> values of every <url> entry in each person
# sitemap into temp_list, one dict per entry.
temp_list = []
for sitemap_person_url in sitemap_persons:
    # pause before every request
    time.sleep(5)
    r = requests.get(url = sitemap_person_url, headers = my_headers)
    soup = BeautifulSoup(r.text, 'xml')
    sitemaps = soup.find_all('url')
    for sitemap in sitemaps:

        temp_dict = {}
        temp_dict['url'] = sitemap.find('loc').get_text()

        # <lastmod> may be absent: store an empty string in that case, without
        # a bare except that would also hide unrelated errors
        lastmod_tag = sitemap.find('lastmod')
        if lastmod_tag is not None:
            last_modified = lastmod_tag.get_text()
        else:
            last_modified = ''

        temp_dict["last_modified"] = last_modified
        temp_list.append(temp_dict)
        
        


In [129]:
# fetching person sitemaps (not shown in text)

import pandas as pd
import numpy as np

# one row per sitemap <url> entry, with the 'url' and 'last_modified' values
# collected into temp_list
df = pd.DataFrame(temp_list)
df.head()


Unnamed: 0,last_modified,url
0,2020-01-25,https://muckrack.com/__jackguy
1,2020-05-08,https://muckrack.com/__jbernstein
2,2019-04-18,https://muckrack.com/__jessclark
3,2020-08-12,https://muckrack.com/__jevans__
4,2020-05-28,https://muckrack.com/__katieoconnor


In [131]:
# number of rows (sitemap URL entries) in df (not shown in text)

len(df)

231732

In [130]:
# saving the fetched person URLs to a CSV file (not shown in text)
# note: with to_csv's defaults the DataFrame index is written as the first, unnamed column

df.to_csv("muckrack_persons_fetchlist.csv")