In [1]:
# Listing 6-1: Fetching file paths for warc files
import requests
import gzip
url = 'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-16/warc.paths.gz'
from io import BytesIO

r = requests.get(url)
compressed_file = BytesIO(r.content)
f = gzip.GzipFile(fileobj=compressed_file)
print(f.read(326).decode("utf-8"))


crawl-data/CC-MAIN-2020-16/segments/1585370490497.6/warc/CC-MAIN-20200328074047-20200328104047-00000.warc.gz
crawl-data/CC-MAIN-2020-16/segments/1585370490497.6/warc/CC-MAIN-20200328074047-20200328104047-00001.warc.gz
crawl-data/CC-MAIN-2020-16/segments/1585370490497.6/warc/CC-MAIN-20200328074047-20200328104047-00002.warc.gz


In [2]:
# Listing 6-2: Downloading warc file
warc_path =  'crawl-data/CC-MAIN-2020-16/segments/1585370490497.6/warc/CC-MAIN-20200328074047-20200328104047-00455.warc.gz'
file_name = ‘YOUR_LOCAL_FILEPATH.warc.gz’
import boto3

from botocore.handlers import disable_signing
resource = boto3.resource('s3')
resource.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)

bucket = resource.Bucket('commoncrawl')

resource.meta.client.download_file('commoncrawl', warc_path, file_name)

In [3]:
# Listing 6-2: Downloading warc file (Cont.)


from time import time

import warc

def process_warc(file_name, limit=10000):
    warc_file = warc.open(file_name, 'rb')
    t0 = time()
    n_documents = 0

    url_list = []
    header_list = []
    html_content = []
    
    for i, record in enumerate(warc_file):
        
        if n_documents >= limit:
        
            break
        
        url = record.url
        payload = record.payload.read()
        
        try:
            header, html = payload.split(b'\r\n\r\n', maxsplit=1)
            html = html.strip()
        except:
            
            continue

        if url is None or payload is None or html == b'':

            continue

        else:
            try:
            
                html_content.append(html)
                header_list.append(header)
                url_list.append(url)
            except Exception as e:
                #print(e)
                continue
            
        n_documents += 1
        
    warc_file.close()
    print('Parsing took %s seconds and went through %s documents' % (time() - t0, n_documents))
    return header_list, html_content, url_list

In [4]:
# Listing 6-2: Downloading warc file (Cont.)

file_name = ‘YOUR_LOCAL_FILEPATH.warc.gz’
header_list, html_content, url_list = process_warc(file_name, limit = 1000000)

Parsing took 45.039262771606445 seconds and went through 54262 documents


In [16]:
# Listing 6-3:


print(url_list[867])
print('*'*10)
print(header_list[867].decode('utf-8'))
print('*'*10)
print(html_content[867].decode('utf-8'))


http://archive.griffith.ox.ac.uk/index.php/informationobject/browse?view=card&languages=en&creators=393&mediatypes=136&sort=referenceCode&sf_culture=en&levels=223&topLod=0&limit=30&sortDir=asc
**********
HTTP/1.1 200 OK
Server: nginx/1.14.0 (Ubuntu)
Date: Sat, 28 Mar 2020 10:01:19 GMT
Content-Type: text/html; charset=utf-8
X-Crawler-Transfer-Encoding: chunked
Connection: keep-alive
Set-Cookie: symfony=jjecpro8lfekf6nm09hj7qc5eb; path=/; HttpOnly
Expires: Thu, 19 Nov 1981 08:52:00 GMT
Cache-Control: no-store, no-cache, must-revalidate
Pragma: no-cache
X-Ua-Compatible: IE=edge,chrome=1
X-Crawler-Content-Encoding: gzip
Content-Length: 31469
**********
<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="X-Ua-Compatible" content="IE=edge,chrome=1" />
    <meta name="title" content="Griffith Institute Archive" />
<meta name="description" content="Access to memory - Open information managem

In [5]:
# not shown in text
# just verifying it as a sanity check on whether we have the right warc file or not
query_url = 'http://archive.griffith.ox.ac.uk/index.php/informationobject/browse?view=card&languages=en&creators=393&mediatypes=136&sort=referenceCode&sf_culture=en&levels=223&topLod=0&limit=30&sortDir=asc'

url_list.index(query_url)

867

In [1]:
#Listing 6-4: parameterizing url for searching common crawl index
import urllib

def get_index_url(query_url):

    query = urllib.parse.quote_plus(query_url)
    base_url = 'https://index.commoncrawl.org/CC-MAIN-2020-16-index?url='
    index_url = base_url + query + '&output=json'
    return index_url
query_url = 'http://archive.griffith.ox.ac.uk/index.php/informationobject/browse?view=card&languages=en&creators=393&mediatypes=136&sort=referenceCode&sf_culture=en&levels=223&topLod=0&limit=30&sortDir=asc'
index_url = get_index_url(query_url)
print(index_url)

https://index.commoncrawl.org/CC-MAIN-2020-16-index?url=http%3A%2F%2Farchive.griffith.ox.ac.uk%2Findex.php%2Finformationobject%2Fbrowse%3Fview%3Dcard%26languages%3Den%26creators%3D393%26mediatypes%3D136%26sort%3DreferenceCode%26sf_culture%3Den%26levels%3D223%26topLod%3D0%26limit%3D30%26sortDir%3Dasc&output=json


In [5]:
# Listing 6-5: Getting results from common crawl index

import re
import time
import gzip
import json
import requests
try:
    from io import BytesIO
except:
    from StringIO import StringIO
def get_index_json(index_url):
    
    payload_content = None
    
    for i in range(4):
        resp = requests.get(index_url)
        print(resp.status_code)

        time.sleep(0.2)

        if resp.status_code == 200:
            
            for x in resp.content.strip().decode().split('\n'):
                payload_content = json.loads(x)
            
            
            break
    return payload_content


index_json = get_index_json(index_url)

200


In [6]:
print(index_json)

{'urlkey': 'uk,ac,ox,griffith,archive)/index.php/informationobject/browse?creators=393&languages=en&levels=223&limit=30&mediatypes=136&sf_culture=en&sort=referencecode&sortdir=asc&toplod=0&view=card', 'timestamp': '20200328100119', 'status': '200', 'url': 'http://archive.griffith.ox.ac.uk/index.php/informationobject/browse?view=card&languages=en&creators=393&mediatypes=136&sort=referenceCode&sf_culture=en&levels=223&topLod=0&limit=30&sortDir=asc', 'mime': 'text/html', 'digest': 'LLZBM2KWPSEKOAK23C4J2V2FK5NLXNUC', 'charset': 'UTF-8', 'offset': '14692801', 'filename': 'crawl-data/CC-MAIN-2020-16/segments/1585370490497.6/warc/CC-MAIN-20200328074047-20200328104047-00455.warc.gz', 'length': '6409', 'mime-detected': 'text/html', 'languages': 'eng'}


In [27]:
# Listing 6-6: getting webpage data from S3 bucket.

def get_from_index(page):
    
    offset, length = int(page['offset']), int(page['length'])
    offset_end = offset + length - 1
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    
    try:

        r = requests.get(prefix + page['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
        raw_data = BytesIO(r.content)
        f = gzip.GzipFile(fileobj=raw_data)
        data = f.read()
      
    except:

        print('some error in connection?')

    try:
        crawl_metadata, header, response = data.strip().decode('utf-8').split('\r\n\r\n', 2)
    except Exception as e:
        pass
        print(e)
    
    return crawl_metadata, header, response


In [28]:
crawl_metadata, header, response = get_from_index(index_json)

In [29]:
# not shown in text
print(crawl_metadata)

WARC/1.0
WARC-Type: response
WARC-Date: 2020-03-28T10:01:19Z
WARC-Record-ID: <urn:uuid:0d242641-21b2-4d27-bc50-ff57c1f59fc7>
Content-Length: 31926
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:577bc174-4bca-447e-84d9-6df1d660ea78>
WARC-Concurrent-To: <urn:uuid:035d4834-977d-48a2-8bdb-b3c4c89a3142>
WARC-IP-Address: 163.1.185.104
WARC-Target-URI: http://archive.griffith.ox.ac.uk/index.php/informationobject/browse?view=card&languages=en&creators=393&mediatypes=136&sort=referenceCode&sf_culture=en&levels=223&topLod=0&limit=30&sortDir=asc
WARC-Payload-Digest: sha1:LLZBM2KWPSEKOAK23C4J2V2FK5NLXNUC
WARC-Block-Digest: sha1:UTKG66E2CW3NIZTTDCHZXFZNMAVRCHNN
WARC-Identified-Payload-Type: text/html


In [32]:
# listing 6-7: header and html 
print(header)
print('*'*10)
print(response)

HTTP/1.1 200 OK
Server: nginx/1.14.0 (Ubuntu)
Date: Sat, 28 Mar 2020 10:01:19 GMT
Content-Type: text/html; charset=utf-8
X-Crawler-Transfer-Encoding: chunked
Connection: keep-alive
Set-Cookie: symfony=jjecpro8lfekf6nm09hj7qc5eb; path=/; HttpOnly
Expires: Thu, 19 Nov 1981 08:52:00 GMT
Cache-Control: no-store, no-cache, must-revalidate
Pragma: no-cache
X-Ua-Compatible: IE=edge,chrome=1
X-Crawler-Content-Encoding: gzip
Content-Length: 31469
**********
<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="X-Ua-Compatible" content="IE=edge,chrome=1" />
    <meta name="title" content="Griffith Institute Archive" />
<meta name="description" content="Access to memory - Open information management toolkit" />
<meta name="viewport" content="initial-scale=1.0, user-scalable=no" />
    <title>Griffith Institute Archive</title>
    <link rel="shortcut icon" href="/favicon.ico"/>
    <link href="/pl

In [27]:
# Listing 6-8: fulltext of the webpage
from bs4 import BeautifulSoup
import re
def preprocessor_final(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', ' ', text)
        text = re.sub('[\W]+', ' ', text.lower())
        return text
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            return_list.append(temp_text)
        return(return_list)
    
soup = BeautifulSoup(response,'html.parser')
for script in soup(["script","style"]): 
        script.extract()
print(preprocessor_final(soup.get_text()).replace('\n', ' '))


 griffith institute archive griffith institute archive log in have an account email password log in quick links quick links home about help global search replace privacy policy language language english français español nederlands português deutsch čeština clipboard clipboard clear all selectionsgo to clipboardload clipboardsave clipboard browse browse collectionspeople and organisationsplacessubjectsdigital objects search search advanced search filters narrow your results by language unique records 1 results 1 english 1 results 1 creator all gardiner sir alan henderson 1 results 1 level of description all collection 1 results 1 media type all image 1 results 1 showing 1 results archival description gardiner sir alan henderson collection image english advanced search options find results with and or not in any field title archival history scope and content extent and medium subject access points name access points place access points genre access points identifier reference code digita

# WET file

In [36]:
import re
def preprocessor_final(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', ' ', text)
        text = re.sub('[\W]+', ' ', text.lower())
        return text
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            return_list.append(temp_text)
        return(return_list)
    

In [37]:
# Listing 6-9: processing WET files
from time import time
import warc

file_name = ‘YOUR_LOCAL_FILEPATH.warc.gz’
wet_path = 'crawl-data/CC-MAIN-2020-16/segments/1585370490497.6/wet/CC-MAIN-20200328074047-20200328104047-00455.warc.wet.gz'
import boto3

from botocore.handlers import disable_signing
resource = boto3.resource('s3')
resource.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)

bucket = resource.Bucket('commoncrawl')

resource.meta.client.download_file('commoncrawl', wet_path, file_name)

def process_wet(file_name, limit=100):
    warc_file = warc.open(file_name, 'rb')
    t0 = time()
    n_documents = 0

    url_list = []
    #header_list = []
    html_content = []
    
    for i, record in enumerate(warc_file):
        
        
        url = record.url
        payload = record.payload.read()

        if url is None or payload is None or payload == b'':

            continue

        else:
            try:
                
                html_content.append(preprocessor_final(payload.decode('utf-8')))
                url_list.append(url)
                
            except Exception as e:
                #print(e)
                continue
            
            
        n_documents += 1
        
    warc_file.close()
    print('Parsing took %s seconds and went through %s documents' % (time() - t0, n_documents))
    return html_content, url_list

In [38]:
file_name = ‘YOUR_LOCAL_FILEPATH.warc.gz’
html_content, url_list = process_wet(file_name, limit = 10000000)


Parsing took 44.381158113479614 seconds and went through 53271 documents


In [39]:
#Listing 6-10: printing WET record
index_no = url_list.index(query_url)

In [41]:
print(html_content[index_no])

griffith institute archive griffith institute archive log in have an account email password log in quick links quick links home about help global search replace privacy policy language language english français español nederlands português deutsch čeština clipboard clipboard clear all selections go to clipboard load clipboard save clipboard browse browse collections people and organisations places subjects digital objects search search advanced search filters narrow your results by language unique records 1 results 1 english 1 results 1 creator all gardiner sir alan henderson 1 results 1 level of description all collection 1 results 1 media type all image 1 results 1 showing 1 results archival description gardiner sir alan henderson collection image english advanced search options find results with and or not in any field title archival history scope and content extent and medium subject access points name access points place access points genre access points identifier reference code 

In [10]:
# Listing 6-11: processing WET files.

from time import time
import cld2
import pandas as pd
import warc

def process_wet_with_processing(file_name, limit=100):
    warc_file = warc.open(file_name, 'rb')
    t0 = time()
    n_documents = 0

    url_list = []
    #header_list = []
    html_content = []
    
    for i, record in enumerate(warc_file):
        
        
        url = record.url
        payload = record.payload.read()

        if url is None or payload is None or payload == b'':

            continue

        else:
            try:
                
                isReliable, textBytesFound, details = cld2.detect(payload.decode('utf-8'))

                lang1 = details[0][1]
                lang1_per = details[0][2]

                lang2 = details[1][1]
                lang2_per = details[1][2]

                if lang1 == 'en' and lang1_per > 98 and lang2 == 'un' and len(str(payload).split(" ")) > 100:

                    
                    html_content.append(preprocessor_final(payload.decode('utf-8')))
                    url_list.append(url)
                
            except Exception as e:
                #print(e)
                continue
            
            
        n_documents += 1
        
    warc_file.close()
    print('Parsing took %s seconds and went through %s documents' % (time() - t0, n_documents))
    return html_content, url_list

In [11]:
file_name = ‘YOUR_LOCAL_FILEPATH’
html_content, url_list = process_wet_with_processing(file_name, limit = 10000000)

Parsing took 35.51691484451294 seconds and went through 52442 documents


In [12]:
url_list.index(query_url)

134

In [28]:
html_content[134]

'griffith institute archive griffith institute archive log in have an account email password log in quick links quick links home about help global search replace privacy policy language language english français español nederlands português deutsch čeština clipboard clipboard clear all selections go to clipboard load clipboard save clipboard browse browse collections people and organisations places subjects digital objects search search advanced search filters narrow your results by language unique records 1 results 1 english 1 results 1 creator all gardiner sir alan henderson 1 results 1 level of description all collection 1 results 1 media type all image 1 results 1 showing 1 results archival description gardiner sir alan henderson collection image english advanced search options find results with and or not in any field title archival history scope and content extent and medium subject access points name access points place access points genre access points identifier reference code

In [29]:
# Listing 6-11: processing WET files (cont.)

import pandas as pd
import numpy as np

df = pd.DataFrame({"full_text":html_content, "url":url_list})
df.head()

Unnamed: 0,full_text,url
0,close up characters 94 game answers for 100 es...,http://100escaperswalkthrough.com/category/clo...
1,105 pymble house the most beautiful asian godd...,http://105pymblehouse.com.au/profiles.php?l=131
2,5 1080p lcd hdtv free shipping 5 1080p lcd hd...,http://1080plcdhdtvfreeshipping.blogspot.com/
3,presidential maroons 12 apostrophes digression...,http://12apostrophes.net/presidential-maroons/
4,link url rotator service promote all your prog...,http://1linkurl.com/


In [30]:
# Listing 6-12: vectorizing text


from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_transformer = TfidfVectorizer(stop_words='english', 
                                   ngram_range=(1, 2), lowercase=True, max_features=20000)

X_train_text = tfidf_transformer.fit_transform(df["full_text"])
df_dtm = pd.DataFrame(X_train_text.toarray(), columns=tfidf_transformer.get_feature_names())
df_dtm.head()

Unnamed: 0,00,00 00,00 01,00 04,00 07,00 08,00 09,00 10,00 11,00 12,...,zombie,zombies,zone,zone pass,zone puck,zones,zoning,zoo,zoom,zte
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026344,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045599,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Listing 6-13: applying nearest neighbor algorithm

from sklearn.neighbors import NearestNeighbors

NN= NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='cosine', p=2, metric_params=None, n_jobs=None)
NN.fit(X_train_text)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [45]:
# Listing 6-14: calculating nearest neighbors

neigh_list = NN.kneighbors(df_dtm.iloc[134].values.reshape(1, -1),n_neighbors=5, return_distance=True)
print(neigh_list)

(array([[0.        , 0.15182213, 0.16689516, 0.2082976 , 0.21517839]]), array([[ 134, 4195,  133, 4125, 1631]], dtype=int64))


In [57]:
neigh_list[0][0].tolist()

[0.0,
 0.15182212989807664,
 0.1668951605549598,
 0.20829760331780534,
 0.215178387815946]

In [59]:
# Listing 6-15: loading neighbors into a dataframe
neigh_df = pd.DataFrame({"url_index":neigh_list[1][0].tolist(), "cosine_dist":neigh_list[0][0].tolist()})
neigh_df.head()

Unnamed: 0,cosine_dist,url_index
0,0.0,134
1,0.151822,4195
2,0.166895,133
3,0.208298,4125
4,0.215178,1631


In [60]:
# Listing 6-16: printing nearest neighbor text and urls
for i in range(len(neigh_df)):
    print(url_list[neigh_df.url_index.iloc[i]])
    print("*"*10)

http://archive.griffith.ox.ac.uk/index.php/informationobject/browse?view=card&languages=en&creators=393&mediatypes=136&sort=referenceCode&sf_culture=en&levels=223&topLod=0&limit=30&sortDir=asc
**********
https://atom.library.yorku.ca/index.php/informationobject/browse?places=556037&view=card&subjects=558646&sort=identifier&sf_culture=fi&%3Bview=card&%3Bsort=alphabetic&sortDir=asc
**********
http://archive.griffith.ox.ac.uk/index.php/informationobject/browse?sf_culture=cs&creators=9811&sortDir=desc&sort=lastUpdated&%3Bsort=lastUpdated&%3Bnames=21267&%3Blevels=223&%3BtopLod=0&%3Blimit=30
**********
https://archives.jewishmuseum.ca/informationobject/browse?sortDir=desc&creators=110543&levels=221&%3Bsubjects=400&%3Bamp%3Bsort=relevance&%3Bsort=alphabetic&sort=alphabetic
**********
http://rbscarchives.library.ubc.ca/index.php/informationobject/browse?sort=lastUpdated&places=555934%2C555933%2C555931&names=555848%2C555877%2C555869%2C555870&%3Bamp%3Bcollection=183468&%3Bamp%3Bview=card&%3Bamp%

In [33]:
# Listing 6-16: printing nearest neighbor text and urls (cont.)
html_content[4195]

'york university libraries clara thomas archives special collections ok this website uses cookies to enhance your ability to browse and load content log in have an account email password log in quick links quick links home about help privacy policy clipboard clipboard clear all selections go to clipboard load clipboard save clipboard browse browse archival descriptions people and organizations subjects places digital objects search search global search advanced search filters language unique records 1 results 1 englanti 1 results 1 place all toronto 1 results 1 yellowknife n w t 1 results 1 quebec 1 results 1 united states 1 results 1 canada 1 results 1 ontario 1 results 1 new york 1 results 1 new york city 1 results 1 subject all business and commerce 1 results 1 mining 1 results 1 gold 1 results 1 natural resources 1 results 1 showing 1 results archival description only top level descriptions new york city mining advanced search options find results with and or not in any field title

## Processing WAT file

In [None]:
# Listing 6-17: processing WAT files

# NOTE: we got this wat_path from going to https://commoncrawl.org/2020/04/march-april-2020-crawl-archive-now-available/

wat_path = 'crawl-data/CC-MAIN-2020-16/segments/1585370490497.6/wat/CC-MAIN-20200328074047-20200328104047-00000.warc.wat.gz'
file_name = ‘YOUR_LOCAL_FILEPATH.warc.gz’
import boto3

from botocore.handlers import disable_signing
resource = boto3.resource('s3')
resource.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)

bucket = resource.Bucket('commoncrawl')

resource.meta.client.download_file('commoncrawl', wat_path, file_name)


In [1]:
# Listing 6-17: processing WAT files (cont.)

from time import time
import warc

def process_wat(file_name, limit=10000):
    warc_file = warc.open(file_name, 'rb')
    t0 = time()
    n_documents = 0

    url_list = []
    header_list = []
    html_content = []
    
    for i, record in enumerate(warc_file):
        
        if n_documents >= limit:
        
            break
        
        url = record.url
        payload = record.payload.read()
        html_content.append(payload)
        url_list.append(url)
            
            
        n_documents += 1
        
    warc_file.close()
    print('Parsing took %s seconds and went through %s documents' % (time() - t0, n_documents))
    return html_content, url_list

In [2]:
file_name = ‘YOUR_LOCAL_FILEPATH.warc.gz’
html_content, url_list = process_wat(file_name, limit = 1000000)


Parsing took 17.824016332626343 seconds and went through 160415 documents


In [3]:
# Listing 6-18: Exploring WAT record
import json
sample_dict = json.loads(html_content[60000])
sample_dict

{'Container': {'Compressed': True,
  'Filename': 'CC-MAIN-20200328074047-20200328104047-00000.warc.gz',
  'Gzip-Metadata': {'Deflate-Length': '7053',
   'Footer-Length': '8',
   'Header-Length': '10',
   'Inflated-CRC': '489333750',
   'Inflated-Length': '28943'},
  'Offset': '365278980'},
 'Envelope': {'Format': 'WARC',
  'Payload-Metadata': {'Actual-Content-Length': '28338',
   'Actual-Content-Type': 'application/http; msgtype=response',
   'Block-Digest': 'sha1:XCUMWTUZQSM3TGFOTW3F7CJKXORQBVG7',
   'HTTP-Response-Metadata': {'Entity-Digest': 'sha1:2NBB2Q47ZGHHSNHIXQWMEHLTDNGVNTSD',
    'Entity-Length': '28109',
    'Entity-Trailing-Slop-Length': '0',
    'HTML-Metadata': {'Head': {'Link': [{'path': 'LINK@/href',
        'rel': 'icon',
        'url': '../site//img/favicon.png'},
       {'path': 'LINK@/href',
        'rel': 'canonical',
        'url': 'https://bestsports.com.br/bi/atlbihome.php?esp=42'},
       {'path': 'LINK@/href',
        'rel': 'alternate',
        'url': 'https:/

In [4]:
url_list[60000]

'https://bestsports.com.br/bi/atlbihome.php?esp=42'

In [9]:
sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata'].keys()

dict_keys(['Head', 'Links'])

In [13]:
sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']["Head"]["Metas"]

[{'content': 'width=device-width, initial-scale=1.0', 'name': 'viewport'},
 {'content': 'no-cache, no-store', 'http-equiv': 'Cache-Control'},
 {'content': 'no-cache, no-store', 'http-equiv': 'Pragma'},
 {'content': 'eb96de70-e940-11e9-b21b-d1121cb6cef8',
  'name': 'axl-verification'},
 {'content': 'pt-BR', 'http-equiv': 'Content-Language'},
 {'content': '0', 'http-equiv': 'Expires'}]

In [18]:
sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata'].keys()

dict_keys(['Head', 'Links'])

In [19]:
sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']["Head"].keys()

dict_keys(['Metas', 'Link', 'Title', 'Scripts'])

In [6]:
sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']["Head"]["Scripts"]

[{'path': 'SCRIPT@/src',
  'url': '//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js'},
 {'path': 'SCRIPT@/src', 'url': 'https://d3js.org/d3.v4.min.js'},
 {'path': 'SCRIPT@/src',
  'url': 'https://cdnjs.cloudflare.com/ajax/libs/d3-tip/0.7.1/d3-tip.min.js'},
 {'path': 'SCRIPT@/src', 'url': './tools/biTools2019.js'}]

In [14]:
sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']["Head"]["Title"]

'BEST sports Analytics\ufeff - Atletas - Tiro Esportivo'

In [22]:
len(sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']["Links"])

64

In [5]:
sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']["Headers"]

{'Connection': 'Keep-Alive',
 'Content-Length': '28109',
 'Content-Type': 'text/html; charset=UTF-8',
 'Date': 'Sat, 28 Mar 2020 08:49:45 GMT',
 'Keep-Alive': 'timeout=5, max=100',
 'Server': 'Apache',
 'X-Crawler-Transfer-Encoding': 'chunked'}

# technology Profiler

In [5]:
# Listing 6-19: regex search for Apache server
import json
import re
sample_dict = json.loads(html_content[60000])
sample_dict
import time
x = re.compile("(?:Apache(?:$|/([\\d.]+)|[^/-])|(?:^|\\b)HTTPD)").search(str(sample_dict))
if x:
    print("This website uses Apache server")
else:
    print("no match found")


This website uses Apache server


In [6]:
# Listing 6-20: comparing total times for 1000 iterations
import json
import re
sample_dict = json.loads(html_content[60000])
sample_dict
import time
start_time = time.time()

for i in range(1000):
    x = re.compile("(?:Apache(?:$|/([\\d.]+)|[^/-])|(?:^|\\b)HTTPD)").search(str(sample_dict))
end_time = time.time()
print("total time (for 1000 iterations) to check entire wat record: ", end_time-start_time)
start_time = time.time()
for i in range(1000):
    x = re.compile("(?:Apache(?:$|/([\\d.]+)|[^/-])|(?:^|\\b)HTTPD)").search(str(sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']["Headers"]))
end_time = time.time()
print("total time (for 1000 iterations) to check entire wat record header: ", end_time-start_time)


total time (for 1000 iterations) to check entire wat record:  0.18289828300476074
total time (for 1000 iterations) to check entire wat record header:  0.016824007034301758


In [8]:
# Listing 6-21: Checking for Google Adsense in wat records

x = re.compile("googlesyndication\\.com/").search(str(sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']["Head"]["Scripts"]
))
if x:
    print("This website uses Google Adsense")

else:
    print("no match found")


This website uses Google Adsense


In [23]:
# Listing 6-24: using builtwith library
import builtwith

import time
start_time = time.time()
print(builtwith.builtwith(url = 'none', html = html_content[60000], headers = sample_dict["Envelope"]['Payload-Metadata']['HTTP-Response-Metadata']["Headers"]))
end_time = time.time()
print(end_time-start_time)

No module named 'cffi_re2'
running slow regex
{'web-servers': ['Apache'], 'advertising-networks': ['Google AdSense']}
0.27150821685791016


# backlinks database

In [4]:
# Listing 6-26: processing wat to create dataframe for SQLite staging table
from time import time
import tld
import warc
import json

def process_wat_with_processing(file_name, limit=10000):
    warc_file = warc.open(file_name, 'rb')
    t0 = time()
    n_documents = 0

    url_list = []
    header_content_list = []
    html_content_list = []
    final_list = []
    for i, record in enumerate(warc_file):
        #print(i)
        if n_documents >= limit:
        
            break
        
        url = record.url
        payload = record.payload.read()
        temp_dict = {}
        try:
            temp_dict["url"] = url
            temp_dict["webpage_source"] = tld.get_fld(url)
            #temp_dict['url_anchor_source'] = 'none'
            sample_dict = json.loads(payload)
            doc_links = sample_dict['Envelope']['Payload-Metadata']['HTTP-Response-Metadata']['HTML-Metadata']["Links"]
            for doc in doc_links:
                if doc["path"] == 'A@/href' and 'http' in doc["url"]:
                    temp_dict["backlink_source"] = tld.get_fld(doc["url"])
                    temp_dict["backlink"] = doc["url"]

                    #temp_dict["anchor_text"] = doc.get('text', 'none')
                    final_list.append(temp_dict.copy())
            
        except Exception as E:
            #print(E)
            continue
            
        n_documents += 1
        
    warc_file.close()
    print('Parsing took %s seconds and went through %s documents' % (time() - t0, n_documents))
    return final_list

In [5]:
file_name = ‘YOUR_LOCAL_FILEPATH.warc.gz’
final_list = process_wat_with_processing(file_name, limit = 100000)

Parsing took 88.56796169281006 seconds and went through 46270 documents


In [6]:
import numpy as np
import pandas as pd

df = pd.DataFrame(final_list)
df.head()

Unnamed: 0,backlink,backlink_source,url,webpage_source
0,http://000ojfb.wcomhost.com/ushwa/,wcomhost.com,http://000ojfb.wcomhost.com/ushwa/2018-dan-pat...,wcomhost.com
1,https://www.facebook.com/USHarnessWriters,facebook.com,http://000ojfb.wcomhost.com/ushwa/2018-dan-pat...,wcomhost.com
2,https://aboutme.google.com/b/10733287508516707...,google.com,http://000ojfb.wcomhost.com/ushwa/2018-dan-pat...,wcomhost.com
3,https://twitter.com/USHWA_NATL,twitter.com,http://000ojfb.wcomhost.com/ushwa/2018-dan-pat...,wcomhost.com
4,https://www.linkedin.com/company-beta/25009386/,linkedin.com,http://000ojfb.wcomhost.com/ushwa/2018-dan-pat...,wcomhost.com


In [8]:
# Listing 6-27: creating and inserting data in SQLite backlinks database

import sqlalchemy
sqlalchemy.__version__  

from sqlalchemy import create_engine

engine = create_engine('sqlite:///sqlite_db_path.db', echo=True) # check chapter 5 for more details
conn = engine.connect()


2020-08-11 15:12:35,257 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-08-11 15:12:35,258 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:12:35,261 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-08-11 15:12:35,262 INFO sqlalchemy.engine.base.Engine ()


In [9]:
# Listing 6-27: creating and inserting data in SQLite backlinks database (cont.)


from sqlalchemy import Table, Column,UniqueConstraint, Integer, String, DateTime, MetaData, ForeignKey

metadata = MetaData()

sources = Table('sources', metadata,
    Column('source_id', Integer, primary_key=True),
    Column('source_name', String, unique=True),
    Column('source_url', String, unique=True),
    Column('source_description', String)
    )

# Ideally we should have a unique constraint in webpages table for crawl_id and webpage_url; meaning that for a given crawl, 
# we can only insert one unique value of url
# more info here https://overiq.com/sqlalchemy-101/defining-schema-in-sqlalchemy-core/

webpages = Table('webpages', metadata,
    Column('webpage_id', Integer, primary_key=True),
    Column('webpage_url', String, unique=True),
    Column('source_id', None, ForeignKey('sources.source_id')),
    )

backlinks = Table('backlinks', metadata,
    Column('backlink_id', Integer, primary_key=True),
    Column('link_id', None, ForeignKey('webpages.webpage_id')),
    Column('webpage_id', Integer),
    UniqueConstraint('webpage_id', 'link_id', name='unique_webpage_backlink')           
                 )
metadata.create_all(engine)

2020-08-11 15:12:39,391 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("sources")
2020-08-11 15:12:39,392 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:12:39,395 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("sources")
2020-08-11 15:12:39,396 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:12:39,397 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("webpages")
2020-08-11 15:12:39,398 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:12:39,399 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("webpages")
2020-08-11 15:12:39,400 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:12:39,401 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("backlinks")
2020-08-11 15:12:39,402 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:12:39,403 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("backlinks")
2020-08-11 15:12:39,403 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:12:39,406 INFO sqlalchemy.engine.base.Engine 
CREA

In [10]:
# Listing 6-27: creating and inserting data in SQLite backlinks database (cont.)


import tld
import numpy as np
import pandas as pd
#df = pd.read_csv("backlinks_sample.csv")
#df = pd.read_csv(r"C:\Users\Jay M. Patel\Documents\jupyter_notebook_workspace\getting_strucured-data-large\chapter-6\backlinks_sample2.csv")
#df = pd.read_csv(r"C:\Users\Jay M. Patel\Documents\jupyter_notebook_workspace\getting_strucured-data-large\chapter-6\backlinks_sample3.csv")
#df["anchor_text"] = df["anchor_text"].fillna('none')
df.head()
df.to_sql('staging',con = engine)

2020-08-11 15:13:19,193 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("staging")
2020-08-11 15:13:19,195 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:13:19,196 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("staging")
2020-08-11 15:13:19,196 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:13:19,199 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE staging (
	"index" BIGINT, 
	backlink TEXT, 
	backlink_source TEXT, 
	url TEXT, 
	webpage_source TEXT
)


2020-08-11 15:13:19,200 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:13:19,225 INFO sqlalchemy.engine.base.Engine COMMIT
2020-08-11 15:13:19,226 INFO sqlalchemy.engine.base.Engine CREATE INDEX ix_staging_index ON staging ("index")
2020-08-11 15:13:19,228 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:13:19,237 INFO sqlalchemy.engine.base.Engine COMMIT
2020-08-11 15:13:20,620 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-08-11 15:13:47,564 INFO sqlalchemy.engine.base.Engine INSERT INT

In [11]:
# Listing 6-27: creating and inserting data in SQLite backlinks database (cont.)


import sqlalchemy
from sqlalchemy import Table, Column,UniqueConstraint, Integer, String, DateTime, MetaData, ForeignKey
from sqlalchemy import create_engine
from sqlalchemy.engine import reflection
from sqlalchemy.sql import text

import numpy as np
import pandas as pd

In [12]:
# Listing 6-27: creating and inserting data in SQLite backlinks database (cont.)


insert_into_sources_table = text(
"INSERT OR IGNORE INTO sources (source_url) "
"SELECT webpage_source FROM staging UNION SELECT backlink_source FROM staging;"

)
conn.execute(insert_into_sources_table)

insert_into_webpages_table = text(
"INSERT OR IGNORE INTO webpages (webpage_url, source_id) "
"SELECT staging.url, sources.source_id "
"FROM staging, sources "
"WHERE staging.webpage_source = sources.source_url;"
)

conn.execute(insert_into_webpages_table)

insert_into_webpages_table2 = text(
"INSERT OR IGNORE INTO webpages (webpage_url, source_id) "
"SELECT staging.backlink, sources.source_id "
"FROM staging, sources "
"WHERE staging.backlink_source = sources.source_url;"
)

conn.execute(insert_into_webpages_table2)

insert_into_backlinks_table = text(
"INSERT  OR IGNORE INTO backlinks (link_id, webpage_id) "
"SELECT E.webpage_id, F.webpage_id "
"FROM webpages AS E,webpages AS F, staging "
"WHERE E.webpage_url = staging.backlink "
"AND F.webpage_url = staging.url;"
)

conn.execute(insert_into_backlinks_table)

conn.execute(text("DROP TABLE staging;"))

2020-08-11 15:17:33,709 INFO sqlalchemy.engine.base.Engine INSERT OR IGNORE INTO sources (source_url) SELECT webpage_source FROM staging UNION SELECT backlink_source FROM staging;
2020-08-11 15:17:33,710 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:17:40,320 INFO sqlalchemy.engine.base.Engine COMMIT
2020-08-11 15:17:40,378 INFO sqlalchemy.engine.base.Engine INSERT OR IGNORE INTO webpages (webpage_url, source_id) SELECT staging.url, sources.source_id FROM staging, sources WHERE staging.webpage_source = sources.source_url;
2020-08-11 15:17:40,380 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:17:45,538 INFO sqlalchemy.engine.base.Engine COMMIT
2020-08-11 15:17:45,591 INFO sqlalchemy.engine.base.Engine INSERT OR IGNORE INTO webpages (webpage_url, source_id) SELECT staging.backlink, sources.source_id FROM staging, sources WHERE staging.backlink_source = sources.source_url;
2020-08-11 15:17:45,591 INFO sqlalchemy.engine.base.Engine ()
2020-08-11 15:18:05,108 INFO sqlalchemy.engi

<sqlalchemy.engine.result.ResultProxy at 0x1d38f681e80>

In [18]:
#Listing 6-28: querying for the most popular backlinks in our databasesample_query = '''SELECT
    link_id,webpages.webpage_url as backlink_url, COUNT(link_id) AS Total_Count
FROM
    backlinks, webpages
    
    where 
    	backlinks.link_id = webpages.webpage_id

GROUP BY
    link_id
HAVING 
    Total_Count > 2
ORDER BY Total_Count DESC
LIMIT 1000;'''

query = conn.execute(sample_query)
#results_list = conn.execute(text(sample_query)).fetchall()

result_list = query.fetchall()
result_list_keys = query.keys()

df = pd.DataFrame(result_list, columns = result_list_keys)
df.head(10)

2020-08-11 15:47:03,745 INFO sqlalchemy.engine.base.Engine SELECT
    link_id,webpages.webpage_url as backlink_url, COUNT(link_id) AS Total_Count
FROM
    backlinks, webpages
    
    where 
    	backlinks.link_id = webpages.webpage_id

GROUP BY
    link_id
HAVING 
    Total_Count > 2
ORDER BY Total_Count DESC
LIMIT 1000;
2020-08-11 15:47:03,747 INFO sqlalchemy.engine.base.Engine ()


Unnamed: 0,link_id,backlink_url,Total_Count
0,53825,https://twitter.com/share,1164
1,53454,https://wordpress.org/,793
2,58166,https://www.blogger.com,664
3,148565,https://automattic.com/cookies,658
4,645301,https://wordpress.com/?ref=footer_blog,612
5,148563,https://gravatar.com/site/signup/,597
6,94565,http://wordpress.org/,421
7,51538,https://akismet.com/privacy/,406
8,86804,http://twitter.com/share,320
9,1121506,https://www.shopify.com?utm_campaign=poweredby...,289


In [19]:
#Listing 6-29: querying for the most popular domains in our database

sample_query = '''SELECT
    sources.source_id, sources.source_url AS Source_Url, COUNT(sources.source_url) AS Total_Count
FROM
    backlinks, webpages, sources
    
    where 
    	backlinks.link_id = webpages.webpage_id
    AND
        webpages.source_id = sources.source_id

GROUP BY
    Source_Url
HAVING 
    Total_Count > 2
ORDER BY Total_Count DESC
LIMIT 1000;'''



query = conn.execute(sample_query)

result_list = query.fetchall()
result_list_keys = query.keys()

df = pd.DataFrame(result_list, columns = result_list_keys)
df.head(10)

2020-08-11 15:48:33,009 INFO sqlalchemy.engine.base.Engine SELECT
    sources.source_id, sources.source_url AS Source_Url, COUNT(sources.source_url) AS Total_Count
FROM
    backlinks, webpages, sources
    
    where 
    	backlinks.link_id = webpages.webpage_id
    AND
        webpages.source_id = sources.source_id

GROUP BY
    Source_Url
HAVING 
    Total_Count > 2
ORDER BY Total_Count DESC
LIMIT 1000;
2020-08-11 15:48:33,010 INFO sqlalchemy.engine.base.Engine ()


Unnamed: 0,source_id,Source_Url,Total_Count
0,152418,wordpress.com,122145
1,47205,facebook.com,32291
2,19530,blogger.com,31007
3,142405,twitter.com,29723
4,48099,fc2.com,23276
5,128527,spartantown.net,14649
6,67805,instagram.com,14507
7,56792,google.com,13593
8,155985,youtube.com,11498
9,11662,arpati.blogspot.com,10039
