## NOTE: 
Listings 8-1 to 8-8 should be run inside the Scrapy project directory itself; they are reproduced here only so that you can better follow how the code progresses.

In [None]:
# Listing 8-1: default settings.py file contents
# Name of the Scrapy project/bot.
BOT_NAME = 'chapter_8'
# Package that receives spiders created with `scrapy genspider`.
NEWSPIDER_MODULE = 'chapter_8.spiders'
# Packages searched for spider classes; here it is that same single package.
SPIDER_MODULES = [NEWSPIDER_MODULE]
# Honour each site's robots.txt rules while crawling.
ROBOTSTXT_OBEY = True

In [None]:
#Listing 8-2: additional settings.py contents
# Present the crawler as desktop Chrome. The product tokens are separated by
# single spaces, including the one between the Chrome version and "Safari/...".
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
)

# Crawl gently: one request in flight at a time ...
CONCURRENT_REQUESTS = 1

# ... with a short pause (in seconds) between consecutive requests.
DOWNLOAD_DELAY = 0.05

# Give up on a download after this many seconds.
DOWNLOAD_TIMEOUT = 15

# Follow HTTP redirects.
REDIRECT_ENABLED = True

# Do not follow links more than three hops away from the start URLs.
DEPTH_LIMIT = 3
# A positive depth priority combined with FIFO queues makes the crawl
# breadth-first (see the Scrapy FAQ) instead of the default depth-first order.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

# Only log messages at INFO level and above.
LOG_LEVEL = 'INFO'

In [None]:
#Listing 8-3: items.py default contents
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Chapter8Item(scrapy.Item):
    """Empty item template; no fields have been declared yet."""
    # Fields are declared as class attributes, for example:
    # name = scrapy.Field()

In [None]:
#Listing 8-4: items.py fields
import scrapy

class Chapter8Item(scrapy.Item):
    """Scraped-page record with three declared fields: url, title and depth."""
    url = scrapy.Field()
    title = scrapy.Field()
    depth = scrapy.Field()

In [None]:
#Listing 8-5: linkscraper_basic.py default contents
# -*- coding: utf-8 -*-
import scrapy

class LinkscraperBasicSpider(scrapy.Spider):
    """Spider skeleton for jaympatel.com; the parse callback is still a stub."""

    name = 'linkscraper-basic'
    start_urls = ['http://jaympatel.com/']
    allowed_domains = ['jaympatel.com']

    def parse(self, response):
        """Callback stub: ignores the response and produces nothing."""
        return None


In [None]:
#Listing 8-6: complete linkscraper_basic.py function.
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from chapter_8.items import Chapter8Item

class LinkscraperBasicSpider(scrapy.Spider):
    """Crawl jaympatel.com and record the URL, title and depth of each page."""

    name = 'linkscraper-basic'
    allowed_domains = ['jaympatel.com']
    start_urls = ['http://jaympatel.com/']

    def parse(self, response):
        """Yield one Chapter8Item per <title> tag, then follow every link.

        HTML responses are parsed with BeautifulSoup: each <title> produces an
        item and every <a href=...> is scheduled with this same callback.
        Any other response yields a single item whose title explains that
        nothing was extracted, and no links are followed.
        """
        # A missing Content-Type header is treated as "not HTML" instead of
        # raising KeyError.
        content_type = response.headers.get('Content-Type') or b''
        # Compare only the media type so that any charset parameter or letter
        # case (e.g. b'text/html; charset=UTF-8') is still recognised as HTML.
        media_type = content_type.split(b';', 1)[0].strip().lower()
        depth = str(response.meta.get('depth', 0))

        if media_type != b'text/html':
            item = Chapter8Item()
            item["title"] = 'title not extracted since content-type is ' + str(content_type)
            item["url"] = response.url
            item["depth"] = depth
            yield item
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        for title_tag in soup.find_all('title'):
            # Build a fresh item each time: re-yielding one shared instance
            # would let a later title overwrite an item that was already
            # handed to the pipeline.
            item = Chapter8Item()
            item["url"] = response.url
            item["title"] = title_tag.get_text()
            item["depth"] = depth
            yield item

        for anchor in soup.find_all('a', href=True):
            yield response.follow(anchor['href'], callback=self.parse)

In [3]:
#Listing 8-7: Exploring pages.jl file
import json
file_path = 'pages.jl'

# pages.jl holds one JSON document per line. The context manager guarantees
# the file is closed, and blank lines are skipped so that an empty file or a
# trailing newline does not make json.loads fail.
with open(file_path, "r", encoding="utf-8") as fh:
    data = [json.loads(line) for line in fh if line.strip()]
for dd in data:
    print(dd)
    print("*"*10)

{'url': 'http://jaympatel.com/', 'title': 'Jay M. Patel', 'depth': '0'}
**********
{'url': 'http://jaympatel.com/tags/', 'title': 'Jay M. Patel', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/2019/02/using-twitter-rest-apis-in-python-to-search-and-download-tweets-in-bulk/', 'title': '\n    Using Twitter rest APIs in Python to search and download tweets in bulk – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/2019/02/top-data-science-interview-questions-and-answers/', 'title': '\n    Top data science interview questions and answers – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/2019/02/natural-language-processing-nlp-text-vectorization-and-bag-of-words-approach/', 'title': '\n    Natural language processing (NLP): text vectorization and bag of words approach – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/categories/', 'title': 'Jay M. Patel', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/201

In [None]:
#Listing 8-8: additional settings.py parameters for using s3pipeline
# Route every scraped item through the S3 pipeline.
ITEM_PIPELINES = {'s3pipeline.S3Pipeline': 100}

# Destination object key template; the {time} and {chunk} placeholders are
# presumably filled in by the pipeline for each uploaded chunk - confirm
# against the s3pipeline documentation.
S3PIPELINE_URL = 's3://athena-us-east-1-testing/chapter-8/{time}.{chunk:07d}.jl.gz'

S3PIPELINE_MAX_CHUNK_SIZE = 10000

# Upload gzip-compressed chunks (matches the .jl.gz suffix above).
S3PIPELINE_GZIP = True

# If different than AWS CLI configure values

AWS_REGION_NAME = 'us-east-1'

# Plain ASCII quotes are required here: typographic quotes are a syntax error.
AWS_ACCESS_KEY_ID = 'YOUR_VALUE'
AWS_SECRET_ACCESS_KEY = 'YOUR_VALUE'
               

In [2]:
#Listing 8-9: jl.gz output from S3 folder
import gzip
import json
file_path_gzip = 'FILENAME_ON_S3.jl.gz'
data = []
# Each line of the gzip-compressed file is one JSON document.
with gzip.open(file_path_gzip, 'r') as fin:
    data.extend(json.loads(raw_line) for raw_line in fin)
for record in data:
    print(record)
    print("*" * 10)

{'url': 'http://jaympatel.com/', 'title': 'Jay M. Patel', 'depth': '0'}
**********
{'url': 'http://jaympatel.com/', 'title': 'Jay M. Patel', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/about/', 'title': '\n    About – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/consulting-services/', 'title': '\n    Consulting – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/books/', 'title': '\n    Books – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/2020/01/introduction-to-machine-learning-metrics/', 'title': '\n    Introduction to machine learning metrics – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/2019/02/introduction-to-web-scraping-in-python-using-beautiful-soup/', 'title': '\n    Introduction to web scraping in python using Beautiful Soup – Jay M. Patel\n', 'depth': '1'}
**********
{'url': 'http://jaympatel.com/2019/02/why-is-web-scraping-essential-and-who-uses-web-scraping/',

In [None]:
#Listing 8-10: Modifying items.py file to capture raw web crawl data
class Chapter8ItemRaw(scrapy.Item):
    """Raw crawl record with headers, url, response and crawl_date fields."""
    headers = scrapy.Field()
    url = scrapy.Field()
    response = scrapy.Field()
    crawl_date = scrapy.Field()

In [None]:
#Listing 8-11: second-scraper.py
# -*- coding: utf-8 -*-
import scrapy
from datetime import datetime, timezone
from scrapy.linkextractors import LinkExtractor
from chapter_8.items import Chapter8ItemRaw


class SecondScraperSpider(scrapy.Spider):
    """Crawl jaympatel.com and store every raw response for later parsing."""

    name = 'second-scraper'
    allowed_domains = ['jaympatel.com']
    start_urls = ['http://jaympatel.com/']

    def parse(self, response):
        """Yield one Chapter8ItemRaw for the response, then follow its links.

        The item carries the stringified response headers, the URL, the
        response text and a UTC crawl timestamp truncated to whole seconds.
        """
        item = Chapter8ItemRaw()
        item['headers'] = str(response.headers)
        item['url'] = response.url
        # Chapter8ItemRaw declares this field as `response` (there is no
        # `body` field), and the downstream parser reads the "response" key.
        item['response'] = response.text
        item['crawl_date'] = datetime.now(timezone.utc).replace(microsecond=0).isoformat()
        yield item

        for link in LinkExtractor().extract_links(response):
            yield response.follow(link, callback=self.parse)

In [7]:
#Listing 8-12: Parsing jl.gz containing raw web crawls

import gzip
import json
from bs4 import BeautifulSoup

file_path_gzip = 'FILENAME_ON_S3.jl.gz'

data = []
# Each line of the gzip-compressed file is one JSON-encoded raw crawl record.
with gzip.open(file_path_gzip, 'r') as fin:
    for item in fin:
        data.append(json.loads(item))
for dd in data:
    print(dd["url"])
    soup = BeautifulSoup(dd["response"], 'html.parser')
    title_tag = soup.find('title')
    # find() returns None when a stored response has no <title>; print an
    # empty line for such records instead of raising AttributeError.
    print(title_tag.get_text() if title_tag is not None else '')
    print("*"*10)

http://jaympatel.com/
Jay M. Patel
**********
http://jaympatel.com/
Jay M. Patel
**********
http://jaympatel.com/about/

    About – Jay M. Patel

**********
http://jaympatel.com/consulting-services/

    Consulting – Jay M. Patel

**********
http://jaympatel.com/books/

    Books – Jay M. Patel

**********
http://jaympatel.com/2020/01/introduction-to-machine-learning-metrics/

    Introduction to machine learning metrics – Jay M. Patel

**********
http://jaympatel.com/2019/02/introduction-to-web-scraping-in-python-using-beautiful-soup/

    Introduction to web scraping in python using Beautiful Soup – Jay M. Patel

**********
http://jaympatel.com/2019/02/why-is-web-scraping-essential-and-who-uses-web-scraping/

    Why is web scraping essential and who uses web scraping? – Jay M. Patel

**********
http://jaympatel.com/2020/08/how-to-create-pdf-documents-in-python-using-fpdf-library/

    How to create pdf documents in python using FPDF library – Jay M. Patel

**********
http://jaympat

In [1]:
# Listing 8-13: fetching amazon.com captures through cc-index api
import urllib

def get_index_url(query_url, index_name='CC-MAIN-2020-16'):
    """Build the Common Crawl index API URL for a URL pattern.

    Args:
        query_url: URL or wildcard pattern to look up, e.g. 'amazon.com/*'.
        index_name: Identifier of the crawl whose index is queried.

    Returns:
        The index query URL requesting JSON output, with ``query_url``
        percent-encoded.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote_plus

    query = quote_plus(query_url)
    base_url = 'https://index.commoncrawl.org/{}-index?url='.format(index_name)
    return base_url + query + '&output=json'
# Wildcard pattern for the amazon.com captures this listing fetches
query_url = 'amazon.com/*'
# Index API URL for that pattern
index_url = get_index_url(query_url)

import re
import time
import gzip
import json
import requests
try:
    from io import BytesIO
except:
    from StringIO import StringIO
def get_index_json(index_url, max_attempts=4, retry_delay=0.2):
    """Fetch an index query and parse its line-delimited JSON records.

    The request is repeated until it returns HTTP 200 or ``max_attempts``
    requests have been made.

    Args:
        index_url: Index API URL whose response has one JSON document per line.
        max_attempts: Maximum number of requests to make.
        retry_delay: Seconds to wait before each retry.

    Returns:
        A list with one parsed record per valid line; empty if no attempt
        returned HTTP 200.
    """
    pages_list = []

    for attempt in range(max_attempts):
        if attempt:
            # Pause only before a retry, never after a successful request.
            time.sleep(retry_delay)

        resp = requests.get(index_url)
        if resp.status_code != 200:
            continue

        for line in resp.content.decode().split('\n'):
            line = line.strip()
            if not line:
                continue
            try:
                pages_list.append(json.loads(line))
            except ValueError:
                # Skip a malformed line rather than losing the whole batch
                # (json.JSONDecodeError is a ValueError subclass).
                continue
        break

    return pages_list


# Download the index records for the query and report how many were returned
index_json = get_index_json(index_url)
print(len(index_json))

13622


In [2]:
# Listing 8-14: exploring status codes for amazon.com page captures
import numpy as np
import pandas as pd

# Load the list of index records into a DataFrame for exploration
df = pd.DataFrame(index_json)

In [3]:
# Preview the first rows of the index records
df.head()

Unnamed: 0,charset,digest,filename,languages,length,mime,mime-detected,offset,redirect,status,timestamp,truncated,url,urlkey
0,UTF-8,LVIFLF2KAXZEAFMDVTYY776KYNJROEPG,crawl-data/CC-MAIN-2020-16/segments/1585370490...,eng,66670,text/html,text/html,828564605,,200,20200328075237,,https://www.amazon.com/,"com,amazon)/"
1,,Z6IJ46JXZU7TCLCDINT3OMVFHV5GZPYU,crawl-data/CC-MAIN-2020-16/segments/1585370490...,,711,text/html,text/html,4866000,https://www.amazon.com/,301,20200328081133,,http://www.amazon.com/,"com,amazon)/"
2,,Z6IJ46JXZU7TCLCDINT3OMVFHV5GZPYU,crawl-data/CC-MAIN-2020-16/segments/1585370491...,,708,text/html,text/html,1180759,https://www.amazon.com/,301,20200328105528,,http://www.amazon.com/,"com,amazon)/"
3,,QOQN4QA34672OVBRYEUSI3RWPYEEYZWD,crawl-data/CC-MAIN-2020-16/segments/1585370491...,,2778,text/html,text/html,18933473,,503,20200328124702,,https://www.amazon.com/,"com,amazon)/"
4,,3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ,crawl-data/CC-MAIN-2020-16/segments/1585370492...,,498,unk,application/octet-stream,1226739,https://www.amazon.com/,301,20200328170442,,http://www.amazon.com,"com,amazon)/"


In [4]:
# Listing 8-14: exploring status codes for amazon.com page captures (cont.)

# Count how many captures were recorded for each HTTP status value
df.status.value_counts()

503    6753
301    5274
200     897
302     635
404      58
400       5
Name: status, dtype: int64

In [29]:
# Listing 8-15: page with 503 status code
# Status values are strings; take the second 503 capture (position 1) as a dict
page = df[df.status == '503'].iloc[1].to_dict()

In [30]:
# Show the index record of the selected capture
print(page)

{'charset': nan, 'digest': 'YK3ZEOTAVRL7MSJFY3IQHPCJOLVI4Q6Y', 'filename': 'crawl-data/CC-MAIN-2020-16/segments/1585370492125.18/crawldiagnostics/CC-MAIN-20200328164156-20200328194156-00241.warc.gz', 'languages': nan, 'length': '2592', 'mime': 'text/html', 'mime-detected': 'text/html', 'offset': '17355310', 'redirect': nan, 'status': '503', 'timestamp': '20200328181710', 'truncated': nan, 'url': 'https://www.amazon.com/', 'urlkey': 'com,amazon)/'}


In [6]:
# Listing 8-15: page with 503 status code

import re
import time
import gzip
import json
import requests
try:
    from io import BytesIO
except:
    from StringIO import StringIO

def get_from_index(page, prefix='https://commoncrawl.s3.amazonaws.com/'):
    """Download one capture from its WARC file and split it into sections.

    Args:
        page: Index record providing 'filename', 'offset' and 'length'.
        prefix: Base URL prepended to ``page['filename']``.
            NOTE(review): Common Crawl may now serve these files from a
            different host - confirm the current endpoint before relying on
            the default.

    Returns:
        A ``(crawl_metadata, header, response)`` tuple of strings: the three
        sections of the record, which are separated by blank CRLF lines.

    Raises:
        requests.RequestException: If the download fails or returns an HTTP
            error status.
        ValueError: If the record does not contain all three sections (this
            also covers payloads that are not valid UTF-8).
    """
    offset, length = int(page['offset']), int(page['length'])
    # HTTP byte ranges are inclusive at both ends.
    offset_end = offset + length - 1

    r = requests.get(prefix + page['filename'],
                     headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
    # Fail here with a clear HTTP error instead of trying to gunzip an error
    # page further down.
    r.raise_for_status()

    # The requested byte range is a gzip stream holding this one record.
    data = gzip.GzipFile(fileobj=BytesIO(r.content)).read()

    parts = data.strip().decode('utf-8').split('\r\n\r\n', 2)
    if len(parts) != 3:
        raise ValueError(
            'expected crawl metadata, header and response sections, got {} section(s)'.format(len(parts)))

    crawl_metadata, header, response = parts
    return crawl_metadata, header, response

In [33]:
# Listing 8-15: page with 503 status code (Cont.)

crawl_metadata, header, response = get_from_index(page)
soup = BeautifulSoup(response, 'html.parser')
# Remove <script> and <style> elements so only the remaining page text is printed.
for tag in soup.find_all(["script", "style"]):
    tag.extract()
print(soup.get_text())










Robot Check













Enter the characters you see below
Sorry, we just need to make sure you're not a robot. For best results, please make sure your browser is accepting cookies.










Type the characters you see in this image:








Try different image











Continue shopping











Conditions of Use




Privacy Policy


          © 1996-2014, Amazon.com, Inc. or its affiliates
          









In [None]:
# Listing 8-16: using proxy IP addresses with requests
##NOTE: these proxy_ip addresses are just placeholders for explaining how the code works
import requests

# Route both plain-HTTP and HTTPS traffic through the same (placeholder) proxy.
proxy_ip = {
    'http': 'http://11.11.11.11:8010',
    'https': 'http://11.11.11.11:8010',
}

# Browser-like User-Agent; the product tokens are separated by single spaces.
my_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    ),
}

# Placeholder target so the listing is self-contained; replace it with the
# page you actually want to fetch.
url = 'http://example.com/'

r = requests.get(url, proxies=proxy_ip, headers=my_headers)

In [22]:
# Listing 8-17: Randomly generated user agent

from fake_useragent import UserAgent
# Create the user-agent source, then print one randomly chosen browser string.
ua = UserAgent()
random_user_agent = ua.random
print(random_user_agent)

Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36


In [None]:
# Listing 8-18: pseudocode for a captcha solving service
from selenium import webdriver
# Instantiate the driver: without the call parentheses `browser` would be the
# Chrome class itself, with no live browser session to query.
browser = webdriver.Chrome()

# NOTE(review): the find_element_by_* helpers used below were removed in newer
# Selenium 4 releases - confirm the installed version supports them.
captcha_site_key = browser.find_element_by_class_name('g-recaptcha').get_attribute('data-sitekey')
# ... (call the captcha solving service API with the site key and the page URL);
# it returns g_response_code
js_code = 'document.getElementById("g-recaptcha-response").innerHTML = "{}";'.format(g_response_code)

browser.execute_script(js_code)
# Now perform whatever action you need to do on the page like hitting a submit button
browser.find_element_by_tag_name('form').submit()