In [11]:
from scrapling import Fetcher

# Fetcher

Perform an HTTP GET request to a web page and create an Adaptor instance from the response

In [66]:
# Build a Fetcher with the auto_match option switched off.
fetcher = Fetcher(auto_match=False)

# GET the demo site. With stealthy_headers=True the log line below shows the
# request carrying a Google-search referer.
page = fetcher.get("https://quotes.toscrape.com/", stealthy_headers=True)

[2025-03-06 17:25:28] INFO: Fetched (200) <GET https://quotes.toscrape.com/> (referer: https://www.google.com/search?q=toscrape)


Get all text content from all HTML tags in the page except 'script' and 'style' tags

In [15]:
# Collect the page's text, skipping the tags listed in ignore_tags.
text = page.get_all_text(ignore_tags=("script", "style"))
print(text)

Quotes to Scrape

    

        

            

                

                    
Quotes to Scrape

                

                
                    
Login

    


    

        
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
by 
Albert Einstein
(about)

            Tags:
            
change
deep-thoughts
thinking
world

        
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
by 
J.K. Rowling
(about)

            Tags:
            
abilities
choices

        
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
by 
Albert Einstein
(about)

            Tags:
            
inspirational
life
live
miracle
miracles

        
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
by 
Jane Austen
(about)

            Tags:
            
aliteracy
boo

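Get the quote texts by CSS selector
- ".quote .text::text" -> get the text content of elements with class 'text' that are nested inside an element with class 'quote' (the space is a descendant combinator; it does not mean "has both classes")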

In [16]:
# '::text' makes the query return the matched elements' text (strings) rather
# than the elements themselves, as the output below shows.
quotes = page.css(".quote .text::text") # CSS selector
quotes

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

"//span[@class='text']/text()": This is the XPath expression that specifies what to select.

- // -> This means "select nodes in the document from the current node that match the selection, regardless of their location." In simpler terms, it searches the entire document.
- span -> This specifies that you are looking for <span> elements.
- [@class='text'] -> This part filters the <span> elements to only those that have a class attribute equal to "text".
- /text() -> This indicates that you want to extract the text content of the selected <span> elements, not the elements themselves.

In [17]:
# XPath counterpart of the CSS query above; the output is the same list of strings.
quotes = page.xpath("//span[@class='text']/text()")
quotes

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

Chain selectors

In [18]:
# Same query in two steps: select the .quote elements first, then query the
# .text text inside that result.
quotes = page.css('.quote').css('.text::text')
quotes

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

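Slower than the bulk queries above: this selects the elements first and then reads `.text` from each one in a Python loop, instead of extracting the text in the selector itself with `::text`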

In [19]:
# Select the elements, then read the text off each one on the Python side.
quotes = [node.text for node in page.css('.quote .text')]
quotes

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

Get the text of the first quote
- Same as page.css(".quote .text").first.text
- Same as page.css(".quote .text::text")[0]

In [22]:
# css_first gives back the first match directly (a single string here), not a list.
quote = page.css_first(".quote .text::text")
quote

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [30]:
# Select the elements, take the first one with .first, then read its text.
quote = page.css(".quote .text").first.text
quote

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [27]:
# Plain indexing into the result list works as well.
# NOTE: after this cell `quote` is a text string, not an element.
quote = page.css(".quote .text::text")[0]
quote

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

Get all 'div' elements that have 'quote' as one of their 'class' values. The following calls are equivalent:

- quotes = page.find_all("div", {"class": "quote"})
- quotes = page.find_all('div', class_='quote')
- quotes = page.find_all(['div'], class_='quote')
- quotes = page.find_all(class_='quote')

In [31]:
# Filter by tag name plus a dict of attribute values.
quotes = page.find_all("div", {"class": "quote"})
quotes

[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" i

In [35]:
# Same filter with the class passed as a keyword argument; it is spelled
# class_ because `class` is a reserved word in Python.
quotes = page.find_all('div', class_='quote')
quotes

[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 <data='<div class="quote" i

Get the HTML of this element (as the output shows, it includes the element's own opening and closing tags, not only its children)

In [69]:
# html_content is the HTML source of the first quote element, as a string.
print(quotes[0].html_content)

<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
        <span>by <small class="author" itemprop="author">Albert Einstein</small>
        <a href="/author/Albert-Einstein">(about)</a>
        </span>
        <div class="tags">
            Tags:
            <meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world"><a class="tag" href="/tag/change/page/1/">change</a>
            
            <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
            
            <a class="tag" href="/tag/thinking/page/1/">thinking</a>
            
            <a class="tag" href="/tag/world/page/1/">world</a>
            
        </div>
    </div>


Prettified version of the element's HTML shown above

In [43]:
# Use the element from `quotes` directly. `quote` was last bound to a text
# string (the css(...)[0] cell above), not to the quote <div> whose HTML is
# shown in the output, so the original only worked with leftover kernel state.
print(quotes[0].prettify())

<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
        <span>by <small class="author" itemprop="author">Albert Einstein</small>
        <a href="/author/Albert-Einstein">(about)</a>
        </span>
        <div class="tags">
            Tags:
            <meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world">
<a class="tag" href="/tag/change/page/1/">change</a>
            
            <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
            
            <a class="tag" href="/tag/thinking/page/1/">thinking</a>
            
            <a class="tag" href="/tag/world/page/1/">world</a>
            
        </div>
    </div>



Get element's attributes

In [45]:
# Attributes of the first quote <div>. Read it from `quotes`, because `quote`
# was last bound to a text string and only pointed at the element through
# leftover kernel state.
quotes[0].attrib

AttributesHandler({'class': 'quote', 'itemscope': '', 'itemtype': 'http://schema.org/CreativeWork'})

DOM path to element (list of all its ancestors, ordered from the direct parent up to the <html> tag; the element itself is not included)

In [46]:
# Ancestors of the first quote <div>. Read it from `quotes`, because `quote`
# was last bound to a text string and only pointed at the element through
# leftover kernel state.
quotes[0].path

[<data='<div class="col-md-8"> <div class="quote...' parent='<div class="row"> <div class="col-md-8">...'>,
 <data='<div class="row"> <div class="col-md-8">...' parent='<div class="container"> <div class="row...'>,
 <data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
 <data='<body> <div class="container"> <div clas...' parent='<html lang="en"><head><meta charset="UTF...'>,
 <data='<html lang="en"><head><meta charset="UTF...'>]

In [54]:
# NOTE: this rebinds `page`. From here on `page` is the httpbin response, not
# the quotes page fetched earlier (`quotes` still holds the earlier elements).
page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
page

[2025-03-06 17:09:55] INFO: Fetched (200) <GET https://httpbin.org/get> (referer: https://www.google.com/search?q=httpbin)


<data='<html><body><p>{ "args": {}, "headers":...'>

In [56]:
# Print the page text: the JSON httpbin echoes back, which lists the request
# headers that were actually sent.
print(page.get_all_text())

{
  "args": {}, 
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 
    "Accept-Encoding": "gzip, deflate, br, zstd", 
    "Accept-Language": "en-US;q=1.0", 
    "Host": "httpbin.org", 
    "Referer": "https://www.google.com/search?q=httpbin", 
    "Sec-Ch-Ua": "\"Microsoft Edge\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"", 
    "Sec-Ch-Ua-Mobile": "?0", 
    "Sec-Ch-Ua-Platform": "\"Windows\"", 
    "Sec-Fetch-Dest": "navigate", 
    "Sec-Fetch-Mode": "same-site", 
    "Sec-Fetch-Site": "?1", 
    "Sec-Fetch-User": "document", 
    "Upgrade-Insecure-Requests": "1", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", 
    "X-Amzn-Trace-Id": "Root=1-67c96663-3e8406e76ecceb586c5fe7b4"
  }, 
  "origin": "129.126.150.210", 
  "url": "https://httpbin.org/get"
}


In [70]:
# Tag name of the first quote element. `quotes` still refers to the elements
# found on the quotes page, even though `page` has been rebound since.
quotes[0].tag

'div'

# AsyncFetcher

In [57]:
from scrapling import AsyncFetcher
# Same request as the Fetcher example, awaited instead of blocking.
# Top-level `await` is accepted in a notebook cell.
page = await AsyncFetcher().get("https://httpbin.org/get", stealthy_headers=True, follow_redirects=True)
page.get_all_text()

[2025-03-06 17:12:52] INFO: Fetched (200) <GET https://httpbin.org/get> (referer: https://www.google.com/search?q=httpbin)


'{\n  "args": {}, \n  "headers": {\n    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", \n    "Accept-Encoding": "gzip, deflate, br, zstd", \n    "Accept-Language": "en-US;q=1.0", \n    "Host": "httpbin.org", \n    "Referer": "https://www.google.com/search?q=httpbin", \n    "Sec-Ch-Ua": "\\"Google Chrome\\";v=\\"131\\", \\"Chromium\\";v=\\"131\\", \\"Not_A Brand\\";v=\\"24\\"", \n    "Sec-Ch-Ua-Mobile": "?0", \n    "Sec-Ch-Ua-Platform": "\\"macOS\\"", \n    "Sec-Fetch-Dest": "navigate", \n    "Sec-Fetch-Mode": "same-site", \n    "Sec-Fetch-Site": "?1", \n    "Sec-Fetch-User": "document", \n    "Upgrade-Insecure-Requests": "1", \n    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", \n    "X-Amzn-Trace-Id": "Root=1-67c96714-3743e02b5d263a805b887b9c"\n  }, \n  "origin": "129.126.150.210", \n  "url": "http

# StealthyFetcher

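- In a notebook, only `async_fetch` can be used: the synchronous `fetch` cannot run inside Jupyter's already-running asyncio event loop.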

In [61]:
from scrapling import StealthyFetcher

# Fetch a bot-detection test page, then show the HTTP status of the response.
page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection')
page.status

  funcs = list(self._events.get(event, OrderedDict()).values())
[2025-03-06 17:15:30] INFO: Fetched (200) <GET https://www.browserscan.net/bot-detection> (referer: https://www.google.com/search?q=browserscan)


200

# PlayWrightFetcher

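- Same as StealthyFetcher, i.e. only `async_fetch` can be used in a notebook.
- Unable to bypass Google's bot detection: the request below is redirected to Google's `/sorry/` page instead of the search results.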

In [72]:
from scrapling import PlayWrightFetcher

page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)  # Vanilla Playwright option
# No value is displayed for this lookup: the log shows the request ended on
# Google's /sorry/ page, so presumably nothing matches "#search a".
page.css_first("#search a::attr(href)")

[2025-03-06 17:38:46] INFO: Fetched (200) <GET https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3D%2522Scrapling%2522%26sei%3DJW3JZ_L6D63H4-EP_POE2AU&q=EgSBfpbSGKXapb4GIjD2KM1FR4nx3HACT1hG2Xla6TOjdDi1MbgU775PlEFeAqH4uqMs2Q3fDcTWL8IRAgwyAXJaAUM> (referer: https://www.google.com/search?q=google)


In [73]:
# Show the text of the page that was actually returned (Google's block notice,
# as the output below shows).
page.get_all_text()

'https://www.google.com/search?q=%22Scrapling%22&sei=JW3JZ_L6D63H4-EP_POE2AU\n\n\n\n\n\n\n\n\n\n  In order to continue, please enable javascript on your web browser.\n\n\n\nAbout this page\nWhy did this happen?\n\nThis page appears when Google automatically detects requests coming from your computer network which appear to be in violation of the \nTerms of Service\nLearn more'