# Introduction

This Jupyter Notebook adapts the sample code covered in the PyLadies Boston web scraping presentation. It is designed to run in Google Colab.

The Selenium sample does not work in Google Colab due to permission limitations.

# Prerequisite: Install Dependencies

## Step 1: Install Python dependencies

In [None]:
# Installs every library used by the samples below. bs4 is the PyPI alias that
# pulls in beautifulsoup4; nest-asyncio is applied in the Crawl4AI cell so that
# asyncio.run() can be called from inside the notebook.
!pip install crawl4ai Scrapy pandas bs4 selenium playwright requests nest-asyncio

## Step 2: Install system dependencies

In [None]:
!playwright install-deps  # Install the system packages the Playwright browsers depend on
!playwright install  # Download the Playwright browsers themselves

# Crawl4AI Sample

In [None]:
import nest_asyncio
import asyncio
from crawl4ai import *

nest_asyncio.apply() # Needed for Jupyter Notebook only

async def main():
    """Crawl the OpenAI Wikipedia article and print what Crawl4AI returned.

    Prints the result's ``markdown`` attribute followed by its ``pdf``
    attribute.
    """
    target_url = "https://en.wikipedia.org/wiki/OpenAI"

    # `async with` guarantees the crawler's exit hook runs even if the crawl raises.
    async with AsyncWebCrawler() as web_crawler:
        page = await web_crawler.arun(url=target_url)
        print(page.markdown)
        print(page.pdf)

asyncio.run(main())


# Pandas Sample

In [None]:
import pandas as pd

def main():
    """Print the first two HTML tables found on the OpenAI Wikipedia article."""
    # read_html parses every <table> on the page into its own DataFrame
    tables = pd.read_html("https://en.wikipedia.org/wiki/OpenAI")

    # Heading first, then the table at that position, in page order
    for heading, position in (("First table:", 0), ("Second table:", 1)):
        print(heading)
        print(tables[position])

if __name__ == "__main__":
    main()

# Beautiful Soup Sample

In [None]:
import requests
from bs4 import BeautifulSoup

def main():
    """Fetch the OpenAI Wikipedia article and print its company infobox table.

    Raises:
        requests.exceptions.RequestException: if the request fails, times out,
            or the server answers with a 4xx/5xx status.
    """
    # Make HTTP request to Wikipedia to retrieve HTML. requests applies no
    # timeout by default, so set one to keep the cell from hanging forever
    # on a stalled connection.
    response = requests.get("https://en.wikipedia.org/wiki/OpenAI", timeout=30)

    # Fail loudly on an error status instead of silently parsing an error page
    response.raise_for_status()

    # Load HTML into Beautiful Soup HTML parser
    soup = BeautifulSoup(response.content, 'html.parser')

    # Retrieve the first table using CSS Selectors
    infobox = soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output > table.infobox.ib-company.vcard")

    # select_one returns None when nothing matches; say so instead of printing "None"
    if infobox is None:
        print("Infobox table not found; the page layout may have changed.")
        return

    # Print the result
    print(infobox)


if __name__ == "__main__":
    main()

# Scrapy Sample

In [None]:
!scrapy startproject scrapy_sample # Create a new scrapy project

## Create a Scrapy "Spider"
Next, create a Python file (for example, `first_spider.py`) under `scrapy_sample/scrapy_sample/spiders/` and add the following code:

```python
'''
Step 1: Create a new Scrapy project

scrapy startproject scrapy_sample

Step 2: Create a new spider in the scrapy_sample/scrapy_sample/spiders/ folder
Step 3: Run the crawler

cd scrapy_sample
scrapy crawl first  # without saving
scrapy crawl first -O first.json # with saving

Step 4 (Optional): Update settings.py to limit how deep the crawl goes

DEPTH_LIMIT = 3

'''
import scrapy


class MyFirstSpider(scrapy.Spider):
    """Spider named "first" that yields title/author/year dicts from search results.

    Starts at a Toronto Public Library search for "python" and keeps following
    the "next page" link for as long as one is present.
    """

    name = "first"
    start_urls = [
        "https://www.torontopubliclibrary.ca/search.jsp?Ntt=python&Ndrs=",
    ]

    def clean(self, data):
        """Return ``data`` with surrounding whitespace stripped; ``None`` passes through."""
        return data if data is None else data.strip()

    def parse(self, response):
        """Yield one dict per ``div.description`` element, then a request for the next page."""
        # Output key -> CSS selector, evaluated relative to each result element
        field_selectors = {
            "title": "div.title.align-top > a > span::text",
            "author": "div.format-year > span::text",
            "year": "div.format-year > div > strong > span.date::text",
        }

        # More specific form noted by the author: div.description.small-9.medium-10.columns
        for entry in response.css("div.description"):
            yield {
                key: self.clean(entry.css(selector).get())
                for key, selector in field_selectors.items()
            }

        next_href = response.css("#search-bar-bottom > div > ul > li.pagination-next > a::attr(href)").get()
        if next_href is None:
            # No pagination link was found, so there is nothing more to follow
            return
        yield response.follow(next_href, callback=self.parse)
```

## Update Scrapy settings
Next, edit `scrapy_sample/scrapy_sample/settings.py` and replace its contents with the following:

```python
# Scrapy settings for scrapy_sample project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of the bot implemented by this Scrapy project
BOT_NAME = "scrapy_sample"

# Modules where Scrapy looks for spiders, and the module where
# `scrapy genspider` creates new ones
SPIDER_MODULES = ["scrapy_sample.spiders"]
NEWSPIDER_MODULE = "scrapy_sample.spiders"

# No add-ons are enabled
ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "scrapy_sample (+http://www.yourdomain.com)"

# robots.txt handling: False means the crawler does NOT honour robots.txt.
# This was switched off for the presentation; set it back to True unless you
# have permission to crawl the target site.
ROBOTSTXT_OBEY = False # changed for presentation
# Maximum link depth to follow from the start URLs (keeps the demo crawl short)
DEPTH_LIMIT = 3 # changed for presentation

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "scrapy_sample.middlewares.ScrapySampleSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "scrapy_sample.middlewares.ScrapySampleDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "scrapy_sample.pipelines.ScrapySamplePipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
```
## Run the crawler

Lastly, run the crawler!

In [None]:
# Each "!" line runs in its own subshell, so a bare "!cd" does not carry over
# to the next line. Chain the commands so the crawl runs inside the project
# directory (where scrapy.cfg lives) without changing the notebook's own
# working directory.
!cd scrapy_sample && scrapy crawl first # run the crawler

# Selenium Sample

NOTE: This example does not work in Colab. You will need to run Jupyter Notebook locally.



In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By

def main():
    """Open the OpenAI Wikipedia article in Chrome, click one infobox link, and report the result.

    Prints the URL the browser ends up on and the first 1000 characters of
    that page's source. The browser session is always shut down, even when a
    step fails (for example, when the XPath no longer matches the page).
    """
    # Use Chrome to visit webpages
    driver = webdriver.Chrome() # Will not work in Colab

    try:
        # Visit the OpenAI Wikipedia article
        driver.get("https://en.wikipedia.org/wiki/OpenAI")

        # Find link (anchor) element that goes to Sam Altman's Wiki page using XPATH
        sam_altman_link_element = driver.find_element(By.XPATH, '//*[@id="mw-content-text"]/div[1]/table[1]/tbody/tr[5]/td/div/ul/li[1]/a')

        # Visit Sam Altman's wiki page
        sam_altman_link_element.click()

        # Check what the current page is
        current_url = driver.current_url
        print(f"We are currently on: {current_url}")

        # Print content of current page (can be passed to Beautiful Soup HTML parser)
        print(f"{driver.page_source[0:1000]}...")
    finally:
        # quit() ends the whole WebDriver session and its browser process;
        # close() would only close the current window.
        driver.quit()

if __name__ == "__main__":
    main()