[Reference](https://medium.com/@datajournal/web-scraping-with-autoscraper-aaf6128c9ca2)

In [1]:
pip install autoscraper pandas

Collecting autoscraper
  Downloading autoscraper-1.1.14-py3-none-any.whl.metadata (5.3 kB)
Collecting bs4 (from autoscraper)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading autoscraper-1.1.14-py3-none-any.whl (10 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, autoscraper
Successfully installed autoscraper-1.1.14 bs4-0.0.2


# Import Libraries

In [2]:
from autoscraper import AutoScraper
import pandas as pd

# Define the Target URL and Example Data

In [3]:
# Page the scraper is trained on.
url = "http://books.toscrape.com/"

# One example value per field to extract; the later cells label the
# resulting columns Title, Price and Rating.
wanted_list = [
    "A Light in the Attic",
    "£51.77",
    "Three",
]

# Build the Scraper

In [4]:
# Learn extraction rules from the example values in wanted_list.
scraper = AutoScraper()

# Left as the cell's last expression so the matched values are displayed.
scraper.build(url=url, wanted_list=wanted_list)

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas",
 '£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15',
 '£13.99',
 '£20.66',
 '£17.46',
 '£52.29',
 '£35.02',
 '£57.25',
 '£23.88',
 '£37.

# Reviewing the Results

In [5]:
# grouped=True yields a dict keyed by rule id, so each rule's matches can be
# inspected separately.
results = scraper.get_result_similar(url=url, grouped=True)

# List the rule ids that produced matches on this page.
print("Keys found by the scraper:", results.keys())

Keys found by the scraper: dict_keys(['rule_wqrg', 'rule_jo2w', 'rule_vws4'])


## Organize and Store the Data

In [6]:
# Column labels, in the order the example values were given in wanted_list.
columns = ["Title", "Price", "Rating"]

# Fail with a clear message rather than an IndexError when the scraper
# learned fewer rules than there are columns to fill.
if len(results) < len(columns):
    raise ValueError(
        f"Expected at least {len(columns)} rules, got {len(results)}: {list(results)}"
    )

# Pair each label with one rule's matches by position; extra rules are ignored.
# NOTE(review): this assumes the rules come back in the same order as the
# example values in wanted_list -- confirm against the printed rule keys.
data = dict(zip(columns, results.values()))
df = pd.DataFrame(data)

## Save Data to CSV

In [7]:
# Write the table to disk without the positional index column, then confirm.
df.to_csv(path_or_buf='books_data.csv', index=False)
print("Data saved to books_data.csv")

Data saved to books_data.csv


# Scraping Paginated Content

## Update URL and Sample Data

In [8]:
# Catalogue pages 1 and 2 (the upper bound of range is exclusive).
urls = [
    f"http://books.toscrape.com/catalogue/page-{page_number}.html"
    for page_number in range(1, 3)
]

## Scrape Each Page and Combine the Results

In [10]:
# Apply the already-built scraper to every URL in `urls`, collecting one
# DataFrame per page.
all_data = []
for page_url in urls:
    # Loop-local names keep the earlier `results` / `data` variables intact.
    page_results = scraper.get_result_similar(page_url, grouped=True)

    # Fail with a clear message rather than an IndexError when a page yields
    # fewer rule groups than there are columns to fill.
    if len(page_results) < len(columns):
        raise ValueError(
            f"{page_url}: expected at least {len(columns)} rules, "
            f"got {len(page_results)}: {list(page_results)}"
        )

    # Pair each column label with one rule's matches by position.
    # NOTE(review): assumes the rule order matches the column order -- confirm.
    page_data = dict(zip(columns, page_results.values()))
    all_data.append(pd.DataFrame(page_data))

# Concatenate once after the loop; ignore_index renumbers the rows from 0.
full_data = pd.concat(all_data, ignore_index=True)
full_data.to_csv('books_data_paginated.csv', index=False)

# Using AutoScraper for Complex Websites

## Define URL and Sample Data

In [11]:
# Target page and example values for the movie walkthrough.
# NOTE(review): the host looks like a placeholder -- swap in a real page
# before running.
url = "https://sample-movie-site.com/movies"

# One example value per field; the extraction cell labels the columns
# Title, Year and Rating.
wanted_list = [
    "Inception",
    "2010",
    "8.8",
]

## Train and Prune Rules

In [13]:
# Learn rules for the movie page from the example values.
scraper.build(url=url, wanted_list=wanted_list)

# Sample rule names for data columns.
# NOTE(review): these ids are samples and presumably need replacing with the
# keys reported by get_result_similar(url, grouped=True) for this build --
# confirm before running.
rules_to_keep = [
    'rule_1kq7',
    'rule_a5xp',
    'rule_9vbn',
]
scraper.keep_rules(rules_to_keep)

# Persist the pruned rule set so it can be reloaded later.
scraper.save('movies_model.json')

## Extract Data with Trained Model

In [14]:
# Restore the saved rules, then apply them to the movie page.
scraper.load('movies_model.json')
results = scraper.get_result_similar(url, grouped=True)

# Define columns based on rules and organize data
columns = ["Title", "Year", "Rating"]

# Fail with a clear message rather than an IndexError when the loaded model
# yields fewer rule groups than there are columns to fill.
if len(results) < len(columns):
    raise ValueError(
        f"Expected at least {len(columns)} rules, got {len(results)}: {list(results)}"
    )

# Pair each label with one rule's matches by position; extra rules are ignored.
# NOTE(review): assumes the rule order matches the column order -- confirm.
data = dict(zip(columns, results.values()))
df = pd.DataFrame(data)
df.to_csv('movies_data.csv', index=False)