####   This script scrapes and collects information from a collection search page on the Smithsonian Institution's website, storing the data in Excel files. It includes a DataSheet class to handle Excel file creation, row addition, and saving, ensuring that headers are properly included. The get_info function extracts detailed information about individual items from their specific pages, parsing the HTML to gather attributes and their values. The MyThread function manages the scraping process by initializing an Excel sheet, iterating through search result pages, extracting item details, and updating the sheet. Finally, the script divides the total number of pages to be processed across multiple threads, facilitating concurrent data collection and organization.

In [None]:
import math
import os.path
import threading

import openpyxl
import requests
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup

# Column titles written to row 1 of every output workbook, in column order.
# MyThread fills the first column with the item name taken from the search
# page; every later title is used as a lookup key into the dict returned by
# get_info, so the spelling here must match the headings on the item pages.
sheetHeaders = [
    "Name",
    "Provenance",
    "Collection",
    "Previous custodian or owner",
    "Origin",
    "Credit Line",
    "Type",
    "Restrictions and Rights",
    "Period",
    "Geography",
    "Material",
    "Dimension",
    "EDAN ID",
]


class DataSheet(object):
    """Wrapper around one openpyxl workbook that collects scraped rows.

    Row 1 of the active sheet holds the column titles from the module-level
    ``sheetHeaders`` list. The cell directly to the right of the last title
    in row 1 doubles as a progress marker: ``add_last_page`` writes it and
    ``get_last_page`` reads it back.
    """

    def __init__(self, filename):
        """Open ``filename``, creating it with a header row if it is missing.

        :param filename: path of the ``.xlsx`` file backing this sheet.
        """
        self.filename = filename

        is_new = not os.path.exists(self.filename)
        if is_new:
            self.wb = openpyxl.Workbook()
        else:
            self.wb = openpyxl.load_workbook(self.filename)
        self.sheet = self.wb.active

        # Decide on the content of A1 rather than on max_row: max_row is 1
        # both for an empty sheet and for a sheet that holds only the header
        # row, so testing it would add a second header row when such a file
        # is reopened.
        if self.sheet.cell(row=1, column=1).value is None:
            # Explicit coordinates keep the titles in row 1 regardless of
            # where append() would place its next row.
            for column, title in enumerate(sheetHeaders, start=1):
                self.sheet.cell(row=1, column=column, value=title)

        if is_new:
            # Materialise the file immediately so it exists on disk as soon
            # as the object has been constructed.
            self.wb.save(self.filename)

    def add_row(self, data):
        """Append ``data`` (an iterable of cell values) below the last row."""
        if self.sheet is not None:
            self.sheet.append(data)

    def get_last_page(self):
        """Return the stored progress marker, or ``None`` if none was written.

        The value comes back as it is stored in the workbook;
        ``add_last_page`` stores it as a string.
        """
        if self.sheet is not None:
            return self.sheet.cell(row=1, column=len(sheetHeaders) + 1).value
        return None

    def add_last_page(self, value):
        """Store ``str(value)`` as the progress marker next to the headers."""
        if self.sheet is not None:
            self.sheet.cell(row=1, column=len(sheetHeaders) + 1).value = str(value)

    def save(self):
        """Write the workbook back to ``self.filename``."""
        if self.wb is not None:
            self.wb.save(self.filename)


def _joined_nested_divs(li):
    """Return the stripped texts of the divs nested in ``li``'s first div, one per line."""
    return "\n".join([div.text.strip() for div in li.find("div").find_all("div")])


def _first_div_text(li):
    """Return the stripped text of ``li``'s first div."""
    return li.find("div").text.strip()


def _read_attribute_list(html, css_class, read_content):
    """Collect heading -> content pairs from the ``<li>`` items under ``div.css_class``.

    A missing section yields an empty dict; an item whose heading or content
    cannot be read is skipped. Every failure is printed, never raised.
    """
    pairs = {}
    try:
        items = html.find("div", class_=css_class).find("ul").find_all("li")
    except Exception as e:
        # find() returns None when the section (or its <ul>) is absent.
        print(str(e))
        return pairs
    for li in items:
        try:
            head = li.find("h3").text.strip()
            content = read_content(li)
        except Exception as e:
            print(str(e))
            continue
        pairs[str(head)] = content
    return pairs


def get_info(url, timeout=30):
    """Fetch one item's detail page and return its attributes as a dict.

    Two lists on the page are read: the items under
    ``div.individual-object-details`` and those under
    ``div.individual-object-at-a-glance__attributes``. Each ``<li>``
    contributes one entry whose key is the text of its ``<h3>``. The two
    sections are parsed independently, so a page lacking one of them still
    yields the attributes of the other.

    This is best effort: errors are printed and never raised.

    :param url: address of the item's detail page.
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot block the calling thread indefinitely.
    :return: dict mapping attribute heading to its text; empty if the page
        could not be fetched or parsed.
    """
    data = {}
    try:
        response = requests.get(url, timeout=timeout)
        html = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        print(str(e))
        return data

    # Same order as before: on a duplicate heading the "at a glance" value wins.
    data.update(_read_attribute_list(
        html, "individual-object-details", _joined_nested_divs))
    data.update(_read_attribute_list(
        html, "individual-object-at-a-glance__attributes", _first_div_text))
    return data


def MyThread(start_page, num):
    """Scrape a slice of search-result pages into ``./search_<start_page>.xlsx``.

    Pages ``start_page`` .. ``start_page + num - 1`` are processed in order.
    For every result tile on a page the item's detail page is fetched with
    ``get_info`` and one row is appended to the workbook. After each page the
    page number is stored as a progress marker and the workbook is saved, so
    an interrupted run can be restarted and will continue after the last
    saved page.

    A failure while processing a page is printed and the loop moves on to the
    next page; nothing is raised to the caller.

    :param start_page: first page of this worker's slice; also used in the
        output file name.
    :param num: number of pages in the slice.
    """
    dataSheet = DataSheet("./search_%d.xlsx" % start_page)
    lastPage = dataSheet.get_last_page()
    if lastPage is None or lastPage == "":
        first_page = start_page
    else:
        print(lastPage)
        # The marker names the last page whose rows are already saved, so
        # resume on the following page; restarting on the marker itself
        # would append that page's rows a second time.
        first_page = int(lastPage) + 1

    for page in range(first_page, start_page + num):
        try:
            # NOTE(review): `page` is sent as the `listStart` query value;
            # confirm the site treats it as a page index and not as an item
            # offset.
            url = f"https://asia.si.edu/explore-art-culture/collections/search/?edan_q=Charles+Lang+Freer&listStart={page}"
            # The timeout keeps a stalled connection from blocking this worker forever.
            response = requests.get(url, timeout=30)
            html = BeautifulSoup(response.text, "html.parser")
            for div in html.find_all("div", class_="search-results-image-grid__result"):
                a = div.find("a", class_="secondary-link")
                if a is None or not a.get("href"):
                    # Skip only this tile instead of abandoning the whole page.
                    print(f"page {page}: result without a link, skipped")
                    continue
                name = a.text.strip().replace('"', '')
                sub_url = "https://asia.si.edu" + a.get("href")
                result = get_info(sub_url)
                # First column is the name from the search page; the other
                # columns are looked up by title, blank when absent.
                sheetData = [name] + [result.get(header, "") for header in sheetHeaders[1:]]
                dataSheet.add_row(sheetData)
            dataSheet.add_last_page(page)
            dataSheet.save()
        except Exception as e:
            print(str(e))
            continue
    print(dataSheet.filename + " finished!")


if __name__ == "__main__":
    # Split the page range into equal contiguous slices, one per worker thread.
    thread_num = 6
    # Presumably total search hits divided by hits per page -- TODO confirm.
    # For a short trial run, override page_num with a small value such as 12.
    page_num = math.ceil(13590 / 12)
    page_num_per_thread = math.ceil(page_num / thread_num)

    for i in range(thread_num):
        first_page = page_num_per_thread * i
        try:
            xd = threading.Thread(
                target=MyThread,
                args=(first_page, page_num_per_thread),
            )
            # Launch the worker thread.
            xd.start()
        except Exception as e:
            print("run " + str(e))

