In [83]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from tabulate import tabulate
import re
import os
import math

## Start here
First, we query `papyri.info` for all Dura papyri with images. Then we scrape some initial data from the search results, including the papyri.info URLs.

In [6]:
# URL to query all papyri from Dura-Europos with images present.
# DOCS_PER_PAGE=1000 asks for everything on one page; if the collection ever
# grows beyond that, the extra results would need a second request.
QUERY_URL = "https://papyri.info/search?INT=on&EXT=on&PLACE=Dura+-+Europos&DOCS_PER_PAGE=1000"

# Fail fast instead of hanging forever or silently parsing an HTTP error page:
# an error page contains no result rows, which would otherwise only surface
# later as a confusing IndexError on an empty `row_dicts`.
response = requests.get(QUERY_URL, timeout=60)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'html.parser')

def parseResultRow(r):
    """Convert one search-result row into a flat dict of catalog fields.

    `r` is a parsed result row exposing a BeautifulSoup-style `find`; the
    elements it returns are read through `.text` and `.get(attribute)`.

    Returns a dict with the keys "Papyri.info URL", "Main identifier",
    "Other identifiers", "Title", "Location", "Date" and "Languages"
    (in that order). All values are strings.
    """
    # The record link carries query parameters; only the part before "?" is kept.
    link_path = r.find("a").get("href").split("?")[0]

    identifier_cell = r.find("td", class_="identifier")

    # Alternative identifiers sit in the cell's title attribute, separated by
    # "="; blank pieces (e.g. around a leading separator) are discarded.
    pieces = (piece.strip() for piece in identifier_cell.get("title").split("="))
    alternates = [piece for piece in pieces if piece]

    return {
        "Papyri.info URL": "https://papyri.info" + link_path,
        "Main identifier": identifier_cell.text,
        "Other identifiers": ",".join(alternates),
        "Title": r.find("div", class_="title-long").text,
        "Location": r.find("td", class_="display-place").text,
        "Date": r.find("td", class_="display-date").text,
        "Languages": r.find("td", class_="language").text.strip(),
    }


# Every hit in the search results is a <tr class="result-record">;
# turn each of them into a catalog dict.
result_rows = soup.find_all("tr", class_="result-record")
row_dicts = list(map(parseResultRow, result_rows))

In [7]:
# Preview the scraped catalog: the first entry of `table` is the header
# (taken from the first record's keys), followed by one entry per record.
table = [list(row_dicts[0].keys())]
table.extend(record.values() for record in row_dicts)
tabulate(table[1:], table[0], tablefmt='html')

Papyri.info URL,Main identifier,Other identifiers,Title,Location,Date,Languages
https://papyri.info/ddbdp/basp;56;100,basp 56 100,BASP 56,Roster?,Dura - Europos,200 CE - 230 CE,"la, la"
https://papyri.info/ddbdp/chla;6;310,chla 6 310,"P.Dura 55,P.CtYBR inv. DP 105 qua,yale 4401050000",File of Letters,Dura - Europos,218 CE - 220 CE,"grc, grc, la, la"
https://papyri.info/ddbdp/chla;6;311,chla 6 311,"P.Dura 56,P.CtYBR inv. DP 8 fol,yale 4400080000","Letters from Provincial Headquarters, Assigning Mounts",Dura - Europos,208 CE,"la, la"
https://papyri.info/ddbdp/chla;6;312,chla 6 312,"P.Dura 57,P.CtYBR inv. DP 32,yale 4400320000",Fragment of Letters,Dura - Europos,208 CE,"la, la"
https://papyri.info/ddbdp/chla;6;313,chla 6 313,"P.Dura 58,P.CtYBR inv. DP 63(B) qua,yale 4400634200",Copy of Letter Assigning Mounts,Dura - Europos,240 CE - 250 CE,"la, la"
https://papyri.info/ddbdp/chla;6;314,chla 6 314,"P.Dura 59,P.CtYBR inv. DP 7 qua,yale 4400070000",Letter from a Governor of Syria,Dura - Europos,241 CE,"la, la"
https://papyri.info/ddbdp/chla;6;315,chla 6 315,"ChLA 6 315,P.CtYBR inv. DP 4 qua,yale 4400040000",Lettres circulaires,Dura - Europos,208 CE,"la, la"
https://papyri.info/ddbdp/chla;6;316,chla 6 316,"ChLA 6 316,P.CtYBR inv. DP 18 qua,yale 4400180000",Lettre officielle concernant les «frumentations»,Dura - Europos,216 CE,"la, la"
https://papyri.info/ddbdp/chla;6;317,chla 6 317,"P.Dura 62,P.CtYBR inv. DP 33,yale 4400330000",Fragmentary Letter,Dura - Europos,216 CE - 220 CE,"la, la"
https://papyri.info/ddbdp/chla;6;318,chla 6 318,"P.Dura 63,P.CtYBR inv. DP 10(Second text) qua,yale 4400100032",Fragmentary Correspondence,Dura - Europos,211 CE,"la, la"


Now for each record, we try to fetch a Yale library image URL:

In [9]:
def _images_link(container):
    """Return the href of the link next to the "Images" header under `container`.

    Looks for a <th> whose text is exactly "Images" and takes the first <a>
    inside that header's parent element. Returns None when either the header
    or the link is missing, so callers can report it instead of crashing.
    """
    images_hdr = container.find("th", string="Images")
    if images_hdr is None:
        return None
    link = images_hdr.parent.find("a")
    return link.get("href") if link is not None else None


# For every catalog record, open its papyri.info page and remember the link to
# its image listing under r["Image URL"]. Records without any image link are
# reported and left without that key.
for r in row_dicts:
    print("Fetching", r["Papyri.info URL"], "... ", end="", flush=True)
    page = requests.get(r["Papyri.info URL"]).text
    print("done.")
    soup = BeautifulSoup(page, 'html.parser')

    apis_data = soup.find("div", class_="apis")
    if not apis_data:
        # No APIS block: fall back to any "Images" row anywhere on the page.
        print("    No Yale APIS listing found. Can we find any images at all? ", end="", flush=True)
        images_url = _images_link(soup)
        if images_url:
            r["Image URL"] = images_url
            print("Yes!")
        else:
            print("No.")
        continue

    images_url = _images_link(apis_data)
    if images_url:
        r["Image URL"] = images_url
        print("    Found APIS image URL.")
    else:
        print("    APIS listing has no image link.")

Fetching https://papyri.info/ddbdp/basp;56;100 ... done.
    No Yale APIS listing found. Can we find any images at all? Yes!
Fetching https://papyri.info/ddbdp/chla;6;310 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;311 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;312 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;313 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;314 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;315 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;316 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;317 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;318 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;319 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/chla;6;320 ... done.

Fetching https://papyri.info/ddbdp/p.dura;;45 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;49 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;51 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;52 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;53 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;59 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;62 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;63 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;65 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;67 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;68 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/p.dura;;69 ... done.
    Found APIS image URL.
Fetching https:/

Fetching https://papyri.info/ddbdp/rom.mil.rec;1;92 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;93 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;94 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;95 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;96 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;97 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;98 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;99 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;100 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;101 ... done.
    Found APIS image URL.
Fetching https://papyri.info/ddbdp/rom.mil.rec;1;103 ... done.
    Found APIS image URL.
Fetching https://papyri.info/

This gives us (hopefully) a link to the Yale Library listing page for each record. Next, we open each listing page and collect the links to the actual image files.

In [41]:
# Visit each record's image listing page and derive direct image URLs from the
# thumbnails shown there. The result is stored comma-separated under
# "Direct image URLs" -- the exact key the download step reads.
for r in row_dicts:
    listing_url = r.get("Image URL")
    if not listing_url or listing_url == "None":
        # No listing was found for this record ("None" is the placeholder the
        # catalog writer uses for missing values).
        print("No image listing URL for " + r["Papyri.info URL"] + ", skipping.")
        continue

    print("Fetching " + listing_url + " ... ", end = "", flush = True)
    page = requests.get(listing_url).text
    print("done.")
    soup = BeautifulSoup(page, 'html.parser')
    thumbnails_div = soup.find("div", class_="complexThumbnails")
    if thumbnails_div is None:
        print("    Couldn't find thumbnails div. Is this a non-Yale listing?")
        continue

    image_URLs = []
    for im in thumbnails_div.find_all("img"):
        # Each thumbnail src is expected to embed a "digcoll:<digits>/" id;
        # thumbnails without one cannot be mapped to an image and are skipped.
        coll_match = re.search(r"digcoll:[0-9]*/", im.get("src") or "")
        if coll_match is None:
            continue
        # NOTE(review): the "a/b/15000.jpg" suffix is kept as-is; presumably it
        # requests a large rendition -- confirm against the image server.
        image_URLs.append("https://imageserver.library.yale.edu/" + coll_match.group(0) + "a/b/15000.jpg")

    if image_URLs:
        print(f"    Added {len(image_URLs)} direct image URLS.")
        r["Direct image URLs"] = ",".join(image_URLs)
    else:
        print("    No usable thumbnails found on the listing page.")

Fetching http://hdl.handle.net/10079/digcoll/2771868 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771859 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771675 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771672 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771446 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771269 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771356 ... done.
    Added 6 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771547 ... done.
    Added 4 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771447 ... done.
    Added 4 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771449 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771363 ... don

Fetching http://hdl.handle.net/10079/digcoll/2771780 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771763 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771679 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771673 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771930 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771758 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771776 ... done.
    Added 2 direct image URLS.
Fetching http://wwwapp.cc.columbia.edu/ldpd/app/apis/item?mode=item&key=yale.apis.3100270000 ... done.
    Couldn't find thumbnails div. Is this a non-Yale listing?
Fetching http://hdl.handle.net/10079/digcoll/2771692 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771867 ... done.
    Added 2 direct i

Fetching http://hdl.handle.net/10079/digcoll/2771527 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771658 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771691 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771872 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771666 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771660 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771674 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771541 ... done.
    Added 1 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771459 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771267 ... done.
    Added 2 direct image URLS.
Fetching http://hdl.handle.net/10079/digcoll/2771677 ... don

Then we store the catalog in a tab-separated file, `catalog.tsv`:

In [115]:
# Use the union of keys across all records (in first-seen order) as columns.
# Records that failed a scraping step lack some keys, and taking the header
# from the first record alone would silently drop every column it lacks.
columns = list(dict.fromkeys(k for r in row_dicts for k in r))
with open("catalog.tsv", "w") as f:
    f.write("\t".join(columns) + "\n")
    for r in row_dicts:
        # Missing fields are written as the placeholder "None"; the in-memory
        # record itself is left untouched.
        f.write("\t".join([r.get(k, "None") for k in columns]) + "\n")

Now that we have a bunch of image URLs, we need to download the actual images and organize them on our filesystem.

**Note: if you have a `catalog.tsv` file, you can just resume from here without running the scraping.**

In [70]:
# Rebuild `row_dicts` from catalog.tsv: the first line holds the column names,
# every following line holds one record.
row_dicts = []
with open("catalog.tsv", "r") as f:
    # Strip only the line terminator. A bare strip() would also eat leading
    # and trailing tabs, dropping empty fields at the edges of a row and, at
    # the start of a row, shifting values into the wrong columns.
    keys = next(f).rstrip("\r\n").split("\t")
    for l in f:
        line = l.rstrip("\r\n")
        if not line:
            # A blank line is not a record.
            continue
        row_dicts.append(dict(zip(keys, line.split("\t"))))

Let's check if we can use the artifact IDs as unique identifiers in our system:

In [72]:
# Derive a filesystem-friendly ID for every record (spaces become underscores)
# and report any ID that occurs more than once.
ids_seen = []
for record in row_dicts:
    candidate = record["Main identifier"].replace(" ", "_")
    if candidate in ids_seen:
        print("Error: found duplicate ID " + candidate)
        continue
    ids_seen.append(candidate)

print(f"Found {len(ids_seen)} unique IDs for {len(row_dicts)} items. Storing in dict.")
# The ID is stored on every record, whether or not a duplicate was reported.
for record in row_dicts:
    record["Filesystem identifier"] = record["Main identifier"].replace(" ", "_")

Found 231 unique IDs for 231 items. Storing in dict.


And now we download all the files, cataloging them by the identifiers we've generated:

In [98]:
def mkdir(path):
    """Make sure the directory `path` exists and return `path`.

    Missing parent directories are created as well, and a directory that
    already exists is left untouched, so nested paths and re-runs are safe.
    """
    os.makedirs(path, exist_ok=True)
    return path

# Download every image into ./images/<filesystem id>/<pair number>/, where
# consecutive URLs are filed in pairs as front.jpg / back.jpg.
mkdir("./images")
for r in row_dicts:
    papyrus_dir = mkdir("./images/" + r["Filesystem identifier"])

    # Catalogs may carry either capitalization of this column name, and the
    # TSV round trip stores missing values as the literal text "None".
    url_field = r.get("Direct image URLs") or r.get("Direct Image URLs") or ""
    image_urls = [u for u in url_field.split(",") if u and u != "None"]
    if not image_urls:
        print(f"No image links for {papyrus_dir}, skipping.")
        continue

    for i, url in enumerate(image_urls):
        # Even index -> front of pair, odd index -> back of the same pair.
        pair_dir = mkdir(papyrus_dir + f"/{i // 2 + 1:02}")
        file_name = pair_dir + "/" + ("front.jpg" if (i%2==0) else "back.jpg")
        if os.path.exists(file_name):
            print(f"Skipping {file_name}")
            continue
        print(f"Fetching {file_name} ... ", end="", flush=True)
        response = requests.get(url)
        if not response.ok:
            # Never save an error page as a .jpg: the file would exist and be
            # skipped as "already downloaded" on every later run.
            print(f"failed (HTTP {response.status_code}).")
            continue
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print("done.")

Skipping ./images/basp_56_100/01/front.jpg
Skipping ./images/basp_56_100/01/back.jpg
Skipping ./images/chla_6_310/01/front.jpg
Skipping ./images/chla_6_310/01/back.jpg
Skipping ./images/chla_6_311/01/front.jpg
Skipping ./images/chla_6_311/01/back.jpg
Skipping ./images/chla_6_312/01/front.jpg
Skipping ./images/chla_6_312/01/back.jpg
Skipping ./images/chla_6_313/01/front.jpg
Skipping ./images/chla_6_314/01/front.jpg
Skipping ./images/chla_6_314/01/back.jpg
Skipping ./images/chla_6_315/01/front.jpg
Skipping ./images/chla_6_315/01/back.jpg
Skipping ./images/chla_6_315/02/front.jpg
Skipping ./images/chla_6_315/02/back.jpg
Skipping ./images/chla_6_315/03/front.jpg
Skipping ./images/chla_6_315/03/back.jpg
Skipping ./images/chla_6_316/01/front.jpg
Skipping ./images/chla_6_316/01/back.jpg
Skipping ./images/chla_6_316/02/front.jpg
Skipping ./images/chla_6_316/02/back.jpg
Skipping ./images/chla_6_317/01/front.jpg
Skipping ./images/chla_6_317/01/back.jpg
Skipping ./images/chla_6_317/02/front.jpg
S

Fetching ./images/p.dura_40/01/front.jpg ... done.
Fetching ./images/p.dura_40/01/back.jpg ... done.
Fetching ./images/p.dura_41/01/front.jpg ... done.
Fetching ./images/p.dura_41/01/back.jpg ... done.
Fetching ./images/p.dura_42/01/front.jpg ... done.
Fetching ./images/p.dura_42/01/back.jpg ... done.
Fetching ./images/p.dura_45/01/front.jpg ... done.
Fetching ./images/p.dura_45/01/back.jpg ... done.
Fetching ./images/p.dura_49/01/front.jpg ... done.
Fetching ./images/p.dura_49/01/back.jpg ... done.
Fetching ./images/p.dura_51/01/front.jpg ... done.
Fetching ./images/p.dura_51/01/back.jpg ... done.
Fetching ./images/p.dura_52/01/front.jpg ... done.
Fetching ./images/p.dura_52/01/back.jpg ... done.
Fetching ./images/p.dura_53/01/front.jpg ... done.
Fetching ./images/p.dura_59/01/front.jpg ... done.
Fetching ./images/p.dura_59/01/back.jpg ... done.
Fetching ./images/p.dura_62/01/front.jpg ... done.
Fetching ./images/p.dura_62/01/back.jpg ... done.
Fetching ./images/p.dura_62/02/front.jpg

Fetching ./images/rom.mil.rec_1_54/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_55/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_56/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_57/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_57/01/back.jpg ... done.
Fetching ./images/rom.mil.rec_1_60/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_61/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_62/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_62/01/back.jpg ... done.
Fetching ./images/rom.mil.rec_1_66/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_83/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_88/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_91/01/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_91/01/back.jpg ... done.
Fetching ./images/rom.mil.rec_1_91/02/front.jpg ... done.
Fetching ./images/rom.mil.rec_1_91/02/back.jpg ... done.
Fetching ./images/rom.mil.rec_1_92/01/front.jpg ... done.
Fetching ./images/

And write out to a file once more, so we have the filesystem IDs in the table too.

In [116]:
# Use the union of keys across all records (in first-seen order) as columns,
# so a column is kept even when the first record happens to lack it.
columns = list(dict.fromkeys(k for r in row_dicts for k in r))
with open("catalog.tsv", "w") as f:
    f.write("\t".join(columns) + "\n")
    for r in row_dicts:
        # Missing fields are written as the placeholder "None"; the in-memory
        # record itself is left untouched.
        f.write("\t".join([r.get(k, "None") for k in columns]) + "\n")