In [14]:
import glob
import json
import os
import os.path as op
import re
from typing import Tuple

import pandas as pd
import requests
import yaml

# Metadata tables consulted by the later cells.
# The catalogue CSV is read with explicitly supplied column names.
CATALOGUE_COLUMNS = ["ark", "jpg", "Year", "page"]

datacatal = pd.read_csv("./metadata/data-catalogue.csv", names=CATALOGUE_COLUMNS)
persee = pd.read_csv("./metadata/metadata-persee.csv")
gallica = pd.read_excel("./metadata/monographies-gallica.xlsx")

In [44]:
def get_or_download(ark):
    """Return the Gallica IIIF manifest for ``ark`` as parsed JSON, or ``None``.

    The manifest is read from ``./.manifests-cache/<ark>.json`` when that file
    exists. Otherwise it is fetched from gallica.bnf.fr, checked to be valid
    JSON, and written to the cache before being returned.

    Args:
        ark: Identifier inserted into both the cache file name and the
            manifest URL.

    Returns:
        The decoded manifest, or ``None`` when the download fails, the
        response is not JSON, or the cached file cannot be read or parsed.
    """
    fpath = f"./.manifests-cache/{ark}.json"
    if not op.exists(fpath):
        try:
            # A timeout keeps one stalled request from blocking the whole run.
            req = requests.get(
                f"https://gallica.bnf.fr/iiif/ark:/12148/{ark}/manifest.json",
                timeout=30,
            )
            req.raise_for_status()
        except requests.RequestException:
            return None
        try:
            # Validate before caching so a bad response is never persisted
            # (a cached bad file would make every later call return None).
            manifest = json.loads(req.text)
        except ValueError:
            return None
        # The cache directory may not exist yet on a fresh checkout.
        os.makedirs(op.dirname(fpath), exist_ok=True)
        with open(fpath, "w", encoding="utf-8") as f:
            f.write(req.text)
        return manifest
    try:
        with open(fpath, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # Unreadable or corrupt cache entry (JSONDecodeError is a ValueError).
        return None

In [16]:
# Keep only the last "/"-separated component of each identifier.
persee["sub"] = persee["identifier"].map(lambda identifier: identifier.rsplit("/", 1)[-1])
persee_ids = persee["sub"].tolist()

In [17]:
# Reduce the ark column to its last "/"-separated component and replace "é"
# with "e" in the image file names.
datacatal["ark"] = datacatal["ark"].map(lambda value: value.rsplit("/", 1)[-1])
datacatal["jpg"] = datacatal["jpg"].map(lambda name: name.replace("é", "e"))
dcjpg = datacatal["jpg"].tolist()
datacatal

Unnamed: 0,ark,jpg,Year,page
0,bpt6k1240055q,bienaime-feuardent_12148_bpt6k1240055q_f38.jpg,1866,f38
1,bpt6k1240055q,bienaime-feuardent_12148_bpt6k1240055q_f86.jpg,1866,f86
2,bpt6k1240127r,bienaime-feuardent_12148_bpt6k1240127r_f1.jpg,1899,f1
3,bpt6k1240127r,bienaime-feuardent_12148_bpt6k1240127r_f22.jpg,1899,f22
4,bpt6k12402755,bienaime-feuardent_12148_bpt6k12402755_f13.jpg,1870,f13
...,...,...,...,...
1421,CVP03250_19240411,Lair-Dubreuil_CVP03250_19240411_f6.jpg,1924,f6
1422,CVP03254_19250508,Lair-Dubreuil_CVP03254_19250508_f124.jpg,1925,f124
1423,CVP03254_19250508,Lair-Dubreuil_CVP03254_19250508_f1.jpg,1925,f1
1424,CVP03258_19250612,Lair-Dubreuil_CVP03258_19250612_f177.jpg,1925,f177


In [62]:
# Patterns applied to image file names.
#
# re_gallica: captures (identifier, number) from "<identifier>_f<number>_jpg"
#   (the "f" and the "e" of "jpeg" are optional). The identifier starts with
#   "b" followed by "d"/"p" characters and then ASCII letters or digits.
#   The character class previously read "A_Z", which matched only "A", "_" and
#   "Z" instead of the A-Z range.
# re_bpt: captures an identifier starting with "bpt6", "bd6t" or "cb", made of
#   lowercase letters and digits, and followed by "_".
# re_persee: captures the four-digit third "_"-separated field when it is
#   followed by a three-lowercase-letter field.
re_gallica = re.compile(r"(b[dp]+[A-Za-z0-9]+)_[f]?(\d+)_jp[e]?g")
re_bpt = re.compile(r"((?:bpt6|bd6t|cb)[a-z0-9]+)_")
re_persee = re.compile(r"^[^_]+_[^_]+_(\d{4})_(?:[a-z]{3})_")

# `data` collects one record per image in the dating loop. `theses`,
# `downloaded` and `is_gallica` are initialised here but are not used by the
# cells visible in this view.
data, theses = [], []
downloaded = {}
is_gallica = 0

# Project configuration; only its "names" entry is extracted here.
with open("data/data.yaml") as config_file:
    config = yaml.safe_load(config_file)
names = config["names"]

def year_string_to_year_certainty(year: str, default: int = None) -> Tuple[int, bool]:
    """Convert a catalogue date value into ``(year, certain)``.

    Accepted inputs and results:

    * ``None`` / NaN                 -> ``(default, False)``
    * ``int`` or integral ``float``  -> ``(int(year), True)``
    * all-digit string (``"1866"``)  -> ``(1866, True)``
    * range (``"1850-1860"``)        -> last bound, ``(1860, False)``
    * partial year padded with ``-``, ``.`` or ``?``
      (``"18-"``, ``"18.."``, ``"185?"``) -> missing digits become zeros,
      e.g. ``(1800, False)`` / ``(1850, False)``
    * anything else                  -> ``(default, False)``

    Args:
        year: Raw date value (string, number or ``None``).
        default: Year returned when no year can be extracted.

    Returns:
        A tuple of the year (an ``int``, or ``default``) and a flag that is
        ``True`` only when the input was an unambiguous single year.
    """
    if year is None:
        return default, False
    if isinstance(year, int):
        return year, True
    if isinstance(year, float):
        # NaN (pandas' missing value) and infinities are not integral.
        if year.is_integer():
            return int(year), True
        return default, False

    text = str(year).strip()
    if text.isdecimal():
        return int(text), True

    # "1850-1860": keep the upper bound. A trailing "-" is a placeholder for
    # unknown digits, not a range separator, so it is handled further down.
    if "-" in text and not text.endswith("-"):
        text = text.split("-")[-1].strip()

    # One to four known digits followed only by placeholder characters.
    # Padding to four digits (instead of substituting each placeholder) keeps
    # "18-" -> 1800 and "185." -> 1850 while avoiding "18--" -> 180000 or
    # "1850?" -> 18500.
    match = re.fullmatch(r"(\d{1,4})[-.?]*", text)
    if match is None:
        return default, False
    return int(match.group(1).ljust(4, "0")), False

# Build one record per image found under ./data/<subset>/<split>/images/.
# Each rule below recognises one source from the subset or file name and
# appends a record with the file name, subset, split, year and a
# "dating-certainty" flag. Images matching no rule are not recorded.
# The glob result is sorted so the row order of `data` is stable between runs.
for file in sorted(glob.glob("./data/*/*/images/*")):
    _, _, subset, split, *_ = file.split("/")
    base, _ = op.splitext(op.basename(file))

    defaults = {"file": op.basename(file), "subset": subset, "split": split}

    if base.startswith("theatre"):
        # The second "_"-separated field of the file name carries the year.
        year, certainty = year_string_to_year_certainty(base.split("_")[1], default=1600)
        data.append({**defaults, "year": year, "dating-certainty": certainty})
        continue

    if subset == "monographies":
        arks = re_bpt.findall(base)
        # A name without an ark falls through to the later rules, exactly
        # like an ark that is absent from the Gallica sheet.
        if arks:
            entry = gallica[gallica.URL == f"https://gallica.bnf.fr/ark:/12148/{arks[0]}"]
            if not entry.empty:
                year, certainty = year_string_to_year_certainty(entry.iloc[0].to_dict()["DATES"])
                data.append({**defaults, "year": year, "dating-certainty": certainty})
                continue

    if base.startswith("PG0"):
        data.append({**defaults, "year": 1857, "dating-certainty": False})
        continue

    if subset == "fingers":
        data.append({**defaults, "year": 2000, "dating-certainty": False})
        continue

    if subset == "these":
        date = base.replace("these_", "")[:4]
        if date.isnumeric():
            data.append({**defaults, "year": int(date), "dating-certainty": True})
        else:
            # seems like handle based identifiers are all 2019
            data.append({**defaults, "year": 2019, "dating-certainty": False})
        continue

    if subset == "typewriter":
        year = base.replace("Tapuscrit_", "")[:4]
        year, certainty = (int(year), True) if year.isnumeric() else (None, False)
        data.append({**defaults, "year": year, "dating-certainty": certainty})
        continue

    if subset == "picard":
        data.append({**defaults, "year": int(base.split("_")[2]), "dating-certainty": True})
        continue

    if "magazineJV" in base:
        date = base.replace("magazineJV_", "")[:4]
        year = int(date) if date.isnumeric() else None
        # A missing year cannot be a certain one (same rule as "typewriter").
        data.append({**defaults, "year": year, "dating-certainty": year is not None})
        continue

    if "Le_Prince_Dgem_chronique_dauphinoise" in base:
        # This title is recorded under the "monographies" subset.
        data.append({**defaults, "subset": "monographies", "year": 1860, "dating-certainty": True})
        continue

    if subset == "persee":
        year = re_persee.findall(base)[0]
        data.append({**defaults, "year": year_string_to_year_certainty(year)[0], "dating-certainty": True})
        continue

    if subset == "catalogue":
        # regex=False: the file name is matched literally, so characters such
        # as "." or "(" in a name are not interpreted as a pattern.
        rows = datacatal[datacatal["jpg"].str.contains(base, regex=False)]
        if not rows.empty:
            year, certainty = year_string_to_year_certainty(rows.iloc[0].to_dict()["Year"])
            data.append({**defaults, "year": None if year is None else int(year), "dating-certainty": certainty})
            continue

        # Not listed in the catalogue sheet: fall back to the IIIF manifest.
        arks = re_bpt.findall(base)
        manifest = get_or_download(arks[0]) if arks else None
        if not manifest:
            data.append({**defaults, "year": None, "dating-certainty": False})
            continue

        date = next(
            (elem["value"] for elem in manifest.get("metadata", []) if elem["label"] == "Date"),
            None,
        )
        year, certainty = year_string_to_year_certainty(date)
        # The manifest may carry no "Date" entry, in which case year is None.
        data.append({**defaults, "year": None if year is None else int(year), "dating-certainty": certainty})
        continue
            

In [63]:
# Write the collected records to metadata.csv without the index column.
metadata_frame = pd.DataFrame(data)
metadata_frame.to_csv("metadata.csv", index=False)

In [65]:
# Number of records with a non-null year per subset and split, as markdown.
split_counts = (
    pd.DataFrame(data)
    .pivot_table(index="subset", values="year", columns="split", aggfunc="count")
    .fillna(0)
)
print(split_counts[["train", "valid", "test"]].to_markdown())

| subset        |   train |   valid |   test |
|:--------------|--------:|--------:|-------:|
| catalogue     |    1071 |     265 |     89 |
| fingers       |      51 |       6 |      3 |
| magazine-tech |     187 |      30 |     20 |
| monographies  |    1689 |     203 |    101 |
| others        |       5 |       1 |      0 |
| persee        |     799 |      97 |     47 |
| picard        |      87 |       6 |      4 |
| theatre       |     497 |      61 |     62 |
| these         |     463 |      69 |     39 |
| typewriter    |      78 |       9 |      8 |


In [None]:

    # NOTE(review): this cell has no execution count and cannot run on its own:
    # every line is indented and it uses `continue`, but the cell contains no
    # loop header. It reads `file` and `base`, which it does not assign itself.
    # It looks like an earlier draft of the per-image dating logic; confirm
    # before deleting.
    if "Picard_Concours_" in base:
        print(base)
        date = base.replace(("Picard_Concours_"), "")[:4]
        data.append({
            "file": op.basename(file),
            "year": int(date) if date.isnumeric() else None,
            "dating-certainty": True,
            "subset": "Agence Picard"
        })
    elif "sohim_0398-3811_2007_edc_34_1_907_0457" in base:
        data.append({
            "file": op.basename(file),
            "year": 2007,
            "dating-certainty": True,
            "subset": "Persée"
        })
    else:
        # NOTE(review): `pid` is not assigned in any cell visible here.
        if pid in persee_ids:
            data.append({
                "file": op.basename(file),
                "year": persee[persee["sub"] == base].iloc[0].to_dict()["date"],
                "dating-certainty": True,
                "subset": "Persée"
            })
            continue
        if "catalogue" in file:
            year = datacatal[datacatal["jpg"] == op.basename(file)]
            if not year.empty:
                year = year.iloc[0].to_dict()["Year"]
                certainty = True
                # A range keeps its last bound; ".." placeholders become zeros.
                # Both cases mark the date as uncertain.
                if "-" in year:
                    year = year.split("-")[-1]
                    certainty = False
                if year.endswith(".."):
                    year = year.replace(".", "0")
                    certainty = False
                # NOTE(review): this record uses the keys "dating"/"certainty",
                # while the other records in this cell use
                # "year"/"dating-certainty".
                data.append({
                    "file": op.basename(file),
                    "dating": int(year),
                    "certainty": certainty,
                    "subset": "DataCatalogue"
                })   
                continue
        else:
            if group := re_bpt.findall(base):
                group = group[0]
                year = datacatal[datacatal["ark"] == group].head(1)["Year"]
                if not year.empty:
                    year = year.tolist()[0]
                    data.append({
                        "file": op.basename(file),
                        "year": datacatal[datacatal["ark"] == group].head(1)["Year"].iloc[0],
                        "dating-certainty": True,
                        "subset": "DataCatalogue"
                    })
                else:
                    certainty = True
                    # No catalogue row for this ark: use the manifest instead.
                    j = get_or_download(group)
                    if not j:
                        data.append({
                            "file": op.basename(file),
                            "year": None,
                            "dating-certainty": certainty,
                            "subset": "Monographies"
                        })
                    else:
                        # Use the first metadata entry labelled "Date"; ranges
                        # and "."/"?" placeholders mark the date as uncertain.
                        for elem in j["metadata"]:
                            if elem["label"] == "Date":
                                year = elem["value"]
                                if "-" in year:
                                    year = year.split("-")[-1]
                                    certainty = False
                                if "." in year:
                                    year = year.replace(".", "0")
                                    certainty = False
                                if "?" in year:
                                    year = year.replace("?", "0")
                                    certainty = False
                                year = int(year)
                                break
                        data.append({
                            "file": op.basename(file),
                            "year": year,
                            "dating-certainty": certainty,
                            "subset": "Monographies"
                        })
                continue # print(base)
            else:
                print(file)