# Colab-specific setup

In [21]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there by path.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm
# before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
%cd "drive/MyDrive/Summer 2022 GSoC/gsoc-su22-eda/pdf"

[Errno 2] No such file or directory: 'drive/MyDrive/Summer 2022 GSoC/gsoc-su22-eda/pdf'
/content/drive/MyDrive/Summer 2022 GSoC/gsoc-su22-eda/pdf


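We first locate the pubs that are matched to primary publications, i.e. those whose author last name and year both appear in the name of a folder under `ocr-output/`.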

In [5]:
import pandas as pd

In [6]:
# Load the publications, the authors, and the author–publication link table
# (the link table is joined on publication_id / author_id further down).
pubs, authors, pubs_authors = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../merged_pubs.csv",
        "../authors.csv",
        "../authors_publications.csv",
    )
)

In [7]:
# Replace missing publication years and author last names with the literal
# string "NULL" so both columns hold a value on every row.
for frame, column in ((pubs, "year"), (authors, "last")):
    frame[column] = frame[column].fillna("NULL")

In [8]:
# Join publications to their authors through the link table. Both joins are
# inner joins, so publications without a linked author row are dropped.
pubs_and_authors = (
    pubs
    .merge(pubs_authors, how="inner", left_on="id", right_on="publication_id")
    .merge(authors, how="inner", left_on="author_id", right_on="id")
)

In [10]:
import os

# Names of every entry directly under the OCR output directory, in sorted order.
all_pdf_folders = sorted(os.listdir("../../ocr-output/"))

In [11]:
# Map each publication id (column "id_x" of the merged frame) to an OCR output
# folder whose name contains both the author's last name and the publication year.
id_folder_map = {}
for pub_id, last_name, pub_year in zip(
    pubs_and_authors["id_x"], pubs_and_authors["last"], pubs_and_authors["year"]
):
    # `x in some_str` raises TypeError unless x is a str, so coerce both keys.
    author_name = str(last_name)
    if isinstance(pub_year, float) and pub_year.is_integer():
        # A numeric year column is read as float (1990.0); match it as "1990".
        year = str(int(pub_year))
    else:
        year = str(pub_year)
    for folder in all_pdf_folders:
        if (author_name in folder) and (year in folder):
            # No break on purpose: when several folders match, the last one in
            # list order wins, as before.
            id_folder_map[pub_id] = folder

In [13]:
import re
import pandas as pd

In [14]:
# Read the proveniences table indexed by its `id` column and keep only the
# `provenience` column (still as a one-column DataFrame).
proveniences = pd.read_csv("../proveniences.csv", index_col = 'id').loc[:, ["provenience"]]

In [15]:
# Split values of the form "<ancient> (mod. <modern>)" into (ancient, modern)
# tuples. findall returns a list of matches per row; only the first is kept,
# and rows without a match end up as NaN.
prov_names = (
    proveniences["provenience"]
    .str.findall(r"(.+) \(mod\. (.+)\)")
    .str.get(0)
)

In [16]:
# First element of each (ancient, modern) tuple.
proveniences["ancient_name"] = prov_names.str.get(0)

In [17]:
# Second element of each (ancient, modern) tuple.
proveniences["modern_name"] = prov_names.str.get(1)

In [18]:
# Manual corrections, keyed by provenience id.
# NOTE(review): presumably these are rows whose text the "(mod. ...)" pattern
# could not split — confirm against proveniences.csv.

# errors="ignore" keeps the cell re-runnable: a plain drop(207) raises KeyError
# on the second run because the row is already gone.
proveniences = proveniences.drop(207, errors="ignore")
proveniences.loc[242, "ancient_name"] = "Qattara"
proveniences.loc[242, "modern_name"] = "Tell al Rimah"
proveniences.loc[360, "ancient_name"] = "Kian"
proveniences.loc[360, "modern_name"] = "Tell Shmid"

In [19]:
def get_txt_as_str(col):
    """Concatenate every entry of a pandas Series into one string.

    Missing values are treated as empty strings, and every entry (including
    the last one) is followed by a single space.

    Args:
        col: pandas Series of text. Non-string entries (e.g. a column that
            pandas parsed as numeric) are converted with ``str``.

    Returns:
        The concatenated text; ``""`` for an empty Series.
    """
    pieces = col.fillna("").astype(str)
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string on every `+=`.
    return "".join(piece + " " for piece in pieces)

In [20]:
def get_prov_occurences(full_txt, prov):
    """Count how often each provenience is mentioned in a text.

    For every row of ``prov`` the whole-word occurrences of ``ancient_name``
    and of ``modern_name`` in ``full_txt`` are counted and summed.

    Args:
        full_txt: The text to search.
        prov: DataFrame with ``ancient_name`` and ``modern_name`` columns.
            It is modified in place: a ``total_occurences`` column is added
            (or overwritten).

    Returns:
        The same ``prov`` object, with the ``total_occurences`` column set.
    """

    def count_whole_word(name):
        # Missing or empty names and the "uncertain" placeholder never count.
        if not isinstance(name, str) or name in ("", "uncertain"):
            return 0
        # re.escape makes the name match literally, so "." or "(" in a name are
        # not read as regex syntax. The lookarounds behave like \b for names
        # that start and end with a word character, and still work for names
        # that start or end with punctuation.
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        return len(re.findall(pattern, full_txt))

    prov["total_occurences"] = [
        count_whole_word(ancient) + count_whole_word(modern)
        for ancient, modern in zip(prov["ancient_name"], prov["modern_name"])
    ]
    return prov

In [None]:
# Start offset into the sorted folder list for the occurrence-counting loop
# (used there as a slice start); 0 means start from the first folder.
pgrs = 0

In [None]:
# Resume from the checkpoint written by the occurrence-counting loop, if any.
try:
    with open("progress.txt", "r") as f:
        pgrs = int(f.read())
except FileNotFoundError:
    # First run: no checkpoint has been written yet, so start from the beginning
    # instead of crashing.
    pgrs = 0

In [None]:
# Sorted list of the matched folder names. Duplicates are kept when several
# publication ids map to the same folder.
all_folders = sorted(id_folder_map.values())

In [None]:
# For every matched folder, count provenience mentions in its OCR text and write
# them to <folder>/occurences.csv, most-mentioned first.
# `pgrs` is used as an index into `all_folders` when resuming, so it must advance
# for every folder visited, including those skipped for lack of a page.csv.
# Otherwise the checkpoint falls behind the real position and finished folders
# are processed again after a restart.
for position, subfolder in enumerate(all_folders[pgrs:], start=pgrs):
    page_csv = f"../../ocr-output/{subfolder}/page.csv"
    if os.path.exists(page_csv):
        page = pd.read_csv(page_csv)
        txt = get_txt_as_str(page["text"])
        # Work on a copy so the shared proveniences frame is never modified.
        occur = get_prov_occurences(txt, proveniences.copy())
        occur_sorted = occur.sort_values("total_occurences", ascending = False)
        occur_sorted.to_csv(f"../../ocr-output/{subfolder}/occurences.csv")
    pgrs = position + 1
    # Checkpoint after every folder so an interrupted run can resume.
    with open("progress.txt", "w") as f:
        f.write(str(pgrs))

Generate `publication_proveniences.csv`, which links each publication to the proveniences mentioned in its OCR text.

In [None]:
# Index publications by id. The guard keeps the cell re-runnable: once "id" has
# become the index it is no longer a column, and set_index("id") raises KeyError.
if "id" in pubs.columns:
    pubs = pubs.set_index("id")

In [None]:
# Collect one [publication_id, provenience_id] pair for every provenience that
# is mentioned at least once in the publication's OCR text.
prov_pubs = []
for pub_id, folder_name in id_folder_map.items():
    try:
        occurence_csv = pd.read_csv(f"../../ocr-output/{folder_name}/occurences.csv")
    except FileNotFoundError:
        # Folders without a page.csv were skipped by the counting loop, so they
        # have no occurences.csv; they contribute no pairs.
        continue
    related_provs = occurence_csv.loc[occurence_csv["total_occurences"] > 0, "id"]
    prov_pubs.extend([pub_id, prov_id] for prov_id in related_provs)

In [None]:
# Turn the collected [publication_id, provenience_id] pairs into a two-column frame.
prov_pubs_df = pd.DataFrame(
    data=prov_pubs,
    columns=["publication_id", "provenience_id"],
)

In [29]:
# Separate frame that the following cells extend with further columns;
# prov_pubs_df itself is left as it is.
prov_pubs_df_complete = prov_pubs_df.loc[:, ["publication_id", "provenience_id"]].copy()

In [32]:
# The output table uses the column name "entity_id" for the provenience id.
prov_pubs_df_complete = prov_pubs_df_complete.rename(
    columns={"provenience_id": "entity_id"}
)

In [34]:
# Append an empty "exact_reference" column. Plain assignment appends a new column
# at the end just like insert(loc=len(columns)), but can be re-run: insert raises
# ValueError when the column already exists.
prov_pubs_df_complete["exact_reference"] = ""

In [35]:
# Append a "publication_type" column set to "citation" on every row. Plain
# assignment appends a new column at the end just like insert(loc=len(columns)),
# but can be re-run: insert raises ValueError when the column already exists.
prov_pubs_df_complete["publication_type"] = "citation"

In [36]:
# Append an empty "publication_comments" column. Plain assignment appends a new
# column at the end just like insert(loc=len(columns)), but can be re-run: insert
# raises ValueError when the column already exists.
prov_pubs_df_complete["publication_comments"] = ""

In [38]:
# Append a "table_name" column set to "proveniences" on every row. Plain
# assignment appends a new column at the end just like insert(loc=len(columns)),
# but can be re-run: insert raises ValueError when the column already exists.
prov_pubs_df_complete["table_name"] = "proveniences"

In [40]:
# Write the link table to the current directory. index=True is the to_csv default
# and is spelled out here: the frame's row index is written as the first,
# unnamed column.
# NOTE(review): confirm whether the consumer of this file expects that index column.
prov_pubs_df_complete.to_csv("publication_proveniences.csv", index=True)