In [1]:
import pandas as pd
import numpy as np
import json

# Run the following if you are on colab

In [3]:
# Mount Google Drive at /content/drive so the project files are reachable.
# `google.colab` is only importable inside a Colab runtime; skip this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Switch into the project folder on the mounted drive so the relative CSV/JSON
# paths used below resolve. The path is relative to the current directory.
%cd "drive/MyDrive/Summer 2022 GSoC/gsoc-su22-eda"

/content/drive/MyDrive/Summer 2022 GSoC/gsoc-su22-eda


In [None]:
!pip install sciwing

In [None]:
# Download the spaCy English model through the `en` shortcut.
# NOTE(review): presumably required by sciwing's NeuralParscit below — confirm.
!python -m spacy download en

In [None]:
# Imported here rather than in the first cell because sciwing is installed by the cells above.
from sciwing.models.neural_parscit import NeuralParscit 

# Build the parser once; the `infer_in_batch` helpers below call
# `neural_parscit.infer.infer_batch` on it.
neural_parscit = NeuralParscit()

# Here we read in the data downloaded in CSV form from ``cdli_db``.

In [2]:
# Publications are keyed by their database id; the three other tables keep
# the default positional index.
pubs = pd.read_csv("publications.csv", index_col = "id")
artifs_pubs, authors_pubs, abbrev = (
    pd.read_csv(csv_name)
    for csv_name in (
        "artifacts_publications.csv",
        "authors_publications.csv",
        "abbreviations.csv",
    )
)

  pubs = pd.read_csv("publications.csv", index_col = "id")
  artifs_pubs = pd.read_csv("artifacts_publications.csv")


# Step 1: Identify subsets of publications that may be curated.

Many designation fields have a form like "ATU 3, pl. 036, W 12139", where "ATU 3" is an Assyriological abbreviation followed by a number, and what comes
after the first comma is the exact reference.

ATU 3 is the designation and the pl. 036, W 12139 is the exact reference.

So first we want to pull out the real designation. The method here is to find the first occurrence of the pattern ``\w+ \d+``,
that is, a run of word characters, a single space, and a run of digits. The first match is stored in a new column
named "designation_first".

Then we pick all rows where there is a numeric character in "designation_first".

In [4]:
# First "<word> <number>" run in each designation, e.g. "ATU 3" from
# "ATU 3, pl. 036, W 12139". Missing designations and rows without a match become "".
des_first = pubs["designation"].fillna("").str.findall(r'\w+ \d+').str[0].fillna("")
pubs["designation_first"] = des_first

# Keep the rows where a designation was extracted (it then always contains a digit).
# The pattern is a raw string so that `\d` is not an invalid string escape, and the
# explicit copy lets later cells add columns to `mergeable` without writing to a
# slice of `pubs` (SettingWithCopyWarning).
mergeable = pubs[pubs["designation_first"].str.match(r".*\d.*")].copy()

Then we check whether these abbreviations exist in the abbreviations table. We split the "designation_first" column at the first space and take
the part before it. Then we compare that part against the known abbreviations and keep only the rows whose abbreviation appears in the
abbreviations table.

In [6]:
# The abbreviation is the part of "designation_first" before its first space ("ATU" from "ATU 3").
mergeable_abbrevs = mergeable["designation_first"].str.split(" ", n = 1, expand = True)[0].to_numpy()
# `assign` returns a new frame, so nothing is written into a slice of `pubs`
# (the original `.loc[:, ...] =` form raises SettingWithCopyWarning).
mergeable = mergeable.assign(designation_abbrev = mergeable_abbrevs)

# Strip a trailing " (aka ...)" note from the known abbreviations before comparing.
# Raw string: `\s` and `\(` are regex escapes, not string escapes.
all_abbrevs = abbrev["abbreviation"].str.replace(r"\s\(aka.*\)", "", regex = True).str.strip()
mergeable = mergeable[mergeable["designation_abbrev"].isin(all_abbrevs)]
mergeable.head()

  all_abbrevs = abbrev["abbreviation"].str.replace("\s\(aka.*\)", "").str.strip()


Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
468,CUSAS 35,Bartash2017,2017,,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU


We also remove entries whose "designation_first" value occurs only once: a single row has nothing to be merged with.

In [7]:
def filter_func(df):
    """Return True when the group ``df`` holds more than one row."""
    return len(df) > 1

# Keep only the designations shared by at least two rows.
groups_by_designation = mergeable.groupby("designation_first")
mergeable = groups_by_designation.filter(filter_func)
mergeable

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
468,CUSAS 35,Bartash2017,2017,,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000898,"RA 98, 148",1000898,,,,,,,,,...,,,,,,,820,1,RA 98,RA
1000901,"RA 97, 171",1000901,,,,,,,,,...,,,,,,,820,1,RA 97,RA
1013101,"RA 76, 011",1013101,,,,,,,,,...,,,,,,,820,1,RA 76,RA
1013102,"RA 76, 011",1013102,,,,,,,,,...,,,,,,,820,1,RA 76,RA


Here we do a final round of filtering where we only select those pubs that have an author attached.

In [8]:
# Keep only publications that have at least one row in authors_publications.
has_author = mergeable.index.isin(authors_pubs["publication_id"].unique())
# Take an explicit copy: Step 2 edits `mergeable` in place (`.loc[...] = ...`, `drop(inplace=True)`),
# which on a boolean-filtered slice raises pandas' SettingWithCopyWarning.
mergeable = mergeable[has_author].copy()

In [9]:
# Snapshot the merge candidates so the "Step 1 & 2 Alternative" section can reload them.
# NOTE(review): this snapshot is written before the `find_and_fix` corrections in Step 2,
# so a reload will not contain the corrected "designation_first" values — confirm this is intended.
mergeable.to_csv("before_merge.csv")

In [10]:
# Publications that are not merge candidates are left as they are.
is_merge_candidate = pubs.index.isin(mergeable.index)
unchanged = pubs[~is_merge_candidate]
unchanged

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,publisher,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Bulla,Abusch1981,1981,7.0,Winona Lake,,In Honor of Ernest R. Lacheman on His Seventy-...,,,,...,Eisenbrauns,,Notes on a Pair of Matching Texts: A Shepherd'...,1,,SCCNH,,820,1,
2,,AbuschSchwemer2011,,2.0,,,Corpus of Mesopotamian Anti-Witchcraft Rituals...,,,,...,Brill: Leiden,,,8,,Ancient Magic and Divination,,820,1,
3,,Alizadeh2008,,2.0,,,Chogha Mish II: The Development of a Prehistor...,,,,...,The Oriental Institute: Chicago,,,130,,OIP,,820,1,
4,,Allred2006,,15.0,,,Cooks and Kitchens: Centralized Food Productio...,,,,...,,Johns Hopkins University,,,,,,820,1,
5,Girsu Labor,Allred2008,2008,7.0,,,On the Third Dynasty of Ur: Studies in Honor o...,,,,...,The American Schools of Oriental Research,,Labor Assignments from the City of Girsu,1,,The Journal of Cuneiform Studies Supplemental ...,,820,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000901,"RA 97, 171",1000901,,,,,,,,,...,,,,,,,,820,1,RA 97
1013101,"RA 76, 011",1013101,,,,,,,,,...,,,,,,,,820,1,RA 76
1013102,"RA 76, 011",1013102,,,,,,,,,...,,,,,,,,820,1,RA 76
1113429,"Woestenburg, Els, AfO 44/45 (1997/1998) 354",1113429,,,,,,,,,...,,,,,,,,820,1,AfO 44


# Step 2: Try to merge some of them.

In [11]:
# This part was added after Adam's proofreading.
#
# It corrects rows whose "designation_first" was mis-identified by the regex in Step 1.
# For every entry in ``find_and_fix``, each row whose "designation" contains that entry
# gets its "designation_first" set to the entry. Entries are applied in list order,
# so a later entry overrides an earlier one on rows that contain both.
find_and_fix = ["AOS 32", "Arch. 79", "AAT", "BBR", "BTT", "BWL", "CHJ", "CIRPL",
    "Dreams", "EDATS", "EEN", "FAOS 19", "Fs Kraus", "Fs Leichty",
    "Fs Lenoble", "Fs Matous", "Fs Owen", "Fs Pettinato", "Fs Sachs",
    "Fs Sigrist", "Fs Steve", "GCCI", "ITT 1", "ITT 2", "ITT 5", "KAR",
    "KAV", "Kish", "LKU", "MAD 1", "MCT", "MDP 31", "MEE 03", "MSKH",
    "MSL 04", "MSL 10", "MSL 11", "MSL 13", "MSL 14", "MSL 17", "MSL 4",
    "MSL 5", "MSL 6", "MSL 7", "MSL 8/1", "MSL 8/2", "MSL 9", "MSL SS",
    "MVAG 8/5", "NATN", "NFT", "Nisaba 04", "Nisaba 05", "Nisaba 12",
    "NRV", "OIP 138", "OrSP 06", "Phoenix 23", "Phoenix Ancient Art",
    "Proverbs", "PRT", "RIMA 1", "RIMA 2", "RIMA 3", "RIMB 1", "RIMB 2",
    "RIME 1", "RIME 2", "RIME 3", "RIME 4", "RINAP 1", "RINAP 3",
    "RINAP 4", "RINAP 5", "RMA", "RSO 05", "RT 22", "SBH", "SET", "SLFN",
    "TCND", "TCNU", "TJA", "TJDB", "TMH 2-3", "TMN", "UCP 09-02", "UNT",
    "VS 01", "YNER 4"]

for e in find_and_fix:
    # regex=False: the entries are literal text. As a regex, the "." in "Arch. 79"
    # would match any character. na=False keeps the mask boolean if a designation is missing.
    # NOTE(review): this is still a substring test, so "ITT 1" also matches "ITT 10" — confirm.
    contains_entry = mergeable["designation"].str.contains(e, regex = False, na = False)
    mergeable.loc[contains_entry, "designation_first"] = e

In [14]:
# Build one [ids_to_replace, replace_id] pair per designation shared by several rows.
replacement_indices = []

# A single groupby pass replaces the per-value boolean scan of the whole frame.
# sort=False keeps the groups in order of first appearance, which is the order
# `unique()` produced. groupby also skips missing keys, which previously produced
# an empty subset and tripped the assertion.
for abbrev_name, subset in mergeable.groupby("designation_first", sort = False):

    # Nothing to merge when the designation occurs only once
    if len(subset) < 2:
        continue

    # Prefer a row that carries a book title or a title as the "representative";
    # if there is none, fall back to the first row of the group.
    has_title = subset["book_title"].notna() | subset["title"].notna()
    titled = subset[has_title]
    represent = titled.iloc[0:1] if len(titled) > 0 else subset.iloc[0:1]

    # The id every other publication of this group will be merged into
    replace_id = represent.index.item()

    # All remaining ids of the group, as plain Python ints (JSON-serialisable)
    ids_to_replace = subset.index[subset.index != replace_id].tolist()

    replacement_indices.append([ids_to_replace, replace_id])

# Here's the code that generates all indices to be merged

In [15]:
# Turn each group's ids into a plain list (needed for JSON and for list
# concatenation in `replace`), then persist the merge plan.
replacement_indices = [[list(ids), target] for ids, target in replacement_indices]

json_dump = [{"merge" : ids, "to" : target} for ids, target in replacement_indices]

with open("merge_metadata.json", "w") as f:
    json.dump(json_dump, f, indent = 4)

# Step 1 & 2 Alternative: Can load in previous work, and then read in the merge_metadata.json to get the ids_to_replace

In [24]:
# Reload the merge candidates written to "before_merge.csv" at the end of Step 1,
# keyed again by publication id.
mergeable = pd.read_csv("before_merge.csv", index_col = "id")

  mergeable = pd.read_csv("before_merge.csv", index_col = "id")


In [25]:
# Read back the merge plan: a list of {"merge": [...], "to": id} records.
with open("merge_metadata.json", "r") as metadata_file:
    merge_metadata = json.load(metadata_file)

In [26]:
# Restore the [ids_to_replace, replace_id] pairs from the stored records.
replacement_indices = [[entry["merge"], entry["to"]] for entry in merge_metadata]

# Development area:

In [22]:
# Scratch: take one merge group (position 10) to experiment with in the cells below.
ids_to_replace, replace_id = replacement_indices[10]

In [None]:
def infer_in_batch(data, batch_size = 16):
    """Label ``data`` with ``neural_parscit`` in chunks of ``batch_size``.

    Each chunk ``data[i: i + batch_size]`` is passed to
    ``neural_parscit.infer.infer_batch`` and the ``'seq_label'`` entries of the
    results are concatenated in order.

    Note: a later cell defines ``infer_in_batch`` again; the definition that
    ran last is the one in effect.
    """
    labels = []
    for start in range(0, len(data), batch_size):
        chunk = data[start: start + batch_size]
        labels.extend(neural_parscit.infer.infer_batch(chunk)['seq_label'])
    return labels

In [51]:
# Scratch version of the clean-up that `clean_entries` performs further down.
rows_to_replace = mergeable.loc[ids_to_replace]
curr_designation = pd.Series(rows_to_replace["designation"])
merged_designation = rows_to_replace["designation_first"].iloc[0]

# Remove the merged designation (literal text), then capitalised words and initials.
without_designation = curr_designation.str.replace(merged_designation, "", regex = False)
exact_reference = without_designation.str.replace(r'(?:[A-Z][a-z]+[,\s&]*|[A-Z]\.),?', "", regex = True)

# A four-digit number in parentheses is taken as the publication year.
first_year = exact_reference.str.findall(r'\((\d{4})\)').str[0]
year_info = first_year[first_year.notna()]

# Take the year out of the reference and trim leftover punctuation/whitespace.
exact_reference = exact_reference.str.replace(r'\((\d{4})\)', "", regex = True).str.strip('., \n\t')

mergeable.loc[year_info.index, "year"] = pd.Series(year_info)

In [42]:
# Scratch: inspect the years extracted for the selected group.
year_info

id
298732    1929
376032    1929
396790    1929
Name: designation, dtype: object

In [49]:
# Scratch: write the extracted years into the "year" column of the matching rows
# (the same statement also ends the earlier scratch cell).
mergeable.loc[year_info.index, "year"] = pd.Series(year_info)

# End of development area

In [None]:
def infer_in_batch(data):
    """Label a pandas Series of strings with ``neural_parscit``.

    Parameters
    ----------
    data : pandas.Series of str
        Texts to label.

    Returns
    -------
    list
        The ``'seq_label'`` entries returned by
        ``neural_parscit.infer.infer_batch``, concatenated in input order.

    Notes
    -----
    Batches of 32 are used unless the longest text exceeds 5000 characters,
    in which case texts are processed one at a time.
    """
    # Local import: `torch` is not imported anywhere else in the notebook, so the
    # original `torch.no_grad()` call raised NameError.
    import torch

    batch_size = 32 if data.str.len().max() <= 5000 else 1
    data = data.to_numpy()
    result = []
    # Inference only: run without gradient tracking.
    with torch.no_grad():
        for i in range(0, len(data), batch_size):
            result += neural_parscit.infer.infer_batch(data[i: i + batch_size])['seq_label']
    return result

In [None]:
def replace(ids_to_replace, replace_id):
    """Merge the publications ``ids_to_replace`` into ``replace_id`` (ML version).

    For each publication to replace, the text left in its designation after
    removing the shared designation is labelled by ``infer_in_batch``; tokens
    not labelled as bibliographic metadata become the ``exact_reference`` of
    its rows in ``artifs_pubs``. Those rows are then re-pointed to
    ``replace_id``, the replaced publications are dropped from ``mergeable``,
    and the surviving publication's designation is set to its
    ``designation_first``.

    Parameters
    ----------
    ids_to_replace : iterable of publication ids
        Ids that are not present in ``mergeable`` are ignored.
    replace_id : publication id
        Must be present in ``mergeable``.

    Raises
    ------
    KeyError
        If ``replace_id`` is not in ``mergeable``. Raised before anything is modified.

    Notes
    -----
    Modifies the module-level ``mergeable`` and ``artifs_pubs`` in place.
    A later cell defines another ``replace`` (the version without ML); the
    definition that ran last is the one in effect.
    """
    # Work only with ids that still exist: `.loc` raises KeyError on a list that
    # contains any missing label.
    present_ids = [pub_id for pub_id in ids_to_replace if pub_id in mergeable.index]
    if not present_ids:
        print("Id not found.")
        return

    if replace_id not in mergeable.index:
        raise KeyError(f"replace_id {replace_id} not found in mergeable")

    merged_designation = mergeable.loc[present_ids, "designation_first"].iloc[0]
    # regex=False: the designation is literal text (e.g. "Arch. 79" contains a ".").
    curr_designation = mergeable.loc[present_ids, "designation"] \
        .str.replace(merged_designation, "", regex = False).str.strip('., \n\t')

    # Only run the parser when at least one designation has text left over.
    if not all(curr_designation == ""):
        parsed_designation = infer_in_batch(curr_designation)

        for id_to_replace, des_entry, parsed in zip(present_ids, curr_designation, parsed_designation):
            parsed_split = parsed.replace("<PAD>", "").split()
            entry_split = des_entry.split()

            # Keep the tokens the parser did not label as bibliographic metadata.
            # NOTE(review): assumes the parser emits one label per whitespace-separated token — confirm.
            e_ref = ""
            for parsed_token, entry_token in zip(parsed_split, entry_split):
                if not parsed_token in ["author", "date", "booktitle", "editor",
                                        "institution", "publisher", "title"]:
                    e_ref += entry_token + " "

            artifs_pubs.loc[artifs_pubs["publication_id"].isin([id_to_replace]), "exact_reference"] = e_ref.strip()

    # Re-point the artifact links in every case. Previously the "nothing left to
    # parse" branch dropped the publications without doing this, which left the
    # links pointing at ids that no longer exist.
    artifs_pubs.loc[artifs_pubs["publication_id"].isin(present_ids), "publication_id"] = replace_id

    mergeable.drop(index = present_ids, inplace = True)
    mergeable.loc[replace_id, "designation"] = mergeable.loc[replace_id, "designation_first"]

In [27]:
# Old version, no ML involved
# This function cleans the exact_reference entry and records the publication year.

def clean_entries(indices_involved):
    """Derive year and exact reference from the designations of ``indices_involved``.

    Parameters
    ----------
    indices_involved : list of publication ids
        All ids must be present in ``mergeable``. The first row's
        ``designation_first`` is used as the shared designation.

    Side effects
    ------------
    * ``mergeable["year"]`` is set for rows whose designation contains a
      four-digit year in parentheses or square brackets.
    * Rows of ``artifs_pubs`` that belong to these publications and have no
      ``exact_reference`` yet receive the cleaned designation remainder.
      Existing references are left untouched.
    """
    curr_designation = pd.Series(mergeable.loc[indices_involved, "designation"])
    merged_designation = mergeable.loc[indices_involved, "designation_first"].iloc[0]

    # A four-digit year wrapped in (...) or [...]; the single capture group is the year.
    year_pattern = r'(?:\(|\[)(\d{4})(?:\)|\])'

    # Extract the year with the same pattern that is used to remove it below.
    # The extraction previously recognised only "(YYYY)", so a "[YYYY]" year was
    # stripped from the reference without being recorded.
    year_info = curr_designation.str.findall(year_pattern).str[0]
    year_info = year_info[year_info.notna()]

    # Remove what does not belong in the exact reference: the shared designation
    # (literal text), capitalised words and initials, and the year.
    exact_reference = curr_designation.str.replace(merged_designation, "", regex = False)\
        .str.replace(r'(?:[A-Z][a-z]+[,\s&]*|[A-Z]\.),?', "", regex = True) \
        .str.replace(year_pattern, "", regex = True).str.strip('., \n\t')

    # Fill in the extracted years
    mergeable.loc[year_info.index, "year"] = year_info.to_numpy()

    # Update the artifacts_publications table. Only rows that still lack a
    # reference are filled, so a publication with a mix of filled and empty
    # rows keeps the references it already has.
    for id_to_update in indices_involved:
        needs_reference = artifs_pubs["publication_id"].isin([id_to_update]) \
            & artifs_pubs["exact_reference"].isna()
        artifs_pubs.loc[needs_reference, "exact_reference"] = exact_reference.loc[id_to_update]
    

def replace(ids_to_replace, replace_id):
    """Merge the publications ``ids_to_replace`` into ``replace_id`` (version without ML).

    Cleans the exact references through ``clean_entries``, re-points the
    matching ``artifs_pubs`` rows to ``replace_id`` and drops the replaced
    publications from ``mergeable``.

    Parameters
    ----------
    ids_to_replace : iterable of publication ids
        Ids that are not present in ``mergeable`` are ignored.
    replace_id : publication id
        The publication that survives the merge.

    Notes
    -----
    Modifies the module-level ``mergeable`` and ``artifs_pubs`` in place.
    This definition shadows the ML-based ``replace`` defined in an earlier cell.
    """
    # Work only with ids that still exist. `any` alone let a partially present
    # group through, and `.loc` in `clean_entries` then raised KeyError on the
    # missing labels. Building a list also accepts an Index or array as input.
    present_ids = [pub_id for pub_id in ids_to_replace if pub_id in mergeable.index]
    if not present_ids:
        print("Id not found.")
        return

    indices_involved = present_ids + [replace_id]
    clean_entries(indices_involved)

    # Replace the ids in artifacts_publications here
    artifs_pubs.loc[artifs_pubs["publication_id"].isin(present_ids), "publication_id"] = replace_id

    # Update the mergeable table
    mergeable.drop(index = present_ids, inplace = True)

In [28]:
def store_progress(progress_idx):
    """Checkpoint the merge state so that it can be resumed by ``retrieve_progress``.

    Writes ``mergeable`` to "mergeable_wip.csv", ``artifs_pubs`` to
    "artifs_pubs_wip.csv" and ``progress_idx`` to "replace_idx.txt".
    """
    mergeable.to_csv("mergeable_wip.csv")
    # index=False: `artifs_pubs` has no meaningful index and is read back without
    # `index_col`, so a written index came back as an extra "Unnamed: 0" column
    # on every store/retrieve cycle.
    artifs_pubs.to_csv("artifs_pubs_wip.csv", index = False)
    with open("replace_idx.txt", "w") as f:
        f.write(str(progress_idx))

In [29]:
def retrieve_progress():
    """Restore the state written by ``store_progress`` and return the stored index.

    Rebinds the module-level ``mergeable`` and ``artifs_pubs`` from
    "mergeable_wip.csv" and "artifs_pubs_wip.csv", then reads the integer on
    the first line of "replace_idx.txt", prints it and returns it.
    """
    global mergeable, artifs_pubs
    mergeable = pd.read_csv("mergeable_wip.csv", index_col = "id")
    artifs_pubs = pd.read_csv("artifs_pubs_wip.csv")
    with open("replace_idx.txt", "r") as progress_file:
        stored_idx = int(progress_file.readline())
    print(f'Progress retrieved on index {stored_idx}')
    return stored_idx

In [30]:
# Start merging from the first group. Use this cell for a fresh run.
prgrs = 0

In [113]:
# Resume from the last checkpoint instead of starting at 0. This reads the *_wip.csv files
# and "replace_idx.txt", so it only works after `store_progress` has run at least once.
prgrs = retrieve_progress()

  if (await self.run_code(code, result,  async_=asy)):


Progress retrieved on index 624


  if (await self.run_code(code, result,  async_=asy)):


In [31]:
# Merge every remaining group, starting at position `prgrs`.
for ids_to_replace, replace_id in replacement_indices[prgrs:]:

    replace(ids_to_replace, replace_id)

    # Count the group as done before checkpointing, so that the stored index is
    # the next group to process. Previously the index of the group just merged
    # was stored, and a resumed run repeated that group.
    prgrs += 1
    if prgrs % 100 == 0:
        print(f"{prgrs} / {len(replacement_indices)}")
        store_progress(prgrs)

# Checkpoint the finished state as well; otherwise the merges done after the
# last multiple of 100 are never written to disk.
store_progress(prgrs)

0 / 1514
100 / 1514
200 / 1514
300 / 1514
400 / 1514
500 / 1514
600 / 1514
700 / 1514
800 / 1514
900 / 1514
1000 / 1514
1100 / 1514
1200 / 1514
1300 / 1514
1400 / 1514
1500 / 1514


In [46]:
# After merging, every remaining publication is named by its short designation.
mergeable["designation"] = mergeable["designation_first"]

# Here we apply some additional manual fixes not accounted for by the program

In [47]:
# Manual corrections; the cells below use its "special" and "additional_merge" entries.
with open("corrections.json", "r") as corrections_file:
    fixes = json.load(corrections_file)

In [48]:
# Remove the publications listed under "special" (ids that are absent are simply skipped).
is_special = mergeable.index.isin(fixes["special"])
mergeable.drop(mergeable.index[is_special], inplace = True)
mergeable

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
1131,MSVO 1,1991-1131,1991,,,,,,,,...,,,,,,,820,1,MSVO 1,MSVO
1138,ZA 072,1982-1138,1982,,,,,,,,...,,,,,,,820,1,ZA 072,ZA
1139,ATU 5,1994-1139,1994,,,,,,,,...,,,,,,,820,1,ATU 5,ATU
1393,ATU 6,2005-1393,2005,,,,,,,,...,,,,,,,820,1,ATU 6,ATU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240180,OrNS 37,1968-240180,1968,,,,,,,,...,,,,,,,820,1,OrNS 37,OrNS
240184,ZA 036,1925-240184,1925,,,,,,,,...,,,,,,,820,1,ZA 036,ZA
240191,JRAS 1921,1921-240191,1921,,,,,,,,...,,,,,,,820,1,JRAS 1921,JRAS
240193,JCS 73,2021-240193,2021,,,,,,,,...,,,,,,,820,1,JCS 73,JCS


In [49]:
# Apply the hand-written merges, one {"merge": [...], "to": id} record at a time.
for merge_dict in fixes['additional_merge']:
    replace(merge_dict['merge'], merge_dict['to'])

Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.
Id not found.


In [50]:
# Attach every artifact link to its merged publication; publications without
# links and links to other publications are dropped by the inner join.
merged_with_join = pd.merge(
    mergeable,
    artifs_pubs,
    how = 'inner',
    left_index = True,
    right_on = 'publication_id',
)

In [51]:
# Move the link columns to the front: publication_id, artifact_id, then
# exact_reference at position 5.
# The publication ids are kept in `publication_ids` rather than `pubs`: `pubs` is
# the publications table loaded at the top of the notebook, and rebinding it here
# broke any earlier cell that was re-run afterwards.
artifacts = merged_with_join["artifact_id"]
publication_ids = merged_with_join["publication_id"]
exact_ref = merged_with_join["exact_reference"]

merged_with_join = merged_with_join.drop(columns = ["artifact_id", "publication_id", "exact_reference"])
merged_with_join.insert(0, "artifact_id", artifacts)
merged_with_join.insert(0, "publication_id", publication_ids)
merged_with_join.insert(5, "exact_reference", exact_ref)


# Now we write the csv to the file

In [52]:
# Merged publications, with the publication id index written as the first column.
mergeable.to_csv("merged_pubs.csv")
# Publication/artifact join; its index is not written.
merged_with_join.to_csv("merged_pubs_with_join.csv", index = False)