In [11]:
import pandas as pd
import numpy as np
import json

Here we read in data downloaded as CSV form from ``cdli_db``.

In [124]:
pubs = pd.read_csv("publications.csv", index_col = "id")
artifs_pubs = pd.read_csv("artifacts_publications.csv")
abbrev = pd.read_csv("abbreviations.csv")

  pubs = pd.read_csv("publications.csv", index_col = "id")
  artifs_pubs = pd.read_csv("artifacts_publications.csv")


# Step 1: Identify subsets of publications that may be curated.

A lot of fields have the form like "ATU 3, pl. 036, W 12139", where "ATU 3" is an assyriological abbreviation and what goes
after the comma is exact_reference.

We can pull out all the publications that has such "assyriological abbreviation".

So first we split the designation column by a comma and take what goes before the first comma. Name it "designation_first".
Then we pick all rows where there is a numeric character in "designation_first".

In [125]:
des_first = pubs["designation"].fillna("").str.findall(r'\w+ \d+').str[0].fillna("")
# des_first = pubs["designation"].fillna("").str.split(",", n = 1, expand = True)[0]
pubs["designation_first"] = des_first
mergeable = pubs[pubs["designation_first"].str.match(".*\d.*")]


We assume these rows are formatted as "abbrev name + number", so extract the abbrev name.

Then we check if these names exist in the abbreviations table

Hopefully this can isolate some publications which may be merged.

In [126]:
mergeable_abbrevs = mergeable["designation_first"].str.split(" ", n = 1, expand = True)[0].to_numpy()
mergeable.loc[:, "designation_abbrev"] = mergeable_abbrevs

all_abbrevs = abbrev["abbreviation"].str.replace("\s\(aka.*\)", "").str.strip()
mergeable = mergeable[mergeable["designation_abbrev"].isin(all_abbrevs)]
mergeable.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergeable.loc[:, "designation_abbrev"] = mergeable_abbrevs
  all_abbrevs = abbrev["abbreviation"].str.replace("\s\(aka.*\)", "").str.strip()


Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
468,CUSAS 35,Bartash2017,2017,,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU


We can perform another round of filtering by removing designations with 1 entry. There's no point for merging them at this point 

In [127]:
def filter_func(df):
    return df.shape[0] > 1

mergeable = mergeable.groupby("designation_first").filter(filter_func)
mergeable.to_csv("before_merge.csv")
mergeable

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
468,CUSAS 35,Bartash2017,2017,,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000898,"RA 98, 148",1000898,,,,,,,,,...,,,,,,,820,1,RA 98,RA
1000901,"RA 97, 171",1000901,,,,,,,,,...,,,,,,,820,1,RA 97,RA
1013101,"RA 76, 011",1013101,,,,,,,,,...,,,,,,,820,1,RA 76,RA
1013102,"RA 76, 011",1013102,,,,,,,,,...,,,,,,,820,1,RA 76,RA


In [128]:
unchanged = pubs[~pubs.index.isin(mergeable.index)]
unchanged

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,publisher,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Bulla,Abusch1981,1981,7.0,Winona Lake,,In Honor of Ernest R. Lacheman on His Seventy-...,,,,...,Eisenbrauns,,Notes on a Pair of Matching Texts: A Shepherd'...,1,,SCCNH,,820,1,
2,,AbuschSchwemer2011,,2.0,,,Corpus of Mesopotamian Anti-Witchcraft Rituals...,,,,...,Brill: Leiden,,,8,,Ancient Magic and Divination,,820,1,
3,,Alizadeh2008,,2.0,,,Chogha Mish II: The Development of a Prehistor...,,,,...,The Oriental Institute: Chicago,,,130,,OIP,,820,1,
4,,Allred2006,,15.0,,,Cooks and Kitchens: Centralized Food Productio...,,,,...,,Johns Hopkins University,,,,,,820,1,
5,Girsu Labor,Allred2008,2008,7.0,,,On the Third Dynasty of Ur: Studies in Honor o...,,,,...,The American Schools of Oriental Research,,Labor Assignments from the City of Girsu,1,,The Journal of Cuneiform Studies Supplemental ...,,820,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952597,"Boiy 2003, 23",952597,,,,,,,,,...,,,,,,,,820,1,Boiy 2003
952599,"Boiy 2003, 31",952599,,,,,,,,,...,,,,,,,,820,1,Boiy 2003
952614,"Van der Spek, 1995, 214-215",952614,,,,,,,,,...,,,,,,,,820,1,
952661,"Krückmann, BRVU, 62",952661,,,,,,,,,...,,,,,,,,820,1,


# Step 2: Try to merge some of them.

This one goes through all unique values in the ``mergeable["designation_first"]`` column. However most have nothing to replace since they are just null.

In [130]:
# This part is added after Adam's proofreading

# Fixed some of the failures of mis-identifying designation_first
# The program will go through every element in ``find_and_fix``, for every element...
# The rows whose designation contains that element will have their designation_first field will be changed
find_and_fix = ["AOS 32", "Arch. 79", "AAT", "BBR", "BTT", "BWL", "CHJ", "CIRPL",
    "Dreams", "EDATS", "EEN", "FAOS 19", "Fs Kraus", "Fs Leichty",
    "Fs Lenoble", "Fs Matous", "Fs Owen", "Fs Pettinato", "Fs Sachs",
    "Fs Sigrist", "Fs Steve", "GCCI", "ITT 1", "ITT 2", "ITT 5", "KAR",
    "KAV", "Kish", "LKU", "MAD 1", "MCT", "MDP 31", "MEE 03", "MSKH",
    "MSL 04", "MSL 10", "MSL 11", "MSL 13", "MSL 14", "MSL 17", "MSL 4",
    "MSL 5", "MSL 6", "MSL 7", "MSL 8/1", "MSL 8/2", "MSL 9", "MSL SS",
    "MVAG 8/5", "NATN", "NFT", "Nisaba 04", "Nisaba 05", "Nisaba 12",
    "NRV", "OIP 138", "OrSP 06", "Phoenix 23", "Phoenix Ancient Art",
    "Proverbs", "PRT", "RIMA 1", "RIMA 2", "RIMA 3", "RIMB 1", "RIMB 2",
    "RIME 1", "RIME 2", "RIME 3", "RIME 4", "RINAP 1", "RINAP 3",
    "RINAP 4", "RINAP 5", "RMA", "RSO 05", "RT 22", "SBH", "SET", "SLFN",
    "TCND", "TCNU", "TJA", "TJDB", "TMH 2-3", "TMN", "UCP 09-02", "UNT",
    "VS 01", "YNER 4"]

for e in find_and_fix:
    mergeable.loc[mergeable["designation"].str.contains(e), "designation_first"] = e

In [134]:
replacement_indices = []
for abbrev_name in mergeable["designation_first"].unique():

    # Has same name
    has_same_name = mergeable["designation_first"] == abbrev_name

    # Find all entries with that abbreviation name
    subset = mergeable[has_same_name]

    # Skip if only 1
    if subset.shape[0] == 1:
        continue

    # Split into entries to be replaced (to_replace), and what they will be changed to (replace_with)
    # Prioritize those with a book title or a title as the "representative"
    represent = subset[~subset["book_title"].isna() | ~subset["title"].isna()]

    # If no "representative" exist, just choose the first one
    if represent.shape[0] < 1:
        represent = subset.iloc[0:1]
    represent = represent.iloc[0:1]
    assert represent.shape[0] > 0
    
    # Fetch the id these publications will be changed to
    replace_id = represent.index.item()

    # Fetch the id of the publications to be replaced
    ids_to_replace = subset.index[~subset.index.isin([replace_id])]

    # Append the indices to replace to the correct places
    replacement_indices.append([ids_to_replace, replace_id])

In [137]:
def replace(ids_to_replace, replace_id):

    if not any([(id in mergeable.index) for id in ids_to_replace]):
        print("Id not found.")
        return
        
    curr_designation = pd.Series(mergeable.loc[ids_to_replace, "designation"])
    merged_designation = mergeable.loc[ids_to_replace, "designation_first"].iloc[0]

    # Remove all merged designation from the current designation
    exact_reference = curr_designation.str.replace(merged_designation, "").str.strip('., \n\t')

    # Update the artifacts_publications table
    for id_to_replace in ids_to_replace:
        artifs_pubs.loc[artifs_pubs["publication_id"].isin([id_to_replace]), "exact_reference"] = exact_reference[id_to_replace]
        artifs_pubs.loc[artifs_pubs["publication_id"].isin([id_to_replace]), "publication_id"] = replace_id
    
    # Update the mergeable table
    mergeable.drop(mergeable.index[mergeable.index.isin(ids_to_replace)], inplace = True)
    mergeable.loc[replace_id, "designation"] = mergeable.loc[replace_id, "designation_first"]

In [138]:
for ids_to_replace, replace_id in replacement_indices:

    replace(ids_to_replace, replace_id)

  exact_reference = curr_designation.str.replace(merged_designation, "").str.strip('., \n\t')


# Here we apply some additional manual fixes not accounted for by the program

In [139]:
with open("corrections.json", "r") as f:
    fixes = json.load(f)

In [140]:
mergeable.drop(mergeable.index[mergeable.index.isin(fixes["special"])], inplace = True)
mergeable

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
468,CUSAS 35,Bartash2017,2017,,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
1131,MSVO 1,1991-1131,1991,,,,,,,,...,,,,,,,820,1,MSVO 1,MSVO
1138,ZA 072,1982-1138,1982,,,,,,,,...,,,,,,,820,1,ZA 072,ZA
1139,ATU 5,1994-1139,1994,,,,,,,,...,,,,,,,820,1,ATU 5,ATU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541325,JEOL 15,541325,,,,,,,,,...,,,,,,,820,1,JEOL 15,JEOL
557486,JNES 03,557486,,,,,,,,,...,,,,,,,820,1,JNES 03,JNES
685074,OLZ 22,685074,,,,,,,,,...,,,,,,,820,1,OLZ 22,OLZ
747961,Syria 52,747961,,,,,,,,,...,,,,,,,820,1,Syria 52,Syria


In [141]:
for merge_dict in fixes['additional_merge']:
    ids_to_replace, replace_id = merge_dict['merge'], merge_dict['to']
    replace(ids_to_replace, replace_id)

Id not found.
Id not found.
Id not found.
Id not found.
Id not found.


In [142]:
merged_with_join = mergeable.merge(artifs_pubs, how = 'inner', left_index = True, right_on = 'publication_id')

In [143]:
artifacts = merged_with_join["artifact_id"]
pubs = merged_with_join["publication_id"]
exact_ref = merged_with_join["exact_reference"]

merged_with_join = merged_with_join.drop(columns = ["artifact_id", "publication_id", "exact_reference"])
merged_with_join.insert(0, "artifact_id", artifacts)
merged_with_join.insert(0, "publication_id", pubs)
merged_with_join.insert(5, "exact_reference", exact_ref)


In [144]:
mergeable.to_csv("merged_pubs.csv")
merged_with_join.to_csv("merged_pubs_with_join.csv", index = False)

# Here's the code that generates the autocompleted merging

In [123]:
for e in replacement_indices:
    e[0] = list(e[0])

json_dump = []
for e in replacement_indices:
    json_dump.append({"merge" : e[0],"to" : e[1]})
    
with open("merge_metadata.json", "w") as f:
    f.write(json.dumps(json_dump, indent = 4))