In [56]:
import pandas as pd
import numpy as np

Here we read in data downloaded as CSV form from ``cdli_db``.

In [105]:
pubs = pd.read_csv("publications.csv", index_col = "id")
artifs_pubs = pd.read_csv("artifacts_publications.csv")
abbrev = pd.read_csv("abbreviations.csv")

  pubs = pd.read_csv("publications.csv", index_col = "id")
  artifs_pubs = pd.read_csv("artifacts_publications.csv")


# Step 1: Identify subsets of publications that may be curated.

A lot of fields have the form like "ATU 3, pl. 036, W 12139", where "ATU 3" is an assyriological abbreviation and what goes
after the comma is exact_reference.

We can pull out all the publications that has such "assyriological abbreviation".

So first we split the designation column by a comma and take what goes before the first comma. Name it "designation_first".
Then we pick all rows where there is a numeric character in "designation_first".

In [106]:
des_first = pubs["designation"].fillna("").str.split(",", n = 1, expand = True)[0]
pubs["designation_first"] = des_first
mergeable = pubs[pubs["designation_first"].str.match(".*\d.*")]


We assume these rows are formatted as "abbrev name + number", so extract the abbrev name.

Then we check if these names exist in the abbreviations table

Hopefully this can isolate some publications which may be merged.

In [107]:
mergeable_abbrevs = mergeable["designation_first"].str.split(" ", n = 1, expand = True)[0]
mergeable.loc[:, "designation_abbrev"] = mergeable_abbrevs

all_abbrevs = abbrev["abbreviation"].str.replace("\s\(aka.*\)", "").str.strip()
mergeable = mergeable[mergeable["designation_abbrev"].isin(all_abbrevs)]
mergeable.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergeable.loc[:, "designation_abbrev"] = mergeable_abbrevs
  all_abbrevs = abbrev["abbreviation"].str.replace("\s\(aka.*\)", "").str.strip()


Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
468,CUSAS 35,Bartash2017,2017,,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU


We can perform another round of filtering by removing designations with 1 entry. There's no point for merging them at this point 

In [108]:
def filter_func(df):
    return df.shape[0] > 1

mergeable = mergeable.groupby("designation_first").filter(filter_func)
mergeable.to_csv("before_merge.csv")
mergeable

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
468,CUSAS 35,Bartash2017,2017,,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,,...,,,,,,,820,1,ATU 3,ATU
483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,,...,,,,,,,820,1,ATU 3,ATU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000896,"RA 98, 148",1000896,,,,,,,,,...,,,,,,,820,1,RA 98,RA
1000898,"RA 98, 148",1000898,,,,,,,,,...,,,,,,,820,1,RA 98,RA
1000901,"RA 97, 171",1000901,,,,,,,,,...,,,,,,,820,1,RA 97,RA
1013101,"RA 76, 011",1013101,,,,,,,,,...,,,,,,,820,1,RA 76,RA


In [109]:
unchanged = pubs[~pubs.index.isin(mergeable.index)]
unchanged

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,publisher,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Bulla,Abusch1981,1981,7.0,Winona Lake,,In Honor of Ernest R. Lacheman on His Seventy-...,,,,...,Eisenbrauns,,Notes on a Pair of Matching Texts: A Shepherd'...,1,,SCCNH,,820,1,Bulla
2,,AbuschSchwemer2011,,2.0,,,Corpus of Mesopotamian Anti-Witchcraft Rituals...,,,,...,Brill: Leiden,,,8,,Ancient Magic and Divination,,820,1,
3,,Alizadeh2008,,2.0,,,Chogha Mish II: The Development of a Prehistor...,,,,...,The Oriental Institute: Chicago,,,130,,OIP,,820,1,
4,,Allred2006,,15.0,,,Cooks and Kitchens: Centralized Food Productio...,,,,...,,Johns Hopkins University,,,,,,820,1,
5,Girsu Labor,Allred2008,2008,7.0,,,On the Third Dynasty of Ur: Studies in Honor o...,,,,...,The American Schools of Oriental Research,,Labor Assignments from the City of Girsu,1,,The Journal of Cuneiform Studies Supplemental ...,,820,1,Girsu Labor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955152,"Kramer JAOS 060, 240",955152,,,,,,,,,...,,,,,,,,820,1,Kramer JAOS 060
990581,"Notizia, Palmiro, NABU 2018/001",990581,,,,,,,,,...,,,,,,,,820,1,Notizia
1000531,"RA 28, 124",1000531,,,,,,,,,...,,,,,,,,820,1,RA 28
1113429,"Woestenburg, Els, AfO 44/45 (1997/1998) 354",1113429,,,,,,,,,...,,,,,,,,820,1,Woestenburg


# Step 2: Try to merge some of them.

This one goes through all unique values in the ``mergeable["designation_first"]`` column. However most have nothing to replace since they are just null.

In [110]:
cant_replace = []
for abbrev_name in mergeable["designation_first"].unique():

    # Has same name
    has_same_name = mergeable["designation_first"] == abbrev_name

    # Find all entries with that abbreviation name
    subset = mergeable[has_same_name]

    # Split into entries to be replaced (to_replace), and what they will be changed to (replace_with)
    to_replace, replace_with = subset[subset["book_title"].isna() & subset["title"].isna()], subset[~subset["book_title"].isna() | ~subset["title"].isna()]

    if replace_with.shape[0] < 1:
        to_replace, replace_with = subset.iloc[1:], subset.iloc[0:]
    
    # Treat what goes after the first comma in designation as exact_referece
    # Deal with designation with and without commas separately
    exact_reference = pd.Series(to_replace["designation"])
    has_comma = exact_reference.str.contains(",")
    if any(has_comma):
        exact_reference[has_comma] = exact_reference[has_comma].str.split(",", 1, True)[1]
    exact_reference[~has_comma] = ""

    # Fetch the id of the publications to be replaced
    ids_to_replace = to_replace.index

    # Fetch the id these publications will be changed to
    replace_id = replace_with.index[0]

    # Update the artifacts_publications table
    if not any(artifs_pubs["publication_id"].isin(ids_to_replace)):
        print("No corresponding entries in artifacts_publications.")
    else:
        artifs_pubs.loc[artifs_pubs["publication_id"].isin(ids_to_replace), "exact_reference"] = exact_reference
        artifs_pubs.loc[artifs_pubs["publication_id"].isin(ids_to_replace), "publication_id"] = replace_id
        artifs_pubs.head()
    
    # Update the mergeable table
    mergeable.loc[has_same_name, "designation"] = abbrev_name
    mergeable.drop(mergeable.index[mergeable.index.isin(ids_to_replace)], inplace = True)
    print("Merged")

Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged
Merged

In [111]:
mergeable.drop(columns = ["designation_first", "designation_abbrev"]).to_csv("merged_pubs.csv")

In [84]:
unchanged

Unnamed: 0_level_0,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,edition,...,publisher,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Bulla,Abusch1981,1981,7.0,Winona Lake,,In Honor of Ernest R. Lacheman on His Seventy-...,,,,...,Eisenbrauns,,Notes on a Pair of Matching Texts: A Shepherd'...,1,,SCCNH,,820,1,Bulla
2,,AbuschSchwemer2011,,2.0,,,Corpus of Mesopotamian Anti-Witchcraft Rituals...,,,,...,Brill: Leiden,,,8,,Ancient Magic and Divination,,820,1,
3,,Alizadeh2008,,2.0,,,Chogha Mish II: The Development of a Prehistor...,,,,...,The Oriental Institute: Chicago,,,130,,OIP,,820,1,
4,,Allred2006,,15.0,,,Cooks and Kitchens: Centralized Food Productio...,,,,...,,Johns Hopkins University,,,,,,820,1,
5,Girsu Labor,Allred2008,2008,7.0,,,On the Third Dynasty of Ur: Studies in Honor o...,,,,...,The American Schools of Oriental Research,,Labor Assignments from the City of Girsu,1,,The Journal of Cuneiform Studies Supplemental ...,,820,1,Girsu Labor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000901,"RA 97, 171",1000901,,,,,,,,,...,,,,,,,,820,1,RA 97
1013101,"RA 76, 011",1013101,,,,,,,,,...,,,,,,,,820,1,RA 76
1013102,"RA 76, 011",1013102,,,,,,,,,...,,,,,,,,820,1,RA 76
1113429,"Woestenburg, Els, AfO 44/45 (1997/1998) 354",1113429,,,,,,,,,...,,,,,,,,820,1,Woestenburg
