In [1]:
import pandas as pd
import numpy as np

Here we read in data downloaded as CSV form from ``cdli_db``.

In [22]:
pubs = pd.read_csv("publications.csv")
artifs_pubs = pd.read_csv("artifacts_publications.csv")
abbrev = pd.read_csv("abbreviations.csv")

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


# Step 1: Identify subsets of publications that may be curated.

A lot of fields have the form like "ATU 3, pl. 036, W 12139", where "ATU 3" is an assyriological abbreviation and what goes
after the comma is exact_reference.

We can pull out all the publications that has such "assyriological abbreviation".

So first we split the designation column by a comma and take what goes before the first comma. Name it "designation_first".
Then we pick all rows where there is a numeric character in "designation_first".

In [66]:
des_first = pubs["designation"].fillna("").str.split(",", n = 1, expand = True)[0]
pubs["designation_first"] = des_first
mergeable = pubs[pubs["designation_first"].str.match(".*\d.*")]


We assume these rows are formatted as "abbrev name + number", so extract the abbrev name.

Then we check if these names exist in the abbreviations table

Hopefully this can isolate some publications which may be merged.

In [67]:
mergeable_abbrevs = mergeable["designation_first"].str.split(" ", n = 1, expand = True)[0]
mergeable.loc[:, "designation_abbrev"] = mergeable_abbrevs

all_abbrevs = abbrev["abbreviation"].str.replace("\s\(aka.*\)", "").str.strip()
mergeable = mergeable[mergeable["designation_abbrev"].isin(all_abbrevs)]
mergeable.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,id,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
467,468,CUSAS 35,Bartash2017,2017,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
468,472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,...,,,,,,,820,1,ATU 3,ATU
469,483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
470,487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
471,488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU


We can perform another round of filtering by removing designations with 1 entry. There's no point for merging them at this point 

In [68]:
def filter_func(df):
    return df.shape[0] > 1

mergeable = mergeable.groupby("designation_first").filter(filter_func)
mergeable

Unnamed: 0,id,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
467,468,CUSAS 35,Bartash2017,2017,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
468,472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,...,,,,,,,820,1,ATU 3,ATU
469,483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
470,487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
471,488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418179,1000896,"RA 98, 148",1000896,,,,,,,,...,,,,,,,820,1,RA 98,RA
418180,1000898,"RA 98, 148",1000898,,,,,,,,...,,,,,,,820,1,RA 98,RA
418181,1000901,"RA 97, 171",1000901,,,,,,,,...,,,,,,,820,1,RA 97,RA
418182,1013101,"RA 76, 011",1013101,,,,,,,,...,,,,,,,820,1,RA 76,RA


# Step 2: Try to merge some of them.

Here we try those with "ATU 3"

In [72]:
# Find all entries with that abbreviation name
abbrev_name = "CUSAS 35"
subset = mergeable[mergeable["designation_first"] == abbrev_name]

# Split into entries to be replaced (to_replace), and what they will be changed to (replace_with)
to_replace, replace_with = subset[subset["book_title"].isna() & subset["title"].isna()], subset[~subset["book_title"].isna() | ~subset["title"].isna()]
replace_with.head()

Unnamed: 0,id,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
467,468,CUSAS 35,Bartash2017,2017,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS


In [73]:
# Treat what goes after the first comma in designation as exact_referece
exact_reference = to_replace["designation"].str.split(",", 1, True)[1]
exact_reference.head()

75942     471
75943     472
75945     477
81944     474
81955     469
Name: 1, dtype: object

In [74]:
# Fetch the id of the publications to be replaced
ids_to_replace = to_replace["id"]
ids_to_replace.head()

# Fetch the id these publications will be changed to
replace_id = replace_with["id"].iloc[0]

In [78]:
ids_to_replace

75942      77374
75943      77375
75945      77391
81944      83445
81955      83456
           ...  
202653    221149
202655    221151
202656    221152
203609    222134
203610    222135
Name: id, Length: 521, dtype: int64

In [76]:
# Update the artifacts_publications table
if not any(artifs_pubs["publication_id"].isin(ids_to_replace)):
    print("No corresponding entries in artifacts_publications.")
else:
    artifs_pubs.loc[artifs_pubs["publication_id"].isin(ids_to_replace), "exact_reference"] = exact_reference.to_numpy()
    artifs_pubs.loc[artifs_pubs["publication_id"].isin(ids_to_replace), "publication_id"] = replace_id
    artifs_pubs.head()

No corresponding entries in artifacts_publications.


In [77]:
# Update the mergeable table
mergeable.drop(mergeable.index[mergeable["id"].isin(ids_to_replace)])

Unnamed: 0,id,designation,bibtexkey,year,entry_type_id,address,annote,book_title,chapter,crossref,...,school,title,volume,publication_history,series,oclc,accepted_by,accepted,designation_first,designation_abbrev
467,468,CUSAS 35,Bartash2017,2017,,,,,,,...,,Sumerian Administrative and Legal Documents ca...,,,,,820,1,CUSAS 35,CUSAS
468,472,ATU 3,Englund1993,1993,,,,Die lexikalischen Listen der Archaischen Texte...,,,...,,,,,,,820,1,ATU 3,ATU
469,483,"ATU 3, pl. 036, W 12139",1993-483,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
470,487,"ATU 3, pl. 080, W 13948",1993-487,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
471,488,"ATU 3, pl. 082, W 13982",1993-488,1993,,,,,,,...,,,,,,,820,1,ATU 3,ATU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418179,1000896,"RA 98, 148",1000896,,,,,,,,...,,,,,,,820,1,RA 98,RA
418180,1000898,"RA 98, 148",1000898,,,,,,,,...,,,,,,,820,1,RA 98,RA
418181,1000901,"RA 97, 171",1000901,,,,,,,,...,,,,,,,820,1,RA 97,RA
418182,1013101,"RA 76, 011",1013101,,,,,,,,...,,,,,,,820,1,RA 76,RA
