In [1]:
import pandas as pd

In [2]:
# Relative path of the Overdrive CSV export that is read into `odf` below.
overdrive_fh = "./reports/NYPL/od-nyp-all.csv"

## Overdrive bulk MarcExpress dataframe

In [3]:
# The export is read as header-less (header=None), so the three column
# names are supplied explicitly.
odf = pd.read_csv(
    overdrive_fh,
    header=None,
    names=[
        "overdriveNo",
        "overdriveControlNo",
        "sierraFormat_ext",
    ],
)

In [4]:
# (rows, columns) of the loaded Overdrive frame.
odf.shape

(246474, 3)

In [5]:
# Preview the first rows to check the assigned column names against the data.
odf.head()

Unnamed: 0,overdriveNo,overdriveControlNo,sierraFormat_ext
0,C2E25112-2EC8-4088-9D18-160AD882917F,ODN0000149034,3
1,D42B7FB1-F40F-4D43-A77E-35A63B028A19,ODN0000149045,3
2,2AA05401-0C51-4E57-BDB5-81007BF87C97,ODN0000149054,3
3,5D74233B-39AE-4D57-A5DE-D07C1FAD0647,ODN0000149055,3
4,FADF579F-E299-4780-8BDD-CA9E1142743D,ODN0000149056,3


In [6]:
# Collect the rows whose overdriveNo is null, i.e. records lacking an ID.
odf_missing_ids = odf.loc[odf["overdriveNo"].isna()]

In [7]:
# Zero rows here means no record in `odf` has a null overdriveNo.
odf_missing_ids.shape

(0, 3)

In [8]:
# Frequency of each sierraFormat_ext value in the Overdrive frame.
odf["sierraFormat_ext"].value_counts()

z    192561
n     51953
3      1960
Name: sierraFormat_ext, dtype: int64

## Sierra Overdrive bibs dataframe

In [9]:
# Relative path of the Sierra CSV export that is read into `sdf` below.
sierra_fh = "./reports/NYPL/sierra-nyp-all.csv"

In [10]:
# Header-less export: the seven column names are supplied explicitly.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single inferred dtype.
sdf = pd.read_csv(
    sierra_fh,
    header=None,
    names=[
        "overdriveNo",
        "overdriveNoSrc",
        "bibNo",
        "sierraControlNo",
        "sierraControlNoSrc",
        "sierraFormat_int",
        "sierra_status",
    ],
    low_memory=False,
)

In [11]:
# (rows, columns) of the loaded Sierra frame.
sdf.shape

(229930, 7)

In [12]:
# Preview the first rows to check the assigned column names against the data.
sdf.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status
0,5E7A6766-4D4A-4564-86F2-481DE9473CD0,37,b170902584,61227205,OCoLC,z,-
1,5E5DDBB5-7855-4F28-B893-7F6611E8CEE1,37,b17094783x,69187499,OCoLC,z,-
2,5F813F22-EA16-4ED2-BEB9-B1825812B29A,37,b17101876x,52514399,OCoLC,z,-
3,9912837E-54CA-49B7-AC94-4C7E6B63056C,37,b171018825,52515141,OCoLC,z,-
4,57471296-F6D1-4A55-BF96-40095428284F,37,b171018850,52515457,OCoLC,z,-


In [13]:
# Frequency of each overdriveNoSrc value; a single value means every ID came from the same source.
sdf["overdriveNoSrc"].value_counts()

37    229930
Name: overdriveNoSrc, dtype: int64

In [14]:
# Frequency of each sierraFormat_int value in the Sierra frame.
sdf["sierraFormat_int"].value_counts()

z    182514
n     47250
3       166
Name: sierraFormat_int, dtype: int64

### Sierra bibs missing proper Overdrive ID

In [15]:
# Sierra rows where overdriveNo is either null or an empty string.
sdf_no_overdriveNo = sdf.loc[
    sdf["overdriveNo"].isna() | sdf["overdriveNo"].eq("")
]

In [16]:
# Zero rows here means every Sierra row carries a non-empty overdriveNo.
sdf_no_overdriveNo.shape

(0, 7)

In [17]:
# Sierra rows whose overdriveNo is null or does not have the expected
# hyphen-separated shape. The shape test is the same ".*-.*-.*-.*" pattern
# this notebook uses when it later selects valid IDs; the previous negative
# lookahead was missing a hyphen and therefore only flagged values with fewer
# than two hyphens.
# na=False makes str.match return False (instead of NaN) for null values so
# the boolean mask is well defined before it is negated.
sdf_invalid_overdriveNo = sdf[
    sdf["overdriveNo"].isnull()
    | ~sdf["overdriveNo"].str.match("(.*-.*-.*-.*)", na=False)
]

In [18]:
# Zero rows here means the invalid-ID filter above flagged no Sierra row.
sdf_invalid_overdriveNo.shape

(0, 7)

## Inner join between Sierra and bulk MarcExpress - records already in the catalog

In [19]:
# Inner join on overdriveNo: keeps only IDs present in both the Sierra
# frame and the Overdrive frame.
df_available = sdf.merge(
    odf,
    how="inner",
    on="overdriveNo",
)

In [20]:
# Preview the joined rows: Sierra columns first, then the Overdrive columns.
df_available.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext
0,5E7A6766-4D4A-4564-86F2-481DE9473CD0,37,b170902584,61227205,OCoLC,z,-,ODN0000074679,z
1,5E5DDBB5-7855-4F28-B893-7F6611E8CEE1,37,b17094783x,69187499,OCoLC,z,-,ODN0000094989,z
2,5F813F22-EA16-4ED2-BEB9-B1825812B29A,37,b17101876x,52514399,OCoLC,z,-,ODN0000015898,z
3,9912837E-54CA-49B7-AC94-4C7E6B63056C,37,b171018825,52515141,OCoLC,z,-,ODN0000006985,z
4,57471296-F6D1-4A55-BF96-40095428284F,37,b171018850,52515457,OCoLC,z,-,ODN0000000555,z


In [21]:
# (rows, columns) of the inner join, before any de-duplication.
df_available.shape

(214147, 9)

In [22]:
# but the results above include duplicate bibs (multiple overdriveNos on one bib!); need to dedup to get the correct count

In [23]:
# Keep the first row seen for each bibNo and drop later repeats.
# .copy() detaches the result from df_available.
df_found_deduped_on_bibNo = df_available.loc[
    ~df_available.duplicated(subset="bibNo", keep="first")
].copy()

In [24]:
# Row count here is the number of distinct bibNo values in the inner join.
df_found_deduped_on_bibNo.shape

(213899, 9)

In [25]:
# Keep the first row seen for each overdriveNo and drop later repeats.
# .copy() detaches the result from df_available.
df_found_deduped_on_overdriveNo = df_available.loc[
    ~df_available.duplicated(subset=["overdriveNo"], keep="first")
].copy()

In [26]:
# Row count here is the number of distinct overdriveNo values in the inner join.
df_found_deduped_on_overdriveNo.shape

(212389, 9)

In [27]:
# why the discrepancy in numbers?
# the only logical explanation is that we have duplicates in Sierra!

In [29]:
# Export the bibNo-deduplicated matches; index=False leaves the DataFrame index out of the file.
df_found_deduped_on_bibNo.to_csv("./reports/NYPL/found-in-catalog.csv", index=False)

In [30]:
# Export the overdriveNo-deduplicated matches; index=False leaves the DataFrame index out of the file.
df_found_deduped_on_overdriveNo.to_csv("./reports/NYPL/found-in-catalog-unique-overdriveNos.csv", index=False)

### Suppressed/marked for deletion bibs in Sierra to be reinstated

In [31]:
# Tally sierra_status across the matched bibs to find records that are in the
# catalog and that users have access to, but that are suppressed.
df_found_deduped_on_bibNo["sierra_status"].value_counts()

-    116684
a     95981
d      1229
n         3
h         1
t         1
Name: sierra_status, dtype: int64

In [32]:
# Matched bibs whose sierra_status is "n" or "d" (per the section heading,
# the suppressed / marked-for-deletion bibs to be reinstated).
df_reinstate_access = df_found_deduped_on_bibNo.loc[
    df_found_deduped_on_bibNo["sierra_status"].isin(["n", "d"])
]

In [33]:
# Export every column of the bibs selected for reinstatement.
df_reinstate_access.to_csv("./reports/NYPL/reinstate_access-alldata.csv", index=False)

In [35]:
# Export only the bibNo column for the same rows.
# NOTE(review): this path has no file extension, unlike the other exports
# written by this notebook — confirm that is intentional.
df_reinstate_access["bibNo"].to_csv("./reports/NYPL/reinstate_access-bibNos", index=False)

In [36]:
# access should be verified because the backdated file is not accurate - found multiple instances where no actual access is given or the resource
# has been removed from the Overdrive platform


## Missing from Sierra bibs / Sierra and MarcExpress right anti join

In [37]:
# Full outer join on overdriveNo. indicator=True adds a "_merge" column that
# records whether each row came from the left frame, the right frame, or both.
mdf_join = sdf.merge(
    odf,
    on="overdriveNo",
    how="outer",
    indicator=True,
)

In [38]:
# Preview the outer join, including the "_merge" indicator column.
mdf_join.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
0,5E7A6766-4D4A-4564-86F2-481DE9473CD0,37.0,b170902584,61227205,OCoLC,z,-,ODN0000074679,z,both
1,5E5DDBB5-7855-4F28-B893-7F6611E8CEE1,37.0,b17094783x,69187499,OCoLC,z,-,ODN0000094989,z,both
2,5F813F22-EA16-4ED2-BEB9-B1825812B29A,37.0,b17101876x,52514399,OCoLC,z,-,ODN0000015898,z,both
3,9912837E-54CA-49B7-AC94-4C7E6B63056C,37.0,b171018825,52515141,OCoLC,z,-,ODN0000006985,z,both
4,57471296-F6D1-4A55-BF96-40095428284F,37.0,b171018850,52515457,OCoLC,z,-,ODN0000000555,z,both


In [39]:
# (rows, columns) of the outer join: matched rows plus the unmatched rows from each side.
mdf_join.shape

(264015, 10)

In [40]:
# Right-only rows: overdriveNos present in the Overdrive frame but absent
# from the Sierra frame.
df_missing = mdf_join.loc[mdf_join["_merge"].eq("right_only")]

In [41]:
# Number of Overdrive rows with no matching overdriveNo in Sierra.
df_missing.shape

(34085, 10)

In [42]:
# Preview the right-only rows; the Sierra-side columns are empty for these.
df_missing.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
229930,C2E25112-2EC8-4088-9D18-160AD882917F,,,,,,,ODN0000149034,3,right_only
229931,D42B7FB1-F40F-4D43-A77E-35A63B028A19,,,,,,,ODN0000149045,3,right_only
229932,2AA05401-0C51-4E57-BDB5-81007BF87C97,,,,,,,ODN0000149054,3,right_only
229933,5D74233B-39AE-4D57-A5DE-D07C1FAD0647,,,,,,,ODN0000149055,3,right_only
229934,FADF579F-E299-4780-8BDD-CA9E1142743D,,,,,,,ODN0000149056,3,right_only


In [43]:
# Frequency of each sierraFormat_ext value among the rows missing from Sierra.
df_missing["sierraFormat_ext"].value_counts()

z    27072
n     5219
3     1794
Name: sierraFormat_ext, dtype: int64

In [44]:
# Export every column of the right-only rows (the "_merge" indicator column is included).
df_missing.to_csv("./reports/NYPL/missing.csv", index=False)

In [45]:
# Export only the overdriveNo column of the right-only rows.
df_missing["overdriveNo"].to_csv("./reports/NYPL/missing-overdriveNos.csv", index=False)

## overdriveNo not found in bulk MarcExpress records / left anti join

In [46]:
# Do not include bibs without a proper Overdrive ID: a row is kept only when
# its overdriveNo is non-null AND has the expected hyphen-separated shape.
# The two conditions were previously combined with "|", which let every
# non-null value through regardless of its shape.
# na=False makes str.match return False (instead of NaN) for null values so
# the combined mask is strictly boolean.
vsdf = sdf[
    sdf["overdriveNo"].notnull()
    & sdf["overdriveNo"].str.match("(.*-.*-.*-.*)", na=False)
]

In [47]:
# Compare with sdf.shape: an equal row count means the filter removed nothing.
vsdf.shape

(229930, 7)

In [48]:
# Preview the Sierra rows that passed the overdriveNo filter.
vsdf.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status
0,5E7A6766-4D4A-4564-86F2-481DE9473CD0,37,b170902584,61227205,OCoLC,z,-
1,5E5DDBB5-7855-4F28-B893-7F6611E8CEE1,37,b17094783x,69187499,OCoLC,z,-
2,5F813F22-EA16-4ED2-BEB9-B1825812B29A,37,b17101876x,52514399,OCoLC,z,-
3,9912837E-54CA-49B7-AC94-4C7E6B63056C,37,b171018825,52515141,OCoLC,z,-
4,57471296-F6D1-4A55-BF96-40095428284F,37,b171018850,52515457,OCoLC,z,-


In [49]:
# Full outer join of the filtered Sierra rows with the Overdrive frame;
# the "_merge" indicator column records which side each row came from.
edf_join = vsdf.merge(
    odf,
    on="overdriveNo",
    how="outer",
    indicator=True,
)

In [50]:
# Preview the outer join, including the "_merge" indicator column.
edf_join.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
0,5E7A6766-4D4A-4564-86F2-481DE9473CD0,37.0,b170902584,61227205,OCoLC,z,-,ODN0000074679,z,both
1,5E5DDBB5-7855-4F28-B893-7F6611E8CEE1,37.0,b17094783x,69187499,OCoLC,z,-,ODN0000094989,z,both
2,5F813F22-EA16-4ED2-BEB9-B1825812B29A,37.0,b17101876x,52514399,OCoLC,z,-,ODN0000015898,z,both
3,9912837E-54CA-49B7-AC94-4C7E6B63056C,37.0,b171018825,52515141,OCoLC,z,-,ODN0000006985,z,both
4,57471296-F6D1-4A55-BF96-40095428284F,37.0,b171018850,52515457,OCoLC,z,-,ODN0000000555,z,both


In [51]:
# Left-only rows: overdriveNos present in Sierra but absent from the
# Overdrive frame.
edf = edf_join.loc[edf_join["_merge"].eq("left_only")]

In [52]:
# Preview the left-only rows; the Overdrive-side columns are empty for these.
edf.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
480,9998F48A-D6AA-424F-8581-ECC0B5BA104F,37.0,b171032809,62381638,OCoLC,z,-,,,left_only
2147,2BB18901-645B-499E-A69E-A5854AAA94EB,37.0,b171847155,55955025,OCoLC,z,-,,,left_only
2179,8F63F9F7-AFD7-4398-AD69-22CEC5FD484D,37.0,b171848408,56019384,OCoLC,z,-,,,left_only
2633,1AC4120B-AF06-419F-BA9F-788D50CBBBD4,37.0,b172049295,123031686,OCoLC,z,-,,,left_only
3064,C077EBEE-FC30-4C03-B8E9-500FCCC805F7,37.0,b172142209,56906562,OCoLC,z,-,,,left_only


In [53]:
# Number of Sierra rows whose overdriveNo has no match in the Overdrive frame.
edf.shape

(15783, 10)

In [54]:
# but these rows include bibs with multiple overdriveNos, so the set needs to be compared to what was found to be overlapping (inner join); overdriveNos are irrelevant here
# let's dedup this set on bibNo

In [55]:
# Keep the first left-only row seen for each bibNo and drop later repeats.
edf_deduped = edf.loc[~edf.duplicated(subset="bibNo", keep="first")]

In [56]:
# Row count here is the number of distinct bibNo values among the left-only rows.
edf_deduped.shape

(15455, 10)

In [57]:
# let's see if any of these show up in the inner join; should be 0 overlap

In [58]:
# Overlap check between the bibs matched in the inner join and the bibs
# flagged as left-only.
# The join key is bibNo, not overdriveNo: every overdriveNo in edf_deduped is
# "left_only" (absent from the Overdrive frame) while every overdriveNo in
# df_found_deduped_on_bibNo came from the inner join (present in it), so a
# join on overdriveNo is empty by construction and cannot detect anything.
# A bib that carries several overdriveNos can appear in both sets; any such
# bib still has a matching ID and should be reviewed before it is treated as
# expired. Columns other than bibNo that exist on both sides get pandas'
# default "_x" / "_y" suffixes.
vedf = pd.merge(df_found_deduped_on_bibNo, edf_deduped, on="bibNo", how="inner")

In [59]:
# Zero rows here means the matched set and the left-only set share nothing on the join key.
vedf.shape

(0, 18)

In [60]:
# Yay! but ...
# this must be verified again against the Overdrive platform to confirm the library has no access and the bibs can be deleted

In [61]:
# Export every column of the deduplicated left-only rows.
# NOTE(review): the filename spells "verfication" while the ids-only export
# in the next cell spells "verification" — confirm before renaming, since
# downstream steps may rely on this exact path.
edf_deduped.to_csv("./reports/NYPL/expired-before-verfication.csv", index=False)

In [62]:
# Export only the two identifier columns of the deduplicated left-only rows.
edf_deduped.loc[:, ["overdriveNo", "bibNo"]].to_csv(
    "./reports/NYPL/expired-before-verification-idsonly.csv",
    index=False,
)