In [31]:
import pandas as pd

In [32]:
overdrive_fh  = "./reports/BPL/od-bpl-all.csv"

## Overdrive bulk MarcExpress dataframe

In [33]:
odf_raw = pd.read_csv(overdrive_fh, header=None, names=["overdriveNo", "overdriveControlNo", "sierraFormat_ext"])

In [34]:
odf_raw.head()

Unnamed: 0,overdriveNo,overdriveControlNo,sierraFormat_ext
0,0E90D7A5-30B8-4D07-9D13-DF0E02EA631E,ODN0000082587,z
1,C449DE3C-A683-412A-8997-F0551A5DC576,ODN0000082663,z
2,765A1B6A-E978-4777-8DD1-9647D611A58B,ODN0000082589,z
3,0957DA4E-DD2F-469E-BDFF-B9B1A9422C21,ODN0000082590,z
4,E572EC2C-09F7-462D-8153-7C8A7493586F,ODN0000082591,z


In [35]:
# find if any missing IDs
odf_missing_ids = odf_raw[odf_raw["overdriveNo"].isnull()]

In [36]:
odf_missing_ids.shape

(0, 3)

In [37]:
odf_raw["sierraFormat_ext"].value_counts()

x    171343
z     43806
v      1985
Name: sierraFormat_ext, dtype: int64

## Sierra Overdrive bibs dataframe

In [38]:
sierra_fh = "./reports/BPL/sierra-bpl-all.csv"

In [39]:
sdf_raw = pd.read_csv(
    sierra_fh,
    header=None,
    names=["overdriveNo", "overdriveNoSrc", "bibNo", "sierraControlNo", "sierraControlNoSrc", "sierraFormat_int", "sierra_status"],
    low_memory=False
)

In [40]:
sdf_raw.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status
0,27C3C666-343B-442B-A81C-2068E6D1A664,37,b112402306,ocm54908102,OCoLC,x,-
1,C1AC2A72-58D4-4611-9F80-00CFE226CF2C,37,b112402318,ocm55680353,OCoLC,x,-
2,8CD53ED9-CEBD-4F78-8BEF-20A58F6F3857,37,b11240232x,ocm56833247,OCoLC,x,-
3,CAC65044-3B9C-4B12-B80B-00D9A09ABAB8,37,b112402331,ocm57443259,OCoLC,x,-
4,3D783A28-2DA9-4798-A3C2-1F65D2208F38,37,b112402343,ocm60397130,OCoLC,x,-


In [41]:
sdf_raw["sierraFormat_int"].value_counts()

x    161347
z     41264
v       935
s        45
Name: sierraFormat_int, dtype: int64

In [42]:
# sierra fomat "s"?
sdf_s_format = sdf_raw[sdf_raw["sierraFormat_int"] == "s"]

In [43]:
sdf_s_format.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status
713,15B32E52-357A-4D74-A43E-18C673B0B0FC,url,b112467854,ocm57507474,OCoLC,s,-
714,EBC278AA-D4EA-45FB-8B3B-DFDF2122ADA0,url,b112467866,ocm57507588,OCoLC,s,-
715,35AC0D13-E37C-47E0-8DC9-DEEFC69D10DF,url,b112467878,ocm57507915,OCoLC,s,-
717,7B5DF2A5-6C03-495A-B98B-4A6B49D7BF6A,url,b112467891,ocm57508902,OCoLC,s,-
718,679C31DB-8F49-4A59-B697-C0CE35CDD490,url,b112467908,ocm57509118,OCoLC,s,-


#### Sierra bibs missing proper Overdrive ID

In [69]:
sdf_no_overdriveNo = sdf_raw[(sdf_raw["overdriveNo"].isnull())]

In [71]:
sdf_no_overdriveNo.shape

(52, 7)

In [74]:
sdf_invalid_overdriveNo = sdf_raw[(sdf_raw["overdriveNo"].isnull())|(sdf_raw["overdriveNo"].str.match("(?!.*-.*-.*.*)"))]

In [75]:
sdf_invalid_overdriveNo.shape

(63, 7)

In [77]:
sdf_invalid_overdriveNo.to_csv("./reports/BPL/sierra-invalid-overdriveNo.csv", index=False)

## Inner join between Sierra and bulk MarcExpress - records already in the catalog

In [44]:
# this looks like legitimate bibs that are marked differently in Sierra than on Overload platform

In [45]:
df_available = pd.merge(odf_raw, sdf_raw, on="overdriveNo", how="inner")

In [46]:
df_available.head()

Unnamed: 0,overdriveNo,overdriveControlNo,sierraFormat_ext,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status
0,0E90D7A5-30B8-4D07-9D13-DF0E02EA631E,ODN0000082587,z,url,b113910861,ocm63791051,OCoLC,z,-
1,C449DE3C-A683-412A-8997-F0551A5DC576,ODN0000082663,z,037,b122634597,ocm63680586,,z,-
2,765A1B6A-E978-4777-8DD1-9647D611A58B,ODN0000082589,z,url,b113851571,ocm63791673,OCoLC,z,-
3,0957DA4E-DD2F-469E-BDFF-B9B1A9422C21,ODN0000082590,z,037,b113026158,ocm66900955,OCoLC,z,-
4,E572EC2C-09F7-462D-8153-7C8A7493586F,ODN0000082591,z,037,b11302616x,ocm66900982,OCoLC,z,-


In [47]:
df_available.shape

(189051, 9)

#### Suppressed/marked for deletion bibs in Sierra to be reinstated

In [48]:
# find records that are in the catalog, users have access to, but are suppressed
df_available["sierra_status"].value_counts()

-    188902
n       127
d        22
Name: sierra_status, dtype: int64

In [49]:
df_reinstate_access = df_available[(df_available["sierra_status"]=="n")|(df_available["sierra_status"]=="d")]

In [50]:
df_reinstate_access.shape

(149, 9)

In [51]:
df_reinstate_access.head()

Unnamed: 0,overdriveNo,overdriveControlNo,sierraFormat_ext,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status
1660,31C31E3A-EE4F-499F-98F8-5987FBD715D1,ODN0003566156,z,037,b121446918,on1042160321,,z,n
3519,4D21F8F0-100A-4C25-941D-B5D480091625,ODN0003566139,z,037,b121390846,on1033674500,,z,n
6696,1C0B9DC4-653A-49F7-9187-2176F6BA859D,ODN0000060795,z,url,b112467064,ocm57398773,OCoLC,z,n
7209,A2D8D0EA-38D5-434E-994C-036393218ACD,ODN0000155769,z,037,b118440718,ocn455436062,OCoLC,z,d
8826,6F245F1F-05C7-4C5C-84C1-A57DE8E92A38,ODN0000063301,z,037,b117856587,ocm59755950,OCoLC,z,n


In [52]:
df_reinstate_access.to_csv("./reports/BPL/reinstate_access-alldata.csv", index=False)

In [53]:
df_reinstate_access["bibNo"].to_csv("./reports/BPL/reinstate_access-bibNos", index=False)

## Missing from Sierra bibs / Sierra and MarcExpress right anti join

In [56]:
df_right_join = pd.merge(sdf_raw, odf_raw, how="outer", on="overdriveNo", left_index=True, indicator=True)

In [57]:
df_right_join.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
54022.0,27C3C666-343B-442B-A81C-2068E6D1A664,37,b112402306,ocm54908102,OCoLC,x,-,ODN0000052705,x,both
108202.0,C1AC2A72-58D4-4611-9F80-00CFE226CF2C,37,b112402318,ocm55680353,OCoLC,x,-,ODN0000055232,x,both
176453.0,8CD53ED9-CEBD-4F78-8BEF-20A58F6F3857,37,b11240232x,ocm56833247,OCoLC,x,-,ODN0000060618,x,both
102534.0,CAC65044-3B9C-4B12-B80B-00D9A09ABAB8,37,b112402331,ocm57443259,OCoLC,x,-,ODN0000062533,x,both
48693.0,3D783A28-2DA9-4798-A3C2-1F65D2208F38,37,b112402343,ocm60397130,OCoLC,x,-,ODN0000069862,x,both


In [58]:
df_missing = df_right_join[df_right_join["_merge"] == "right_only"]

In [59]:
df_missing.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
25.0,D4B34CBB-80AC-4B53-9CE1-7019929155BA,,,,,,,ODN0000637972,z,right_only
30.0,35B73FE2-F9D9-463D-A41E-1EBF6C85F834,,,,,,,ODN0000213530,z,right_only
33.0,A4324F25-7F12-4488-8378-BD26C1B2CD34,,,,,,,ODN0000644843,z,right_only
37.0,E236C55B-D2F5-4E7F-B832-1B6E58205DA0,,,,,,,ODN0000637978,z,right_only
63.0,722370E6-1A28-444E-B58F-A21D5AD05C8B,,,,,,,ODN0000644847,z,right_only


In [60]:
df_missing.shape

(30549, 10)

In [63]:
df_missing["sierraFormat_ext"].value_counts()

x    26446
z     3017
v     1086
Name: sierraFormat_ext, dtype: int64

In [64]:
df_missing.to_csv("./reports/BPL/missing.csv", index=False)

In [65]:
df_missing["overdriveNo"].to_csv("./reports/BPL/missing-overdriveNos.csv", index=False)

## Expired e-content in Sierra / no access / left anti join

In [78]:
# do not include bibs without proper overdrive #

In [81]:
sdf_with_overdriveNo = sdf_raw[(sdf_raw["overdriveNo"].notnull())|(sdf_raw["overdriveNo"].str.match("(.*-.*-.*-.*)"))]

In [82]:
sdf_with_overdriveNo.shape

(203539, 7)

In [83]:
sdf_with_overdriveNo.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status
0,27C3C666-343B-442B-A81C-2068E6D1A664,37,b112402306,ocm54908102,OCoLC,x,-
1,C1AC2A72-58D4-4611-9F80-00CFE226CF2C,37,b112402318,ocm55680353,OCoLC,x,-
2,8CD53ED9-CEBD-4F78-8BEF-20A58F6F3857,37,b11240232x,ocm56833247,OCoLC,x,-
3,CAC65044-3B9C-4B12-B80B-00D9A09ABAB8,37,b112402331,ocm57443259,OCoLC,x,-
4,3D783A28-2DA9-4798-A3C2-1F65D2208F38,37,b112402343,ocm60397130,OCoLC,x,-


In [86]:
df_left_join = pd.merge(sdf_with_overdriveNo, odf_raw, how="outer", on="overdriveNo", indicator=True)

In [87]:
df_left_join.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
0,27C3C666-343B-442B-A81C-2068E6D1A664,37,b112402306,ocm54908102,OCoLC,x,-,ODN0000052705,x,both
1,C1AC2A72-58D4-4611-9F80-00CFE226CF2C,37,b112402318,ocm55680353,OCoLC,x,-,ODN0000055232,x,both
2,8CD53ED9-CEBD-4F78-8BEF-20A58F6F3857,37,b11240232x,ocm56833247,OCoLC,x,-,ODN0000060618,x,both
3,CAC65044-3B9C-4B12-B80B-00D9A09ABAB8,37,b112402331,ocm57443259,OCoLC,x,-,ODN0000062533,x,both
4,3D783A28-2DA9-4798-A3C2-1F65D2208F38,37,b112402343,ocm60397130,OCoLC,x,-,ODN0000069862,x,both


In [88]:
df_expired = df_left_join[df_left_join["_merge"] == "left_only"]

In [89]:
df_expired.head()

Unnamed: 0,overdriveNo,overdriveNoSrc,bibNo,sierraControlNo,sierraControlNoSrc,sierraFormat_int,sierra_status,overdriveControlNo,sierraFormat_ext,_merge
29,07F268B6-EF5B-4953-BBB7-773FEE0C8176,url,b112411277,ocm57506094,OCoLC,z,-,,,left_only
33,AC7F7BAB-FB41-4C1B-8342-852CA2D05309,037,b112411307,ocm57675482,OCoLC,z,-,,,left_only
37,72023498-8893-464A-B55A-F7CBF0917F22,url,b112411332,ocm57687593,OCoLC,z,-,,,left_only
39,D80763AA-CA4F-42BD-85DA-EB0E65552533,url,b112411356,ocm57696112,OCoLC,z,-,,,left_only
42,1A5B49B9-AFF3-4832-B0DD-82D3235971A8,url,b11241137x,ocm57696438,OCoLC,z,-,,,left_only


In [90]:
df_expired.shape

(14488, 10)

In [91]:
df_expired.to_csv("./reports/BPL/expired-alldata.csv", index=False)

In [None]:
df_expired[["ovedriveNo", "bibNo"]].to_csv("./reports/BPL/expired-")