In [1]:
import pandas as pd

In [2]:
from datetime import datetime, timedelta

## Ingest export from ServiceNow

In [3]:
src_file = input()

 src_data/u_bkop_incident_230603.csv


In [5]:
df = pd.read_csv(
    src_file,
    parse_dates=["created", "resolved"],
    low_memory=False
)

In [6]:
df.shape[0]

121479

In [10]:
assert df.columns.tolist() == [
    "number", "created", "caller", "state", "category", "subcategory",
    "assignment_group", "bpl_location", "nyp_location", "resolved",
    "assigned_to", "system", "mat_source", "close_code", "url",
    "reassignment_count", "updates", "resolved_by"]

In [11]:
df["category"].unique()

array(['Cataloging', 'Acquisitions', 'Selection', 'Logistics',
       'Offsite - ReCAP', 'Collections Processing', nan], dtype=object)

### Narrow down to Cataloging tickets: orginated as CAT or assigned to CAT

In [13]:
cdf = df[(df["category"] == "Cataloging")|(df["assignment_group"] == "BKOPS CAT")]

In [14]:
cdf.shape[0]

21250

## CAT tickets volumne by year

In [15]:
gyear_cdf = cdf.groupby(cdf["created"].map(lambda x: x.year))

In [16]:
ycdf = pd.DataFrame(columns=["year", "tickets"])

In [17]:
for year, d in gyear_cdf:
    new_row = pd.Series({"year": year, "tickets": d.shape[0]})
    ycdf = pd.concat([ycdf, new_row.to_frame().T], ignore_index=True)

In [18]:
ycdf

Unnamed: 0,year,tickets
0,2013,1311
1,2014,2140
2,2015,2739
3,2016,2554
4,2017,2233
5,2018,2112
6,2019,1785
7,2020,955
8,2021,1964
9,2022,2169


In [19]:
ycdf.to_csv("public_data/cat-by-year.csv", index=False)

### Expected volume in the most recent year

In [None]:
## assume the last row in cdf shows cutoff date for the analysis

In [20]:
last_day = cdf.iloc[-1]["created"]

In [38]:
days_to_date = (last_day - datetime(last_day.year, 1, 1)).days  # warning, there must be enough to date data for such prediction to make sense

In [39]:
days_to_go = (datetime(last_day.year, 12, 31) - last_day).days

In [40]:
# calculate expected volume proportionally

In [49]:
tickets_to_date = ycdf.iloc[-1]["tickets"]

In [50]:
expected_current_year_volume = round((days_to_go * tickets_to_date) / days_to_date) + tickets_to_date

In [52]:
print(f"Expected volume in {last_day.year}: {expected_current_year_volume}")

expected volume in 2023: 3056


### Volume breakdown by system

In [53]:
ycdf_rl = pd.DataFrame(columns=["system", "year", "tickets"])
ycdf_bl = pd.DataFrame(columns=["system", "year", "tickets"])
ycdf_bp = pd.DataFrame(columns=["system", "year", "tickets"])

In [54]:
for year, d in gyear_cdf:
    rl_df = d[d["system"] == "NYPL Research"]
    bl_df = d[d["system"] == "NYPL Circulating"]
    bp_df = d[d["system"] == "BPL Circulating"]
    new_rl = pd.Series({"system": "NYPL RL", "year": year, "tickets" : rl_df.shape[0]})
    new_bl = pd.Series({"system": "NYPL BL", "year": year, "tickets": bl_df.shape[0]})
    new_bp = pd.Series({"system": "BPL", "year": year, "tickets": bp_df.shape[0]})
    ycdf_rl = pd.concat([ycdf_rl, new_rl.to_frame().T], ignore_index=True)
    ycdf_bl = pd.concat([ycdf_bl, new_bl.to_frame().T], ignore_index=True)
    ycdf_bp = pd.concat([ycdf_bp, new_bp.to_frame().T], ignore_index=True)

In [55]:
ycdf_rl.to_csv("public_data/rl-by-year.csv", index=False)
ycdf_bl.to_csv("public_data/bl-by-year.csv", index=False)
ycdf_bp.to_csv("public_data/bp-by-year.csv", index=False)

#### Expected volume by system

In [60]:
rl_tickets_to_date = ycdf_rl.iloc[-1]["tickets"]

In [61]:
bl_tickets_to_date = ycdf_bl.iloc[-1]["tickets"]

In [62]:
bp_tickets_to_date = ycdf_bp.iloc[-1]["tickets"]

In [63]:
expected_current_year_volume_rl = round((days_to_go * rl_tickets_to_date) / days_to_date) + rl_tickets_to_date

In [65]:
expected_current_year_volume_bl = round((days_to_go * bl_tickets_to_date) / days_to_date) + bl_tickets_to_date

In [66]:
expected_current_year_volume_bp = round((days_to_go * bp_tickets_to_date) / days_to_date) + bp_tickets_to_date

In [70]:
print(f"Expected NYPL RL volume in {last_day.year}: {expected_current_year_volume_rl}")
print(f"Expected NYPL BL volume in {last_day.year}: {expected_current_year_volume_bl}")
print(f"Expected NYPL BPL volume in {last_day.year}: {expected_current_year_volume_bp}")

Expected NYPL RL volume in 2023: 235
Expected NYPL BL volume in 2023: 1599
Expected NYPL BPL volume in 2023: 1222


## Last year most frequent requesting locations

In [73]:
# treat three systems separately

In [74]:
start_date_str = input()  # enter format: YYYY-MM-DD

 2023-06-04


In [75]:
start_date = datetime.strptime(start_date_str, "%Y-%m-%d") - timedelta(365)
print(start_date)

2022-06-04 00:00:00


In [76]:
loc_columns = ["location", "tickets"]

#### NYPL Research

In [77]:
loc_rl = cdf[(cdf["system"] == "NYPL Research") & (cdf["created"] >= start_date)]

In [78]:
loc_rl.shape[0]

251

In [79]:
gloc_rl = pd.DataFrame(columns=loc_columns)

In [80]:
for loc, d in loc_rl.groupby("nyp_location"):
    new_row = pd.Series({"location": loc, "tickets": d.shape[0]})
    gloc_rl = pd.concat([gloc_rl, new_row.to_frame().T], ignore_index=True)
    print(loc, d.shape[0])

Library Services Center 8
Library for the Performing Arts 13
Schomburg Center for Research in Black Culture 35
Stavros Niarchos Foundation Library 5
Stephen A. Schwarzman Building 190


In [81]:
gloc_rl = gloc_rl.sort_values("tickets", ascending=False)

In [82]:
gloc_rl.to_csv("public_data/freq-loc-rl.csv", index=False)

#### NYPL Circ

In [83]:
loc_bl = cdf[(cdf["system"] == "NYPL Circulating") & (cdf["created"] >= start_date)]

In [84]:
loc_bl.shape[0]

1342

In [85]:
gloc_bl = pd.DataFrame(columns=loc_columns)

In [86]:
for loc, d in loc_bl.groupby("nyp_location"):
    new_row = pd.Series({"location": loc, "tickets": d.shape[0]})
    gloc_bl = pd.concat([gloc_bl, new_row.to_frame().T], ignore_index=True)

In [87]:
gloc_bl = gloc_bl.sort_values("tickets", ascending=False)

In [88]:
gloc_bl.head()

Unnamed: 0,location,tickets
14,Bronx Library Center,140
66,Stavros Niarchos Foundation Library,113
41,Library for the Performing Arts,93
67,Stephen A. Schwarzman Building,69
59,Seward Park Library,61


In [97]:
gloc_bl.tail()

Unnamed: 0,location,tickets
12,"Bookmobile, Manhattan",1
34,Huguenot Park Library,1
18,Clason's Point Library,1
22,Eastchester Library,1
0,270MAD,1


In [89]:
gloc_bl.to_csv("public_data/freq-loc-bl.csv", index=False)

#### BPL

In [90]:
loc_bp = cdf[(cdf["system"] == "BPL Circulating") & (cdf["created"] >= start_date)]

In [91]:
loc_bp.shape[0]

1035

In [92]:
gloc_bp = pd.DataFrame(columns=loc_columns)

In [93]:
for loc, d in loc_bp.groupby("bpl_location"):
    new_row = pd.Series({"location": loc, "tickets": d.shape[0]})
    gloc_bp = pd.concat([gloc_bp, new_row.to_frame().T], ignore_index=True)

In [94]:
gloc_bp = gloc_bp.sort_values("tickets", ascending=False)

In [95]:
gloc_bp.head()

Unnamed: 0,location,tickets
17,Central Literature & Languages - 14,149
6,Brooklyn Heights - 50,128
20,Central SST - 16,96
62,Windsor Terrace - 77,55
15,Central HBR (Hist/Biog/Rel) - 13,54


In [98]:
gloc_bp.tail()

Unnamed: 0,location,tickets
13,Central - Mailroom,1
57,Sunset Park - 55,1
12,Center for Brooklyn History - 91,1
22,Child's Place - 39C,1
3,BiblioBus - 64,1


In [99]:
gloc_bp.to_csv("public_data/freq-loc-bp.csv", index=False)

#### Consider for the end-of-the-year analysis to include inactive locations (0-5 tickets per year)

## Ticket Categories

In [100]:
# consider last 365 days only!

In [101]:
columns = ["category", "tickets"]

In [102]:
cat_rl = pd.DataFrame(columns=columns)

In [104]:
for cat, d in loc_rl.groupby("subcategory"):
    new_row = pd.Series({"category": cat, "tickets": d.shape[0]})
    cat_rl = pd.concat([cat_rl, new_row.to_frame().T], ignore_index=True)

In [105]:
cat_rl.to_csv("public_data/cat-pie-rl.csv", index=False)

In [106]:
cat_bl = pd.DataFrame(columns=columns)

In [107]:
for cat, d in loc_bl.groupby("subcategory"):
    new_row = pd.Series({"category": cat, "tickets": d.shape[0]})
    cat_bl = pd.concat([cat_bl, new_row.to_frame().T], ignore_index=True)

In [108]:
cat_bl.to_csv("public_data/cat-pie-bl.csv", index=False)

In [109]:
cat_bp = pd.DataFrame(columns=columns)

In [110]:
for cat, d in loc_bp.groupby("subcategory"):
    new_row = pd.Series({"category": cat, "tickets": d.shape[0]})
    cat_bp = pd.concat([cat_bp, new_row.to_frame().T], ignore_index=True)

In [111]:
cat_bp.to_csv("public_data/cat-pie-bp.csv", index=False)

### Average agent workload

In [112]:
# must split between systems
# RL - 5, BL - 6, BP - 6

In [113]:
ave_rl_tickets_per_agent = (loc_rl.shape[0] / 52.1) / 5

In [116]:
print(f"Avg. NYPL RL agent workload: {ave_rl_tickets_per_agent} in the last 365 days.")

Avg. NYPL RL agent workload: 0.963531669865643 in the last 365 days.


In [117]:
ave_bl_tickets_per_agent_per_week = (loc_bl.shape[0] / 52.1) / 6

In [118]:
print(f"Avg. NYPL BL agent workload: {ave_bl_tickets_per_agent_per_week} in the last 365 days.")

Avg. NYPL BL agent workload: 4.293026231605886 in the last 365 days.


In [119]:
ave_bp_tickets_per_agent_per_week = (loc_bp.shape[0] / 52.1) / 6

In [121]:
print(f"Avg. BPL agent workload: {ave_bp_tickets_per_agent_per_week} in the last 365 days.")

Avg. BPL agent workload: 3.310940499040307 in the last 365 days.


In [122]:
ave_circ_tickets_per_agent_per_week = ((loc_bl.shape[0] + loc_bp.shape[0]) / 52.1) / 12

In [124]:
print(f"Avg. NYPL BL + BPL agent workload: {ave_circ_tickets_per_agent_per_week} in the last 365 days.")

Avg. NYPL BL + BPL agent workload: 3.8019833653230966 in the last 365 days.


In [125]:
# before and after Jan 19, 2023

In [126]:
point_date = datetime(2023, 1, 19)
start_date = datetime(2023, 1, 19) - timedelta(365)

In [127]:
print(start_date, point_date)

2022-01-19 00:00:00 2023-01-19 00:00:00


In [128]:
before_df = cdf[(cdf["system"] != "NYPL Research") & (cdf["resolved"] >= start_date) & (cdf["resolved"] < point_date)]

In [129]:
before_df.shape[0]

1859

In [130]:
after_df = cdf[(cdf["system"] != "NYPL Research") & (cdf["resolved"] >= point_date)]

In [131]:
after_df.shape[0]

1825

In [132]:
agents_before_weekly = pd.DataFrame(columns=["agent", "tickets"]).iloc[0:0]
agents_before = pd.DataFrame(columns=["agent", "tickets"]).iloc[0:0]

In [133]:
for agent, d in before_df.groupby("resolved_by"):
    new_row_weekly = pd.Series({"agent": agent, "tickets": d.shape[0] / 52.1})
    new_row = pd.Series({"agent": agent, "tickets": d.shape[0]})
    agents_before_weekly = pd.concat([agents_before_weekly, new_row_weekly.to_frame().T], ignore_index=True)
    agents_before = pd.concat([agents_before, new_row.to_frame().T], ignore_index=True)

In [134]:
agents_before = agents_before.sort_values("tickets", ascending=False)
agents_before_weekly = agents_before_weekly.sort_values("tickets", ascending=False)

In [135]:
agents_before.to_csv("private_data/agents-circ-before.csv", index=False)
agents_before_weekly.to_csv("private_data/agents-circ-before-weekly.csv", index=False)

In [136]:
agents_after = pd.DataFrame(columns=["agent", "tickets"]).iloc[0:0]
agents_after_weekly = pd.DataFrame(columns=["agent", "tickets"]).iloc[0:0]

In [137]:
for agent, d in after_df.groupby("resolved_by"):
    new_row_weekly = pd.Series({"agent": agent, "tickets": d.shape[0] / 52.1})
    new_row = pd.Series({"agent": agent, "tickets": d.shape[0]})
    agents_after = pd.concat([agents_after, new_row.to_frame().T], ignore_index=True)
    agents_after_weekly = pd.concat([agents_after_weekly, new_row_weekly.to_frame().T], ignore_index=True)

In [138]:
agents_after = agents_after.sort_values("tickets", ascending=False)
agents_after_weekly = agents_after_weekly.sort_values("tickets", ascending=False)

In [139]:
agents_after.to_csv("private_data/agents-circ-after.csv", index=False)

In [140]:
agents_after_weekly.to_csv("private_data/agents-circ-after-weekly.csv", index=False)

### Circ agents response times before and after (equal periods)

In [141]:
relevant_days = (datetime(2023, 6, 4) - datetime(2023, 1, 1)).days

In [142]:
reorg_date = datetime(2023, 1, 19)

In [143]:
before_df = cdf[(cdf["system"] != "NYPL Research") & (cdf["created"] < datetime(2023, 1, 19)) & (cdf["created"] >= reorg_date - timedelta(relevant_days)) & (cdf["resolved"].notnull())]

In [144]:
before_df.shape[0]

827

In [145]:
before_df["resolved"].unique()

<DatetimeArray>
['2022-08-18 12:05:05', '2022-08-18 12:48:18', '2022-08-19 07:35:52',
 '2022-08-18 15:10:34', '2022-08-18 16:19:13', '2022-08-19 10:30:15',
 '2022-08-19 10:27:41', '2022-08-19 14:28:02', '2022-08-22 14:15:29',
 '2022-08-22 13:39:02',
 ...
 '2023-01-19 09:47:53', '2023-01-23 09:17:10', '2023-01-20 16:08:51',
 '2023-01-17 15:59:04', '2023-01-26 14:12:56', '2023-01-18 15:24:40',
 '2023-01-23 10:28:14', '2023-01-31 08:54:31', '2023-01-19 09:46:00',
 '2023-01-27 12:18:09']
Length: 826, dtype: datetime64[ns]

In [146]:
after_df = cdf[(cdf["system"] != "NYPL Research") & (cdf["created"] >= datetime(2023, 1, 19)) & (cdf["resolved"].notnull())]

In [147]:
after_df.shape[0]

1045

In [148]:
after_df["resolved"].unique()

<DatetimeArray>
['2023-01-19 16:52:31', '2023-01-19 13:34:33', '2023-01-23 15:35:38',
 '2023-01-23 15:41:01', '2023-01-20 11:18:22', '2023-01-20 11:04:47',
 '2023-01-20 11:17:03', '2023-01-20 10:32:29', '2023-01-20 10:18:42',
 '2023-01-23 09:00:14',
 ...
 '2023-06-01 10:10:36', '2023-06-01 09:25:47', '2023-06-01 11:52:41',
 '2023-06-02 12:18:13', '2023-06-01 15:22:57', '2023-06-02 09:56:36',
 '2023-06-02 09:58:57', '2023-06-02 10:32:44', '2023-06-02 14:52:23',
 '2023-06-02 12:54:16']
Length: 1045, dtype: datetime64[ns]

In [149]:
before_df["time2resolve"] = before_df["resolved"] - before_df["created"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  before_df["time2resolve"] = before_df["resolved"] - before_df["created"]


In [150]:
before_df["time2resolve"].describe()

count                           827
mean     33 days 06:57:33.574365175
std      50 days 13:27:01.626564466
min                 0 days 00:03:03
25%          0 days 16:19:07.500000
50%                 9 days 19:45:40
75%         44 days 09:22:31.500000
max               265 days 06:34:50
Name: time2resolve, dtype: object

In [152]:
before_df.to_csv("public_data/time2resolve-circ-before-all-points.csv", columns=["number", "time2resolve"], index=False)

In [153]:
after_df["time2resolve"] = after_df["resolved"] - after_df["created"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after_df["time2resolve"] = after_df["resolved"] - after_df["created"]


In [154]:
after_df["time2resolve"].describe()

count                         1045
mean     2 days 21:53:17.162679425
std      9 days 06:42:34.775753653
min                0 days 00:01:32
25%                0 days 02:55:04
50%                0 days 20:23:50
75%                2 days 03:20:06
max              124 days 07:48:06
Name: time2resolve, dtype: object

In [156]:
after_df.to_csv("public_data/time2resolve-circ-after-all-points.csv", columns=["number", "time2resolve"], index=False)