In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_patron_df = pd.read_excel('Data\\Raw_Data\\Patron_Checkouts.xlsx')
raw_inventory_df = pd.read_csv('Data\\Raw_Data\\Collection_List.csv', encoding = 'unicode_escape')

In [3]:
raw_patron_df.shape

(90878, 7)

In [4]:
raw_inventory_df.shape

(25871, 5)

In [5]:
# Select and remove all columns that have data in the last 2 columns
df1 = raw_patron_df[raw_patron_df['Unnamed: 5'].notnull()]
df2 = raw_patron_df[raw_patron_df['Unnamed: 6'].notnull()]

patron_df = raw_patron_df.drop(pd.concat([df1, df2]).index)

In [6]:
patron_df.shape

(90833, 7)

In [7]:
# Drop the last 2 columns
patron_df.drop(['Unnamed: 5', 'Unnamed: 6'], axis = 1, inplace = True)

In [8]:
# Rename the columns
patron_df.rename(columns = {'Patron Barcode': 'Patron_ID', 'Author\'s Name': 'Author', 'Holdings Barcode': 'Item_ID', 'Date of Action': 'Date'}, inplace = True)
patron_df.shape

(90833, 5)

In [9]:
patron_df = patron_df.dropna()
patron_df.shape

(76969, 5)

In [10]:
patron_df['Is_ILL'] = patron_df['Title'].str.contains('ILL -', na = False)

In [11]:
patron_df['Is_ILL'].value_counts()

False    75481
True      1488
Name: Is_ILL, dtype: int64

In [12]:
patron_df[['1', '2', '3', '4', '5', '6']] = patron_df['Author'].str.split(';', expand = True)
patron_df.drop(['2', '3', '4', '5', '6'], axis = 1, inplace = True)

In [13]:
patron_df[['Author_Last', 'Author_First', '1', '2', '3']] = patron_df['1'].str.split(',', expand = True)
patron_df.drop(['Author', '1', '2', '3'], axis = 1,inplace = True)

In [14]:
patron_df = patron_df.replace('&', 'and')

In [15]:
cols = ['Title', 'Author_Last', 'Author_First']
for col in cols:
    patron_df[col] = patron_df[col].str.replace('[^\w\s]','', regex = True)
    patron_df[col] = patron_df[col].str.lower()

In [16]:
patron_df.sample(10)

Unnamed: 0,Patron_ID,Title,Item_ID,Date,Is_ILL,Author_Last,Author_First
40450,13390,the berenstain bears are superbears,125182,09/21/2021 05:57:36PM,False,berenstain,mike
56397,13728,dog man mothering heights,125976,03/03/2022 01:57:12PM,False,pilkey,dav
80801,17679,celebrate your feeling the positive mindset p...,133662,06/14/2022 12:05:15PM,False,rivers,lauren
6576,11896,crocodile tears,100989,12/20/2021 03:24:12PM,False,horowitz,anthony
49649,13548,ice cream and dinosaurs,108977,04/26/2022 03:48:58PM,False,litwin,eric
48939,13540,you go first,130436,01/29/2022 12:09:41PM,False,kelly,erin entrada
61924,14006,say hello to the snowy animals,107532,07/01/2022 01:47:09PM,False,whybrow,ian
73642,17219,clifford the firehouse dog,125227,04/01/2022 12:30:56PM,False,bridwell,norman
8552,12413,little miss spider a christmas wish,134526,11/01/2021 05:04:18PM,False,kirk,david
74220,17231,ill super earth encyclopedia,90000626,08/26/2021 11:15:09AM,True,woodward,john


In [17]:
patron_df.shape

(76969, 7)

In [18]:
inventory_df = raw_inventory_df.drop(['Media_Type', 'Identity', 'Count'], axis = 1)
inventory_df.shape

(25871, 2)

In [19]:
inventory_df.dropna(inplace = True)

In [20]:
inventory_df[['1', '2', '3', '4', '5', '6']] = inventory_df['Author'].str.split(';', expand = True)
inventory_df.drop(['2', '3', '4', '5', '6'], axis = 1, inplace = True)

In [21]:
inventory_df[['Author_Last', 'Author_First', '1', '2', '3']] = inventory_df['1'].str.split(',', expand = True)
inventory_df.drop(['Author','1', '2', '3'], axis = 1,inplace = True)

In [22]:
inventory_df = inventory_df.replace('&', 'and')

In [23]:
cols = ['Title', 'Author_Last', 'Author_First']
for col in cols:
    inventory_df[col] = inventory_df[col].str.replace('[^\w\s]','', regex = True)
    inventory_df[col] = inventory_df[col].str.lower()

In [24]:
inventory_df.dropna(inplace = True)
inventory_df.shape

(24062, 3)

In [25]:
inventory_df.drop_duplicates(inplace = True)
inventory_df.shape

(23744, 3)

In [26]:
inventory_df.to_csv('C:/Users/Ben/Desktop/Clean_Inventory.csv', index = False)
patron_df.to_csv('C:/Users/Ben/Desktop/Clean_Patrons.csv', index = False)

In [37]:
# How many titles are there that correspond to at least two different books? (So you're looking for totally different books that just happen to have the same title.)
inventory_df.shape[0] - inventory_df['Title'].value_counts().shape[0]

620

In [40]:
# What proportion of titles have no author?
(90833 - 76969) / 90833

0.15263175277707441

In [71]:
90833 - 76969

13864

In [67]:
# What proportion of patrons have done ILL?
ILL_Checkouts = patron_df[['Patron_ID', 'Is_ILL']].value_counts()
ILL_Checkouts.loc[:,True]

Patron_ID
17999    123
12346     80
13647     67
13227     54
17785     44
        ... 
13863      1
16954      1
13592      1
12849      1
13025      1
Length: 146, dtype: int64

In [68]:
patron_df['Patron_ID'].nunique()

1023

In [69]:
146 / 1023

0.14271749755620725

In [43]:
# What proportion of checkouts are ILL?
patron_df['Is_ILL'].value_counts()
1488 / 75481

0.019713570302460223

In [73]:
patron_df.shape

(76969, 7)