In [1]:
import pandas as pd
import numpy as np

# Load the raw patron checkout log and the raw collection list.
# Forward slashes are used so the relative paths resolve on Windows, macOS and Linux alike
# (a literal backslash is only a path separator on Windows).
patron_df = pd.read_excel('Data/Raw_Data/Patron_Checkouts.xlsx')
# NOTE(review): 'unicode_escape' also interprets backslash sequences inside the data;
# confirm the file's real encoding (e.g. 'latin-1' or 'cp1252') and prefer that if known.
inventory_df = pd.read_csv('Data/Raw_Data/Collection_List.csv', encoding = 'unicode_escape')

In [2]:
# Discard the columns that are not needed from either frame
patron_df = patron_df.drop(columns = ['Holdings Barcode', 'Unnamed: 5', 'Unnamed: 6'])
inventory_df = inventory_df.drop(columns = ['Media_Type', 'Identity', 'Count'])

# Map the raw patron column headers to underscore-style names
patron_column_names = {
    'Patron Barcode': 'Patron_Barcode',
    'Author\'s Name': 'Author',
    'Date of Action': 'Date',
}
patron_df = patron_df.rename(columns = patron_column_names)

In [3]:
# Split the inventory 'Author' value on commas and keep only the first two pieces:
# piece 0 becomes Author_Last, piece 1 becomes Author_First; any further pieces are discarded.
inventory_author_parts = inventory_df['Author'].str.split(',', expand = True)

# Select pieces 0 and 1 by label instead of dropping a hard-coded list of extra columns:
# the number of columns produced by expand=True depends on the value with the most commas,
# so a fixed drop list breaks as soon as the data changes. reindex yields NaN for a missing piece.
# The pieces are not whitespace-stripped, so the text after a ", " keeps its leading space.
inventory_df[['Author_Last', 'Author_First']] = inventory_author_parts.reindex(columns = [0, 1])
inventory_df = inventory_df.drop('Author', axis = 1)

In [4]:
# Split the patron 'Author' value on commas and keep only the first two pieces:
# piece 0 becomes Author_Last, piece 1 becomes Author_First; any further pieces are discarded.
patron_author_parts = patron_df['Author'].str.split(',', expand = True)

# Select pieces 0 and 1 by label instead of dropping a hard-coded list of extra columns:
# the number of columns produced by expand=True depends on the value with the most commas,
# so a fixed drop list breaks as soon as the data changes. reindex yields NaN for a missing piece.
# The pieces are not whitespace-stripped, so the text after a ", " keeps its leading space.
patron_df[['Author_Last', 'Author_First']] = patron_author_parts.reindex(columns = [0, 1])
patron_df = patron_df.drop('Author', axis = 1)

In [5]:
# Text columns that are normalised in both frames
cols = ['Title', 'Author_First', 'Author_Last']

# Matches every character that is neither a word character nor whitespace.
# Written as a raw string so the backslashes reach the regex engine as-is; the non-raw
# form relies on invalid string escapes, which Python warns about.
punctuation_pattern = r'[^\w\s]'

# Strip punctuation and lower-case each text column, applying the same steps to both frames
for frame in (patron_df, inventory_df):
    for col in cols:
        frame[col] = frame[col].str.replace(punctuation_pattern, '', regex = True).str.lower()

In [6]:
# Among rows whose title occurs more than once, discard those with no author name at all.
# A row is kept when its title is unique in the inventory, or when at least one of
# Author_First / Author_Last is present.
# NOTE(review): if every copy of a repeated title lacks both author fields, all of those
# rows are removed and the title disappears from the inventory -- confirm this is intended.
title_is_unique = ~inventory_df['Title'].duplicated(keep=False)
has_author_name = inventory_df[['Author_First', 'Author_Last']].notnull().any(axis=1)
inventory_df = inventory_df[title_is_unique | has_author_name]

In [7]:
# Remove rows that are exact duplicates of an earlier row (first occurrence is kept).
# Results are re-assigned instead of using inplace=True so each frame is a fresh object.
inventory_df = inventory_df.drop_duplicates(keep = 'first')
patron_df = patron_df.drop_duplicates(keep = 'first')

# Drop rows that are missing a required field
inventory_df = inventory_df.dropna(subset = ['Title'])
patron_df = patron_df.dropna(subset = ['Patron_Barcode', 'Title', 'Date'])

# Fill empty author names in patron_df from the inventory entry with the same Title.
# DataFrame.fillna(other_frame) aligns on row index labels, which are unrelated between the
# two frames, so the lookup is keyed on Title explicitly.
author_cols = ['Author_First', 'Author_Last']
author_lookup = (
    inventory_df[['Title'] + author_cols]
    .dropna(subset = author_cols, how = 'all')      # entries without any author cannot fill anything
    .drop_duplicates()                              # collapse repeated (title, author) combinations
    .drop_duplicates(subset = 'Title', keep = False)  # skip titles that map to more than one author
    .set_index('Title')
)
patron_titles = patron_df['Title']
for author_col in author_cols:
    # Existing values are kept; only missing ones take the inventory value for that title
    patron_df[author_col] = patron_df[author_col].fillna(patron_titles.map(author_lookup[author_col]))

In [8]:
# Series holding the title of every inventory row
inventory = inventory_df['Title']

# Keep only the checkout entries whose title appears in the inventory
patron_df = patron_df.loc[patron_df['Title'].isin(inventory)]

In [9]:
# Print the row count of each frame, one per line: inventory first, then patron checkouts
for frame in (inventory_df, patron_df):
    print(len(frame))

25387
77674


In [10]:
# Export both frames as CSV files without the index column.
# The destination directory is defined once so it can be changed in a single place.
# NOTE(review): the default is an absolute, user-specific Windows directory; it is kept
# unchanged here, but should be pointed at a project-relative folder before sharing the notebook.
OUTPUT_DIR = 'C:/Users/Ben/Desktop'

inventory_df.to_csv(f'{OUTPUT_DIR}/Clean_Inventory.csv', index = False)
patron_df.to_csv(f'{OUTPUT_DIR}/Clean_Patrons.csv', index = False)