In [1]:
import pandas as pd

raw_df = pd.read_csv("input.csv")
expected_df = pd.read_csv("expected.csv")

In [2]:
raw_df.columns.to_list()

['holdings_id',
 'instance_created_date',
 'instance_id',
 'instance_primary_contributor',
 'instance_title',
 'instance_updated_date',
 'item_barcode',
 'item_level_call_number',
 'item_level_call_number_typeid',
 'item_level_call_number_type_name',
 'item_copy_number',
 'item_created_date',
 'item_effective_call_number',
 'item_effective_call_number_typeid',
 'item_effective_call_number_type_name',
 'item_effective_library_code',
 'item_effective_library_id',
 'item_effective_library_name',
 'item_effective_location_id',
 'item_effective_location_name',
 'item_hrid',
 'id',
 'item_material_type_id',
 'item_material_type',
 'item_permanent_location_id',
 'item_permanent_location_name',
 'item_statistical_code_ids',
 'item_statistical_codes',
 'item_status',
 'item_temporary_location_id',
 'item_temporary_location_name',
 'item_updated_date']

In [3]:
raw_call_numbers = raw_df["item_effective_call_number"]
expected_call_numbers = expected_df["item_effective_call_number"]

In [4]:
sorted_call_numbers = raw_call_numbers.sort_values().reset_index(drop=True)
expected_call_numbers_copy = expected_call_numbers.reset_index(drop=True)

my_sort = pd.DataFrame({
    "sorted_call_numbers": sorted_call_numbers,
    "expected_call_numbers": expected_call_numbers_copy
})


The raw call numbers need cleaning. When comparing values, some have missing decimal places, like this:

my_sort

| Index | Raw Call Number         | Expected Call Number    |
|-------|-------------------------|-------------------------|
| 0     | 137 529 THS, 1          | 137.529 THS, 1         |
| 1     | 138 093 THS, 1          | 138.093 THS, 1         |

and some values are sorted correctly because the sort is done as a string comparison instead of the LoC way

| Index | Raw Call Number              | Expected Call Number        |
|-------|------------------------------|-----------------------------|
| 9     | B105.A8 E24 2004 VideoDVD, 1 | B72.G73 A7 2008 VideoDVD, 1 |
| 10    | B105.W6 F465 2009 VideoDVD, 1 | B105.A8 E24 2004 VideoDVD, 1 |
| 11    | B105.W6 F465 2009 VideoDVD, 1 | B105.W6 F465 2009 VideoDVD, 1 |
| 12    | B105.W6 F465 2009 VideoDVD, 1 | B105.W6 F465 2009 VideoDVD, 1 |
| 13    | B105.W6 F465 2009 VideoDVD, 1 | B105.W6 F465 2009 VideoDVD, 1 |
| 14    | B105.W6 F465 2009 VideoDVD, 1 | B105.W6 F465 2009 VideoDVD, 1 |
| 15    | B105.W6 F465 2009 VideoDVD, 1 | B105.W6 F465 2009 VideoDVD, 1 |
| 16    | B108 .R66 2002 VideoDVD, 1   | B105.W6 F465 2009 VideoDVD, 1 |
| 17    | B108 .R66 2002 VideoDVD, 1   | B108.R66 2002 VideoDVD, 1   |



In [None]:
import re

def normalize_call_number(call_number):
    # Remove space between opening letters and numbers (e.g., "PR 1109" → "PR1109")
    call_number = re.sub(r'([A-Z]+) (\d+)', r'\1\2', call_number)

    # Remove space before the first period (e.g., "PR1109 .A2" → "PR1109.A2")
    call_number = re.sub(r'(\d) \.', r'\1.', call_number)

    return call_number

normalized_raw_callnumbers = raw_call_numbers.apply(normalize_call_number)

normalization_compare = pd.DataFrame({
    "raw": raw_call_numbers,
    "normalized": normalized_raw_callnumbers
})

normalization_compare = normalization_compare[normalization_compare["raw"] != normalization_compare["normalized"]]

normalization_compare.reset_index(drop=True, inplace=True)

In [None]:
import re

def fix_old_call_number_format(call_number):
    # Remove space between opening letters and numbers (e.g., "PR 1109" → "PR1109")
    call_number = re.sub(r'([A-Z]+) (\d+)', r'\1\2', call_number)

    # Remove space before the first period (e.g., "PR1109 .A2" → "PR1109.A2")
    call_number = re.sub(r'(\d) \.', r'\1.', call_number)

    return call_number

# Example usage
raw_call_number = "PN1995.9.C47 T8 2005 VideoDVD, 1"
fixed_call_number = fix_old_call_number_format(raw_call_number)

print(fixed_call_number)  # Expected: "PR1109.A2 2006"


PN1995.9.C47 T8 2005 VideoDVD, 1


Post Cleaning, lets sort with an LoC library

In [None]:
import re
import pandas as pd

def loc_sort_key(call_number):
    """
    Generate a key for sorting LoC call numbers.
    - Splits letters, numbers, and decimals correctly.
    - Ensures proper numeric sorting.
    """
    parts = re.findall(r'[A-Za-z]+|\d+\.\d+|\d+', call_number)
    return [(part if part.isalpha() else float(part)) for part in parts]

# Sample DataFrame
normalization_compare = pd.DataFrame({
    "raw": ["PR 1109 .A2 2006", "PN 1995.9 .C47 T8 2005", "AE 17 .Z4672 2009"],
    "normalized": ["PR1109.A2 2006", "PN1995.9.C47 T8 2005", "AE17.Z4672 2009"]
})

# Sort based on the normalized LoC call numbers
normalization_compare = normalization_compare.sort_values(by="normalized", key=lambda x: x.map(loc_sort_key))

# Reset index for clean display
normalization_compare.reset_index(drop=True, inplace=True)

TypeError: '<' not supported between instances of 'float' and 'str'