James Caldwell <br>
UVA IRA <br>
FVTGE FA Item Type Mapping Review <br>
Aug 2025 <br>

Steps: <br>
Import both Excel files, remove rows with ??? from the Aug25 map, vertically concatenate the data frames, strip leading/trailing whitespace from text values, drop duplicate rows, create the "Expanded - Keyword 2" flag, flag rows whose "Include: Tuition & Fees" was previously blank, save to Excel

In [None]:
import pandas as pd
import numpy as np

# Build the combined item-type mapping: load both workbooks, tag each row's origin,
# stack them, clean text, de-duplicate, flag what changed, and export to Excel.


def _strip_text_cells(frame):
    """Return a copy of `frame` with surrounding whitespace removed from every str cell.

    Only cells that are actual `str` objects are modified. Calling `.str.strip()` on a
    mixed object column turns non-string cells (numbers, booleans) into NaN, and it can
    raise AttributeError when an object column holds only non-string values.
    """
    cleaned = frame.copy()
    for col in cleaned.columns:
        if cleaned[col].dtype == object:
            # Element-wise so non-string cells pass through untouched;
            # astype(object) keeps the column's original dtype.
            cleaned[col] = cleaned[col].map(
                lambda cell: cell.strip() if isinstance(cell, str) else cell
            ).astype(object)
        elif pd.api.types.is_string_dtype(cleaned[col].dtype):
            # Dedicated string dtypes hold only strings / missing values, so .str is safe
            cleaned[col] = cleaned[col].str.strip()
    return cleaned


# 1. Import data
Aug25_map = pd.read_excel(r'\...item_type_summary_2025_08_20.xlsx', sheet_name='Sheet1')
original_mapping = pd.read_excel(r'\...item_detail_fees_charges_item_types.xlsx', sheet_name='Sheet1')

# 2. Clean data: tag each row with its source, then drop unresolved ('???') rows from the Aug25 map
Aug25_map['2025 Status'] = 'New Item Type'
original_mapping['2025 Status'] = 'Existing Item Type'
    # Compare on stripped text so padded placeholders such as '??? ' are removed as well
include_as_text = Aug25_map['Include: Tuition & Fees'].astype(str).str.strip()
Aug25_map = Aug25_map[include_as_text != '???']

# 3. Stack both DataFrames (original mapping first, so its rows win ties in later steps)
combined = pd.concat([original_mapping, Aug25_map], ignore_index=True)

# 4. Strip whitespace from text cells (non-string cells are left untouched)
combined = _strip_text_cells(combined)

# 5. Drop rows that appear in both, ignoring 2025 Status column (keeping the first occurrence, from the original mapping)
exclusive = combined.drop_duplicates(
    subset=[col for col in combined.columns if col != "2025 Status"],
    keep='first'
)
    # Stable sort: rows sharing an Item Type keep their stacked order (original mapping first),
    # which the keep='first' logic in step 6 relies on
exclusive = exclusive.sort_values(by='Item Type', kind='mergesort').reset_index(drop=True)


# 6. Create flag for "Expanded - Keyword 2", if two rows are identical except for "Item Type Keyword 2"
cols_to_check = [col for col in exclusive.columns if col not in ["Item Type Keyword 2", "2025 Status"]]
    # Create a boolean mask for duplicated rows (excluding the first occurrence)
mask = exclusive.duplicated(subset=cols_to_check, keep='first')
    # Assign "Expanded - Keyword 2" only to those rows
exclusive.loc[mask, "2025 Status"] = "Expanded - Keyword 2"

# 7. If two rows are identical except for "Include: Tuition & Fees" where one is empty, flag the one with non-empty "Include: Tuition & Fees"
    # The rows with blanks were then manually deleted in excel after the script was run
    # Columns that must match for two rows to count as the same item
cols_to_check = [col for col in exclusive.columns if col not in ["2025 Status", "Include: Tuition & Fees"]]
    # True where "Include: Tuition & Fees" holds a real value (neither missing nor an empty string)
include_col = exclusive['Include: Tuition & Fees']
is_filled = include_col.notna() & (include_col != '')
    # Distinct key combinations that have at least one blank row
blank_keys = exclusive.loc[~is_filled, cols_to_check].drop_duplicates()
    # Left-merge every row's keys against the blank keys; '_merge' == 'both' marks rows with a blank twin
twin_lookup = exclusive[cols_to_check].merge(blank_keys, how='left', on=cols_to_check, indicator=True)
assert len(twin_lookup) == len(exclusive), "blank-twin lookup must stay one row per item row"
    # Flag only filled rows that have a blank twin; rows that merely disagree on a non-blank value are not flagged
mask = is_filled & (twin_lookup['_merge'] == 'both').to_numpy()
exclusive.loc[mask, "2025 Status"] = "Updated Include: Tuition & Fees (previously blank)"
    # Row order is unchanged since step 5, so no re-sort is needed before export

# 8. Save to excel
output_path = r'...\item_type_summary_2025_08_20_combined2.xlsx'
exclusive.to_excel(output_path, index=False)


In [4]:
# Spot-check: show every row of the combined mapping for the listed Item Type codes
item_types_to_inspect = [101316000015]
exclusive.loc[exclusive['Item Type'].isin(item_types_to_inspect)]


Unnamed: 0,Item Type,Item Type Desc,Item Type Keyword 1,Item Type Keyword 2,Item Type Keyword 3,Include: Tuition & Fees,2025 Status
112,101316000015,Tuition-Arts&Sciences Non-VA,TUITION,...,...,True,Existing Item Type
113,101316000015,Tuition-Arts&Sciences Non-VA,TUITION,TUITUGDIFF,...,True,Expanded - Keyword 2
