James Caldwell <br>
UVA IRA, 10/8/2025 <br>

This Python script automates the extraction of error tables from the SCHEV Institutional Portal and merges them with a VCSIN-to-SSID mapping file. The final result is exported to an Excel workbook with each error code on a separate sheet.

Features: <br>
Opens the SCHEV portal in a Chrome browser via Selenium.

Allows manual login for secure access.

Extracts links corresponding to error codes.

Follows each link to retrieve error tables.

Merges error tables with a local VCSIN-to-SSID Excel mapping.

Saves all merged tables to a single Excel workbook with meaningful sheet names.

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
from io import StringIO
import re
from dotenv import load_dotenv
import os

# Variables to change
url = "https://portals.schev.edu/institutions/PUBLIC/UVA/Viewerrorsummary.asp?tablename=fa&repyear=2425"

# File locations are read from environment variables (optionally supplied via a .env file).
load_dotenv()
VCSIN_to_SSID_path = os.getenv('VCSIN_to_SSID_path')
# This won't run on the V: drive. Put in folder on personal computer.
# Download from: https://googlechromelabs.github.io/chrome-for-testing/
chrome_driver_path = os.getenv('chrome_driver_path')
excel_output_path = os.getenv('excel_output_path')
meta_data_path = os.getenv('meta_data_path')

# Fail fast on missing settings: without this check an unset excel_output_path
# would only surface after every link had already been scraped.
# chrome_driver_path is deliberately not checked; it is handed to Service as-is.
missing_settings = [
    name
    for name, value in (
        ('VCSIN_to_SSID_path', VCSIN_to_SSID_path),
        ('excel_output_path', excel_output_path),
        ('meta_data_path', meta_data_path),
    )
    if not value
]
if missing_settings:
    raise ValueError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Open the login page
driver.get(url)

print("Browser opened. Please log in manually.")
# The typed value is used later as the version suffix of the "Comments V<n>" column,
# so surrounding whitespace is stripped to keep that column header clean.
version_number = input("Type input number and ENTER here *after* you have logged in successfully...").strip()

# After you log in, reload the summary page and parse it
driver.get(url)
html = driver.page_source
mysoup = BeautifulSoup(html, 'html.parser')

# Load VCSIN to SSID mapping file
print("Loading VCSIN to SSID mapping file...")
VCSIN_to_SSID = pd.read_excel(VCSIN_to_SSID_path)

# Load meta data file (This has error descriptions and links)
print("Loading meta data file...")
meta_data = pd.read_excel(meta_data_path)

# === Extract all links ===
# Collect the absolute URL of every anchor on the summary page that points at an
# error detail page (identified by the case-sensitive substring 'viewError').
links = []
# Read the current URL once; each access is a round trip to the browser.
summary_page_url = driver.current_url
for a in mysoup.find_all('a', href=True):
    href = a['href']
    # Skip empty, anchor, or javascript links
    if not href or href.startswith('#') or href.lower().startswith('javascript'):
        continue
    # Convert relative URLs to absolute. urljoin also copes with '/path',
    # '../path' and '//host/path' hrefs, and returns absolute hrefs unchanged.
    full_link = urljoin(summary_page_url, href)
    if 'viewError' in full_link:  # filter for relevant links
        links.append(full_link)

print(f"\nFound {len(links)} error code links:")

# Parallel lists consumed by the Excel export: entry k of the names list is the
# sheet name for entry k of the tables list. They are only ever appended to
# together, after a table has been fetched and parsed successfully, so a failure
# part-way through one link can never shift the names relative to the tables.
all_error_tables_for_excel = []
all_error_table_names_for_excel = []

# Base used to resolve the (relative) error table URL found on each error page.
error_table_base_url = "https://portals.schev.edu/institutions/PUBLIC/UVA/"

# === Visit each link ===
for i, link in enumerate(links, start=1):
    print(f"\nVisiting link {i}/{len(links)}: {link}")
    try:
        driver.get(link)
        time.sleep(1)  # wait for page to load; adjust as needed

        page_html = driver.page_source
        page_soup = BeautifulSoup(page_html, 'html.parser')

        # Find the first tag whose src attribute points at the error table page
        link_tag = page_soup.find(src=lambda value: value and "ErrorsList.asp" in value)
        if not link_tag:
            print("No match found.")
            continue

        # Error code becomes the Excel sheet name
        match = re.search(r'Errcode=([^&"]+)', page_html)
        if match:
            error_code = match.group(1)
            print(f'Error Code: {error_code}')
        else:
            # Still keep the table, under a placeholder name, so the names list
            # stays aligned with the tables list.
            error_code = f'UNKNOWN_{i}'
            print(f'Error code not found on page; using sheet name {error_code}')

        # Follow the error table URL (converted to absolute) and grab its HTML
        table_url = urljoin(error_table_base_url, link_tag['src'])
        driver.get(table_url)
        time.sleep(1)  # wait for page to load; adjust as needed
        table_page_html = driver.page_source

        # Parse tables with pandas; any failure here is caught below and the
        # link is skipped without recording a name or a table.
        tables = pd.read_html(StringIO(table_page_html), header=0)
        df = tables[0]  # Get the first table, adjust index if needed

        # sometimes it's socsec1, sometimes SOCSEC1. Capitalize it if needed.
        df.columns = [str(c).upper() if str(c).lower() == 'socsec1' else c for c in df.columns]

        if 'SOCSEC1' in df.columns:
            # Left join keeps every error row even when the mapping has no match.
            df = df.merge(
                VCSIN_to_SSID,
                how='left',
                left_on='SOCSEC1',
                right_on='VCSIN'
            ).drop(columns='VCSIN')

        # Empty column for reviewers' comments on this version of the report
        df[f'Comments V{version_number}'] = ''

        all_error_table_names_for_excel.append(error_code)
        all_error_tables_for_excel.append(df)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining links.
        print(f"Error visiting {link}: {e}")

print("\nDone visiting all links.")

print("\nSaving to Excel...")
try:
    if len(all_error_table_names_for_excel) != len(all_error_tables_for_excel):
        print(
            f"Warning: {len(all_error_table_names_for_excel)} sheet names but "
            f"{len(all_error_tables_for_excel)} tables; only matching pairs are written."
        )

    if not all_error_tables_for_excel or not all_error_table_names_for_excel:
        # A workbook with no sheets cannot be saved, so skip the export entirely.
        print("No error tables were collected; nothing to save.")
    else:
        with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
            for sheet_name, df in zip(all_error_table_names_for_excel, all_error_tables_for_excel):
                # Write each DataFrame to a separate sheet, leaving the first
                # two worksheet rows free for the description and link.
                df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=2)

                # Put description and link in the first two rows
                try:
                    meta_row = meta_data.loc[meta_data['ErrCode'] == sheet_name]
                    description = meta_row['Description'].values[0]
                    link = meta_row['Link'].values[0]
                    worksheet = writer.sheets[sheet_name]
                    worksheet.cell(row=1, column=1).value = f"Description: {description}"
                    worksheet.cell(row=2, column=3).value = f"Link: {link}"
                except Exception as e:
                    print(f"Error adding metadata for sheet {sheet_name}: {e}")
                    # This section will probably not fail this year (2025), but could fail if new errors are seen next year that aren't in the meta data file. Add them and re-run.
    print("\nDone.")
finally:
    # Always close the browser, even when the export fails.
    driver.quit()

Browser opened. Please log in manually.
Loading VCSIN to SSID mapping file...
Loading meta data file...

Found 52 error code links:

Visiting link 1/52: https://portals.schev.edu/institutions/PUBLIC/UVA/viewErrors.asp?Err=AFA004W01&ErrType=W&TableName=fa&Repyear=2425&Position=68
Error Code: AFA004W01

Visiting link 2/52: https://portals.schev.edu/institutions/PUBLIC/UVA/viewErrors.asp?Err=AFA008W01&ErrType=W&TableName=fa&Repyear=2425&Position=74
Error Code: AFA008W01

Visiting link 3/52: https://portals.schev.edu/institutions/PUBLIC/UVA/viewErrors.asp?Err=AFE005W02&ErrType=W&TableName=fa&Repyear=2425&Position=151
Error Code: AFE005W02

Visiting link 4/52: https://portals.schev.edu/institutions/PUBLIC/UVA/viewErrors.asp?Err=AFE010E02&ErrType=E&TableName=fa&Repyear=2425&Position=179
Error Code: AFE010E02

Visiting link 5/52: https://portals.schev.edu/institutions/PUBLIC/UVA/viewErrors.asp?Err=AIN007W02&ErrType=W&TableName=fa&Repyear=2425&Position=319
Error Code: AIN007W02

Visiting link 

In [None]:
# Excel export only: uncomment and run this cell to redo the save step if the export above failed.
# excel_output_path = r"X:\SCHEV\2425 (2024-2025)\Error & Warning Reports\Download and Iteration Check scripts\output.xlsx"
# print("\nSaving to Excel...")
# with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
#     for i, df in enumerate(all_error_tables_for_excel):
#         sheet_name = all_error_table_names_for_excel[i]
        
#         # Write each DataFrame to a separate sheet
#         df.to_excel(writer, sheet_name=sheet_name, index=False,startrow=2)
        
#         # Put description and link in the first two rows
#         try:
#             description = meta_data.loc[meta_data['ErrCode'] == sheet_name]['Description'].values[0]
#             link = meta_data.loc[meta_data['ErrCode'] == sheet_name]['Link'].values[0]
#             worksheet = writer.sheets[sheet_name]
#             worksheet.cell(row=1, column=1).value = f"Description: {description}"
#             worksheet.cell(row=2, column=2).value = f"Link: {link}"
#         except Exception as e:
#             print(f"Error adding metadata for sheet {sheet_name}: {e}")
#             # This section will probably not fail this year (2025), but could fail if new errors are seen next year that aren't in the meta data file. Add them and re-run.