James Caldwell, UVA IRA, Feb 2026 <br>
This script loads three Excel files exported from Qlik (enrollment, admissions, and degrees) and calculates the various groupings for the College Graduate Survey. About 700 line items are calculated.

In [None]:
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os

# Blank survey template; the first line of the CSV is skipped on read.
template = pd.read_csv(
    'CGS-survey_template-2025.csv',
    skiprows=1,
    encoding="cp1252",
)

# The paths of the three Excel extracts come from environment variables
# (load_dotenv() is called first so they can be supplied by the environment file).
load_dotenv()
enrollment_file_path = os.getenv('enrollment')
degrees_file_path = os.getenv('degrees')
admissions_file_path = os.getenv('admissions')

# One DataFrame per extract.
enrollment = pd.read_excel(enrollment_file_path)
degrees = pd.read_excel(degrees_file_path)
admissions = pd.read_excel(admissions_file_path)

# CGS wants every awarded degree counted, but for admissions and enrollment
# only the highest level per student is reported.

# Degree levels from highest to lowest priority; this order decides which row
# survives when a student has several rows.
degree_order = ['DOCT', 'MAST', 'EDS', 'CERT', "SP"]


def _ranked_degree_levels(levels):
    """Return `levels` as an ordered Categorical ranked by `degree_order`.

    pd.Categorical replaces every value that is missing from `categories`
    with NaN, which would erase levels outside `degree_order` (for example
    "COS", which MASTERS_OTHER filters on). Such levels are therefore
    appended after the known ones: they still sort last, as NaN did, but keep
    their label so later filters can match them. Genuinely missing values
    stay NaN.
    """
    known = list(degree_order)
    extras = sorted(
        (level for level in levels.dropna().unique() if level not in known),
        key=str,
    )
    return pd.Categorical(levels, categories=known + extras, ordered=True)


### Degrees: keep one row per student AND degree level (every award counts)
degrees["Degree Level"] = _ranked_degree_levels(degrees["Degree Level"])
degrees.sort_values(by=['Student System ID', "Degree Level"], inplace=True)
degrees.drop_duplicates(subset=['Student System ID', "Degree Level"],
                        keep='first',
                        inplace=True)

### Enrollment: keep one row per student, the highest-priority degree level
enrollment.rename(columns={"Primary Degree Level": "Degree Level"}, inplace=True)
enrollment["Degree Level"] = _ranked_degree_levels(enrollment["Degree Level"])
enrollment.sort_values(by=['Student System ID', "Degree Level"], inplace=True)
enrollment.drop_duplicates(subset=['Student System ID'], keep='first', inplace=True)

### Admissions: keep one row per student
admissions["Degree Level"] = _ranked_degree_levels(admissions["Degree Level"])
# Category order puts "Y" before "N" and 1 before 0, so within a student the
# sort prefers an enrolled row, then the highest degree level, then an offer.
admissions["Enrollment Flag"] = pd.Categorical(admissions["Enrollment Flag"], categories=["Y", "N"], ordered=True)
admissions["OfferFlag"] = pd.Categorical(admissions["OfferFlag"], categories=[1, 0], ordered=True)
admissions.sort_values(
    by=["Student System ID", "Enrollment Flag", "Degree Level", "OfferFlag"],
    inplace=True)
admissions.drop_duplicates(subset=['Student System ID'], keep='first', inplace=True)


In [38]:
# Degree levels that count towards the "Master's and other" survey category.
MASTERS_OTHER = ["MAST", "COS", "EDS", "CERT", "SP"]

# Survey gender wording (keys) -> gender codes used in the Qlik data (values).
GENDER_MAP = dict(
    men=["M"],
    women=["F"],
    other=["X", "U"],
    total=["M", "F", "X", "U"],
)

# Survey country wording (keys) -> country label used in the Qlik data
# (values). Both sides are lower-case; only names that differ are listed.
COUNTRY_MAP = dict([
    ("south korea", "korea, republic of"),
    ("vietnam", "viet nam"),
    ("russia", "russian federation"),
    ("iran", "iran (islamic republic of)"),
    ("syria", "syrian arab republic"),
    ("taiwan", "taiwan, republic of china"),
    ("laos", "lao people's democratic rep"),
    ("moldova", "moldova, republic of"),
    ("libya", "libyan arab jamahiriya"),
    ("palestine", "palestinian territory, occupie"),
    ("democratic republic of the congo", "congo, the democratic republic"),
    ("turkey/türkiye", "turkey"),
])


In [None]:
# Typographic characters replaced by their ASCII equivalents before keyword
# matching, so "first–time" / "first—time" are recognised like "first-time".
_ASCII_PUNCTUATION = str.maketrans({
    "\u2010": "-",  # hyphen
    "\u2011": "-",  # non-breaking hyphen
    "\u2012": "-",  # figure dash
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
    "\u2212": "-",  # minus sign
    "\u2019": "'",  # right single quotation mark
})

# Question keyword -> "IPEDS Race" label. Checked in order; first match wins.
_RACE_KEYWORDS = (
    ("hispanic", "Hispanic"),
    ("american indian", "American Indian or Alaska Native"),
    ("asian", "Asian"),
    ("black", "Black or African American"),
    ("hawaiian", "Native Hawaiian or Other Pacific Islander"),
    ("white", "White"),
    ("two or more", "Multi-Race"),
    ("international", "Non-Resident Alien"),
    ("unknown", "Race and Ethnicity Unknown"),
)

# Topics that are not tracked for graduate admissions; answered with a blank.
_UNTRACKED_TOPICS = ('rural', 'disability', 'veteran', 'first-generation')


def count_from_question(question, df):
    """Count the distinct students in `df` that a survey question asks about.

    The question text is lower-cased and scanned for keywords; each keyword
    found narrows `df` with the matching filter (first-time status, degree
    level, full/part time, gender, race, citizenship country, offer flag).

    Args:
        question: Survey question text. Anything that is not a string
            (e.g. NaN from an empty template cell) is treated as skipped.
        df: DataFrame holding the columns the matched keywords filter on,
            plus "Student System ID".

    Returns:
        int: number of unique "Student System ID" values left after
            filtering (0 when nothing is left, for "countries not listed"
            questions, and for Saudi Arabia / Indonesia when the country is
            absent from the data).
        '-': the question was skipped: it is not a string, it names a
            country that is not in the data, or no filter removed any row
            and it is not one of the overall-total questions.
        '': the question is about a topic that is not tracked.
    """
    if not isinstance(question, str):
        return '-'

    question = question.strip().lower().translate(_ASCII_PUNCTUATION)

    # No copy needed: every filter below builds a new frame and never mutates.
    sub = df

    # --- First-time vs Total ---
    if "first-time" in question:
        sub = sub[sub["New/Returning/Transfer Status"] == "New"]

    # --- Degree level (a question naming both master's and doctoral keeps both) ---
    if not ("master" in question and "doct" in question):
        if "master" in question:
            if "other" in question:
                sub = sub[sub["Degree Level"].isin(MASTERS_OTHER)]
            else:
                sub = sub[sub["Degree Level"] == "MAST"]
        if "doct" in question:
            sub = sub[sub["Degree Level"] == "DOCT"]
        if "certificate" in question:
            sub = sub[sub["Degree Level"] == "CERT"]

    # --- Full-time / Part-time (a question naming both keeps both) ---
    if not ("full-time" in question and "part-time" in question):
        if "full-time" in question:
            sub = sub[sub["Full or Part Time Status"] == "Full-Time"]
        if "part-time" in question:
            sub = sub[sub["Full or Part Time Status"] == "Part-Time"]

    # --- Gender ---
    # Strip the "other" that belongs to "Master's & other" so that only an
    # "other" referring to gender is left to match.
    gender_question = question.replace("& other", "")
    gender_question = gender_question.replace("master's and other", "master's")
    for label, codes in GENDER_MAP.items():
        # The leading space keeps " men" from matching inside "women".
        if f" {label}" in gender_question:
            sub = sub[sub["Gender"].isin(codes)]

    # --- Ethnicity ---
    for keyword, race in _RACE_KEYWORDS:
        if keyword in question:
            sub = sub[sub["IPEDS Race"] == race]
            break
    else:
        if "citizens and permanent" in question:
            sub = sub[sub["IPEDS Race"] != "Non-Resident Alien"]

    # --- Country-based questions ---
    parts = [p.strip() for p in question.split(",")]
    if len(parts) > 2:  # questions with three or more parts are country-based
        if "countries not listed" in question:
            return 0
        # The country is always the second part of the question.
        country = COUNTRY_MAP.get(parts[1], parts[1])
        country_series = sub["Foreign Citizen Country Desc"].str.strip().str.lower()
        matches = country_series == country

        if matches.any():
            sub = sub.loc[matches]
        elif "total international students" not in question:
            # (The international-student totals also have three parts but
            # need no country filter, hence the condition above.)
            if 'saudi arabia' in question or 'indonesia' in question:
                # Zero is reported only where it is known for sure that there
                # are none, e.g. a few Saudi questions in fall '25.
                return 0
            return '-'

    # --- Admissions ---
    if 'offers' in question:
        sub = sub[sub["OfferFlag"] == 1]
    if 'not accepted' in question:
        sub = sub[sub["OfferFlag"] == 0]

    # Skipping these as we do not track them for graduate admissions.
    if any(topic in question for topic in _UNTRACKED_TOPICS):
        return ''

    # `sub` is only ever narrowed by row filters, so an unchanged row count
    # means no filter removed anything. Only the overall totals may pass
    # through like that; anything else is treated as an unrecognised question.
    if len(sub) == len(df):
        if not ("total total" in question or "total applications, total" in question):
            return '-'
    if sub.empty:
        return 0

    # NOTE(review): this counts distinct students, so a student with rows at
    # several degree levels is counted once when the question does not filter
    # on level. Confirm that is intended for the degrees extract.
    return sub["Student System ID"].nunique()

results = {}

# Row 3 receives the computed answers. Assigning -1 to the whole row creates
# it if it is missing and seeds every column with a placeholder.
template.loc[3] = -1

# Words in the question text that identify which extract answers it.
# Checked in order; the first group with a match wins.
_EXTRACT_BY_KEYWORD = (
    (("Enrollment",), enrollment),
    (("Degree", "Certificates"), degrees),  # "Degree" also covers "Degrees"
    (("Admission", "Applications"), admissions),
)

# The question text sits in the row at position 2 of the template.
for col, question in template.iloc[2, :].items():
    # Fallback: questions without any keyword should be enrollment
    # sections IV and V. Non-string cells fall through to enrollment too.
    extract = enrollment
    if isinstance(question, str):
        for keywords, frame in _EXTRACT_BY_KEYWORD:
            if any(word in question for word in keywords):
                extract = frame
                break
    template.loc[3, col] = count_from_question(question, extract)

# Transposed view of the filled-in template (one question per row).
template_long = template.T

# template_long.to_csv('CGS-survey-2025-v2_long.csv', header=False)
template.to_csv('CGS-survey-2025_UVA_Submission_v3.csv', index=False, header=False)