In [29]:
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os

# print(os.getcwd())

template = pd.read_csv('CGS-survey-template-2025.csv',
                       encoding="cp1252",
                       skiprows=1)

# Load environment variables
load_dotenv()
enrollment_file_path = os.getenv('enrollment') 
enrollment = pd.read_excel(enrollment_file_path)
degrees_file_path = os.getenv('degrees') 
degrees = pd.read_excel(degrees_file_path)

# Define the custom order
degree_order = ['DOCT', 'MAST', 'EDS', 'CERT']
# Convert the column to a Categorical with that order
degrees['SCHEV CIP'] = pd.Categorical(degrees['SCHEV CIP'], categories=degree_order, ordered=True)
# Now sort by SSID first, then by the categorical degree
degrees.sort_values(by=['Student System ID', 'SCHEV CIP'], inplace=True)
degrees.drop_duplicates(subset=['Student System ID'], keep='first', inplace=True)

degrees.rename(columns={"Degree Level": "Primary Degree Level"}, inplace=True)



In [30]:
MASTERS_OTHER = ["MAST", "COS", "EDS", "CERT"]
DOCTORAL = ["DOCT"]

GENDER_MAP = {
    "men": ["M"],
    "women": ["F"],
    "other other": ["X", "U"],
    "total": ["M", "F", "X", "U"]
}

In [None]:
def count_from_question(question, df):
    sub = df.copy()
    question = question.strip().lower().replace("–", "-").replace("-", "-")
    # print(question)
    # --- First-time vs Total ---
    if "first-time" in question:
        # print("Filtering for first-time students...")
        sub = sub[sub["New/Returning/Transfer Status"] == "New"]

    # --- Degree level ---
    if "master" in question:
        sub = sub[sub["Primary Degree Level"].isin(MASTERS_OTHER)]
    elif "doctorate" in question:
        sub = sub[sub["Primary Degree Level"].isin(DOCTORAL)]

    # --- Full-time / Part-time ---
    if "full-time" in question:
        sub = sub[sub["Full or Part Time Status"] == "Full-Time"]
    elif "part-time" in question:
        sub = sub[sub["Full or Part Time Status"] == "Part-Time"]

    # --- Gender ---
    for label, codes in GENDER_MAP.items():
        if f" {label}" in question:
            # print(label)
            sub = sub[sub["Gender"].isin(codes)]

    # --- Ethnicity ---
    

    # --- Country-based questions ---
    parts = [p.strip() for p in question.split(",")]
    if len(parts) > 2: # 3 part questions are country-based
        possible_country = parts[1]

        country_series = enrollment["Foreign Citizen Country Desc"].str.strip().str.lower()
        possible_country_norm = possible_country.strip().lower()

        if possible_country_norm in country_series.unique():
            sub = sub.loc[country_series == possible_country_norm]
        else:
            print('skipped: ' + question)
            return '-'
        
    if df.equals(sub):
        print('skipped: ' + question)
        return '-'

    return sub["Student System ID"].nunique()

results = {} 
# ensure row 3 exists
template.loc[3] = -1

for col, question in template.iloc[2, :].items():
    if isinstance(question, str) and "Enrollment" in question:
        template.loc[3, col] = count_from_question(question, enrollment)
        results[question] = count_from_question(question, enrollment)
    elif isinstance(question, str) and ("Degree" in question or "Certificates" in question):
        template.loc[3, col] = count_from_question(question, degrees)
        results[question] = count_from_question(question, degrees)
    elif 0:
        print('yay')
        #do admission/applications here
    else: # these should be enrollment section IV and V
        template.loc[3, col] = count_from_question(question, enrollment)
        results[question] = count_from_question(question, enrollment)    



skipped: graduate certificates, other
skipped: graduate certificates, other
skipped: graduate certificates, total
skipped: graduate certificates, total
skipped: rural status, first-time enrollment
skipped: rural status, first-time enrollment
skipped: disability status, first-time enrollment
skipped: disability status, first-time enrollment
skipped: veteran status, first-time enrollment
skipped: veteran status, first-time enrollment
skipped: first-generation status, first-time enrollment
skipped: first-generation status, first-time enrollment
skipped: first-time enrollment, iran, master’s and other
skipped: first-time enrollment, iran, master’s and other
skipped: first-time enrollment, iran, doctorate
skipped: first-time enrollment, iran, doctorate
skipped: first-time enrollment, iran, total
skipped: first-time enrollment, iran, total
skipped: first-time enrollment, south korea, master’s and other
skipped: first-time enrollment, south korea, master’s and other
skipped: first-time enroll

In [38]:
template.shape

(4, 794)

In [42]:
len(results)

226

In [21]:
template.iloc[3,:].value_counts()

-1     580
-       46
2        8
6        8
7        7
      ... 
108      1
180      1
69       1
89       1
48       1
Name: 3, Length: 73, dtype: int64