In [17]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import numpy as np
from pathlib import Path
pd.set_option('display.max_rows', 25990)
pd.set_option('display.max_columns', 200)

In [19]:
import pandas as pd
import numpy as np
from pathlib import Path
pd.set_option('display.max_rows', 25990)
pd.set_option('display.max_columns', 200)# ---  Load the both sheets ---

# Define the file path
BASE = Path("/content/drive/MyDrive/CP_UMBC/Raw_Data")
FILE = BASE / "RT2733808 Fin aid stdnt Success analysis data Sp2025.xlsx"

# Read all sheet names to confirm structure
xls = pd.ExcelFile(FILE)
print("Available sheets:", xls.sheet_names)




Available sheets: ['FA Stdnt Success ', 'AP Courses']


In [20]:
# Load both sheets
sheet2 = pd.read_excel(FILE, sheet_name=xls.sheet_names[1])



In [21]:
sheet2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58246 entries, 0 to 58245
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Employeeid                 58246 non-null  int64  
 1   MatricTermdescription      58246 non-null  object 
 2   EmployeeID                 58246 non-null  int64  
 3   MatricAcademicYear         58246 non-null  int64  
 4   MatricStatusOfficialDescr  58246 non-null  object 
 5   TEST_ID                    43542 non-null  object 
 6   TEST_COMPONENT             43542 non-null  object 
 7   EARN_CREDIT                43542 non-null  object 
 8   SCORE                      43542 non-null  float64
 9   EquivalencyCourseId        43542 non-null  float64
 10  Course                     43542 non-null  object 
 11  TestDescription            43542 non-null  object 
 12  UNT_TRNSFR                 43542 non-null  float64
dtypes: float64(3), int64(3), object(7)
memory usag

In [22]:
sheet2 = sheet2.copy()


In [23]:
# Make safe string versions but keep real NaN for rows with no data
sheet2["TEST_ID_clean"] = sheet2["TEST_ID"].astype(str).str.strip().replace({"nan": np.nan, "NaN": np.nan, "": np.nan})
sheet2["TEST_COMPONENT_clean"] = sheet2["TEST_COMPONENT"].astype(str).str.strip().replace({"nan": np.nan, "NaN": np.nan, "": np.nan})

# Combine into one code like AP_PY
sheet2["AP_Code"] = np.where(
    sheet2["TEST_ID_clean"].notna() & sheet2["TEST_COMPONENT_clean"].notna(),
    sheet2["TEST_ID_clean"] + "_" + sheet2["TEST_COMPONENT_clean"],
    np.nan
)



In [24]:
sheet2 = sheet2.drop(columns=["TEST_ID_clean", "TEST_COMPONENT_clean"])

In [25]:
# Make a cleaned version of TestDescription for mapping
sheet2["TestDescription_clean"] = (
    sheet2["TestDescription"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)  # collapse weird spaces
    .str.title()  # "PHYSICS C: MECHANICS" -> "Physics C: Mechanics"
)


In [26]:
ap_subject_map_final = {
    # Social
    "Psychology": "social",
    "Us Government & Politics": "social",
    "U.S. History": "social",
    "European History": "social",
    "World History": "social",
    "Human Geography": "social",
    "Microeconomics": "social",
    "Macroeconomics": "social",
    "Comprtve Government & Politics": "social",
    "Art: History": "social",
    "Research": "social",
    "Seminar": "social",

    # Language
    "Spanish Language": "language",
    "Spanish Literature": "language",
    "French Language": "language",
    "German Language": "language",
    "German": "language",
    "Chinese Language And Culture": "language",
    "Japanese Language And Culture": "language",
    "Italian Language & Culture": "language",
    "Russian Language & Culture": "language",
    "Latin: Vergil": "language",
    "Latin Literature": "language",

    # English
    "English Language & Composition": "english",
    "English Literature & Compostn": "english",

    # STEM (science)
    "Biology": "stem",
    "Chemistry": "stem",
    "Environmental Science": "stem",
    "Physics B": "stem",
    "Physics 1": "stem",
    "Physics 2": "stem",
    "Physics C: Mechanics": "stem",
    "Physics C: Electricity & Magt": "stem",

    # Math
    "Calculus Ab": "math",
    "Calculus Bc": "math",
    "Calculus Ab Subscore Grade": "math",
    "Statistics": "math",

    # Computer / Data
    "Computer Science A": "computer",
    "Computer Science Ab": "computer",
    "Computer Science Principles": "computer",

    # Art / Music
    "Studio Art: Drawing": "art",
    "Studio Art: 2-D Design": "art",
    "Studio Art: 3-D Design": "art",
    "Music Theory": "art",
    "Music: Aural Subscore": "art",
    "Music: Nonaural Subscore": "art",
}


In [27]:
sheet2["AP_SubjectCategory"] = sheet2["TestDescription_clean"].map(ap_subject_map_final)


In [28]:
unmapped = (
    sheet2.loc[sheet2["AP_SubjectCategory"].isna(), "TestDescription_clean"]
    .dropna()
    .drop_duplicates()
    .sort_values()
    .tolist()
)

print(" Unmapped AP subjects:")
for subj in unmapped:
    print(" -", subj)


 Unmapped AP subjects:
 - Nan


In [29]:
# add AP_SubjectCategory

#  Create the mapped category column
sheet2["AP_SubjectCategory"] = sheet2["TestDescription_clean"].map(ap_subject_map_final)

#  Standardize IDs as strings (prevents merge issues later)
sheet2["EmployeeID"] = sheet2["EmployeeID"].astype(str).str.strip()

#  Make sure  fields are numeric (keep real NaN if missing)
for c in ["SCORE", "UNT_TRNSFR"]:
    sheet2[c] = pd.to_numeric(sheet2[c], errors="coerce")

#  Quick sanity checks
print("AP_SubjectCategory value counts (top 10):")
print(sheet2["AP_SubjectCategory"].value_counts(dropna=False).head(10))

print("\nRows with any AP data (TEST_ID not null):", sheet2["TEST_ID"].notna().sum())
print("Unique students (EmployeeID):", sheet2["EmployeeID"].nunique())

#sheet2.head(3)


AP_SubjectCategory value counts (top 10):
AP_SubjectCategory
social      17408
NaN         14704
math         8838
stem         7172
english      5165
computer     3206
language     1220
art           533
Name: count, dtype: int64

Rows with any AP data (TEST_ID not null): 43542
Unique students (EmployeeID): 25926


In [30]:
# aggregate AP features per student
ap_agg = (
    sheet2.groupby("EmployeeID")
    .agg(
        AP_num_tests=("TEST_COMPONENT", "count"),                 # total AP rows
        AP_unique_codes=("AP_Code", lambda s: s.nunique(dropna=True)),
        AP_any_credit=("EARN_CREDIT", lambda s: int((s == "Y").any())),
        AP_total_transfer_credits=("UNT_TRNSFR", "sum"),
        AP_avg_score=("SCORE", "mean"),
        AP_max_score=("SCORE", "max"),
    )
    .reset_index()
)

#
for c in ["AP_num_tests", "AP_unique_codes", "AP_total_transfer_credits", "AP_any_credit"]:
    ap_agg[c] = ap_agg[c].fillna(0)
ap_agg["AP_any_credit"] = ap_agg["AP_any_credit"].astype(int)

# Quick sanity checks
print("Shape:", ap_agg.shape)
print("Sample columns:", ap_agg.columns.tolist())
print("\nSummary:")
print(ap_agg.describe().T.head(10))

#ap_agg.head(5)


Shape: (25926, 7)
Sample columns: ['EmployeeID', 'AP_num_tests', 'AP_unique_codes', 'AP_any_credit', 'AP_total_transfer_credits', 'AP_avg_score', 'AP_max_score']

Summary:
                             count      mean       std  min  25%       50%  \
AP_num_tests               25926.0  1.679472  2.606752  0.0  0.0  0.000000   
AP_unique_codes            25926.0  1.673995  2.547906  0.0  0.0  0.000000   
AP_any_credit              25926.0  0.433156  0.495521  0.0  0.0  0.000000   
AP_total_transfer_credits  25926.0  4.211872  6.936002  0.0  0.0  0.000000   
AP_avg_score               11230.0  3.497803  0.845444  1.0  3.0  3.571429   
AP_max_score               11230.0  4.081923  0.942020  1.0  4.0  4.000000   

                           75%    max  
AP_num_tests               3.0   80.0  
AP_unique_codes            3.0   18.0  
AP_any_credit              1.0    1.0  
AP_total_transfer_credits  7.0  208.0  
AP_avg_score               4.0    5.0  
AP_max_score               5.0    5.0  


In [31]:
# count AP tests by subject category (math, stem, etc.)

cat_counts = (
    sheet2.dropna(subset=["AP_SubjectCategory"])
          .pivot_table(
              index="EmployeeID",
              columns="AP_SubjectCategory",
              values="AP_Code",
              aggfunc="count",
              fill_value=0
          )
          .add_prefix("AP_ct_")        # e.g. AP_ct_math
          .reset_index()
)

# Merge with the main aggregated table
ap_agg = ap_agg.merge(cat_counts, on="EmployeeID", how="left").fillna(0)

print("Shape after adding categories:", ap_agg.shape)
print("New columns (sample):", [c for c in ap_agg.columns if c.startswith('AP_ct_')][:10])
#ap_agg.head(5)


Shape after adding categories: (25926, 14)
New columns (sample): ['AP_ct_art', 'AP_ct_computer', 'AP_ct_english', 'AP_ct_language', 'AP_ct_math', 'AP_ct_social', 'AP_ct_stem']


In [32]:
# credit-earned counts and derived ratios

# Credit-earned counts by category
sheet2["credit_flag"] = (sheet2["EARN_CREDIT"] == "Y").astype(int)

cat_credit_counts = (
    sheet2.dropna(subset=["AP_SubjectCategory"])
          .pivot_table(
              index="EmployeeID",
              columns="AP_SubjectCategory",
              values="credit_flag",
              aggfunc="sum",
              fill_value=0
          )
          .add_prefix("AP_credit_ct_")
          .reset_index()
)

# merge into our main table
ap_agg = ap_agg.merge(cat_credit_counts, on="EmployeeID", how="left").fillna(0)

# Derived STEM summary features
stem_like = ["stem", "math", "computer"]
for col in [f"AP_ct_{c}" for c in stem_like]:
    if col not in ap_agg.columns:
        ap_agg[col] = 0

ap_agg["AP_ct_STEM_like"] = ap_agg[[f"AP_ct_{c}" for c in stem_like]].sum(axis=1)
ap_agg["AP_STEM_ratio"] = np.where(
    ap_agg["AP_num_tests"] > 0,
    ap_agg["AP_ct_STEM_like"] / ap_agg["AP_num_tests"],
    0
)

print("Shape after adding credit counts + ratios:", ap_agg.shape)
print("New credit columns (sample):", [c for c in ap_agg.columns if c.startswith('AP_credit_ct_')][:10])
#print(ap_agg[["EmployeeID","AP_ct_STEM_like","AP_STEM_ratio"]].head(5))
print(ap_agg[["AP_ct_STEM_like","AP_STEM_ratio"]].head(5))

Shape after adding credit counts + ratios: (25926, 23)
New credit columns (sample): ['AP_credit_ct_art', 'AP_credit_ct_computer', 'AP_credit_ct_english', 'AP_credit_ct_language', 'AP_credit_ct_math', 'AP_credit_ct_social', 'AP_credit_ct_stem']
   AP_ct_STEM_like  AP_STEM_ratio
0              0.0            0.0
1              0.0            0.0
2              0.0            0.0
3              0.0            0.0
4              0.0            0.0


In [33]:
# Save
BASE = Path("/content/drive/MyDrive/CP_UMBC/ Feature Engineering/AP_Features")
BASE.mkdir(parents=True, exist_ok=True)

csv_path = BASE / "ap_student_agg_without_DL.csv"
parquet_path = BASE / "ap_student_agg_without_DL.parquet"

ap_agg.to_csv(csv_path, index=False)
ap_agg.to_parquet(parquet_path, index=False)

print("\n Saved aggregated AP features:")
print("CSV    :", csv_path)
print("PARQUET:", parquet_path)
print("Shape  :", ap_agg.shape)


 Saved aggregated AP features:
CSV    : /content/drive/MyDrive/CP_UMBC/ Feature Engineering/AP_Features/ap_student_agg_without_DL.csv
PARQUET: /content/drive/MyDrive/CP_UMBC/ Feature Engineering/AP_Features/ap_student_agg_without_DL.parquet
Shape  : (25926, 23)
