In [1]:

from google.colab import drive
# Attach Google Drive to the Colab runtime at /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
from pathlib import Path
# Raise pandas' display limits so frames are printed with up to 25990 rows
# and 200 columns before being truncated.
pd.options.display.max_rows = 25990
pd.options.display.max_columns = 200

In [3]:

# Folder on the mounted Drive that holds the raw input workbook.
BASE = Path("/content/drive/MyDrive/CP_UMBC/Raw_Data")
FILE = BASE.joinpath("RT2733808 Fin aid stdnt Success analysis data Sp2025.xlsx")

# Open the workbook and list its sheet names before reading any sheet.
xls = pd.ExcelFile(FILE)
print("Available sheets:", xls.sheet_names)


Available sheets: ['FA Stdnt Success ', 'AP Courses']


In [4]:
# Load the first sheet of the workbook only; no other sheet is read here.
# Parse from the already-open ExcelFile (`xls`) so the workbook is not
# opened a second time from disk.
sheet1 = pd.read_excel(xls, sheet_name=xls.sheet_names[0])

In [5]:
# Display the column index of the loaded sheet (pandas abbreviates the repr
# of a long index with "...").
sheet1.columns

Index(['StudentKey', 'EmployeeID', 'MatricTermdescription',
       'MatricAcademicYear', 'MatricStatusOfficialDescr', 'MatricGenderIPEDS',
       'MatricIPEDSEthnicity', 'Zipcode', 'MatricResidence', 'Graduated',
       ...
       'HS_PecentileDesc', 'HasAlgScore', 'AlgSCORE', 'HasCalScore',
       'CalScore', 'HasALEKSscore', 'ALEKSScore', 'HasEngScore', 'EngSCORE',
       'AP_CRDS'],
      dtype='object', length=131)

In [6]:
# Print a structural summary of the frame: index range, column count,
# dtype counts and memory usage.
sheet1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25960 entries, 0 to 25959
Columns: 131 entries, StudentKey to AP_CRDS
dtypes: float64(54), int64(18), object(59)
memory usage: 25.9+ MB


In [7]:
import numpy as np
# Rebind sheet1 to an independent deep copy before derived columns are added.
sheet1 = sheet1.copy(deep=True)

In [8]:
# Build the per-year column-name lists (years 1 through 6) for the three
# measures used below: need, grant amount and merit amount.
need_cols, grant_cols, merit_cols = (
    [f"Y{year}{suffix}" for year in range(1, 7)]
    for suffix in ("Need", "GrantAmount", "MeritAmount")
)

# Total support: row-wise sum of every grant and merit amount (Y1-Y6).
# Missing amounts are skipped by the sum (pandas' default), so a row with no
# recorded amounts totals 0.
support_amount_cols = [*grant_cols, *merit_cols]
sheet1["TotalSupport"] = sheet1[support_amount_cols].sum(axis=1)

# Supported flag: "Yes" when the total is strictly positive, otherwise "No".
sheet1["Supported"] = np.where(sheet1["TotalSupport"].gt(0), "Yes", "No")

#  NeedStatus (Yes/No if any year shows real financial need)
# A yearly need cell counts as "need" only when it holds text that, once
# trimmed and lower-cased, is not one of the placeholders below. Non-string
# cells (e.g. NaN or numbers) never count.
# NOTE(review): every other text value is treated as need, including a label
# such as "No Need" if the data contains one. Confirm against the actual
# values of the Y1Need..Y6Need columns.
_NO_NEED_MARKERS = frozenset({"no fafsa", "unknown", "none", ""})


def _shows_need(val):
    """Return True when ``val`` is a string other than a no-need placeholder."""
    return isinstance(val, str) and val.strip().lower() not in _NO_NEED_MARKERS


# Test each need column with Series.map and combine the results across columns.
# This replaces a Python lambda run per row via apply(axis=1), which
# materialises one Series per row; the flags produced are the same.
_has_need = sheet1[need_cols].apply(lambda col: col.map(_shows_need)).any(axis=1)
sheet1["NeedStatus"] = np.where(_has_need, "Yes", "No")

#   Bucket TotalSupport into ranges
# pd.cut builds right-closed intervals by default, so each bucket includes its
# upper edge: a total of exactly 5000 falls in "<5K", exactly 10000 in
# "5K-10K", and so on.
# The lowest edge is -inf so that every non-positive total lands in "NoSup",
# matching Supported == "No". A finite lower edge would leave totals below it
# unbinned (NaN) and silently drop them from any group-by on SupportBin.
bins = [-np.inf, 0, 5000, 10000, 15000, 20000, np.inf]
labels = ["NoSup", "<5K", "5K-10K", "10K-15K", "15K-20K", ">20K"]
sheet1["SupportBin"] = pd.cut(sheet1["TotalSupport"], bins=bins, labels=labels)

# Preview the derived columns next to the record identifier for the first 10 rows.
print(sheet1[["EmployeeID", "TotalSupport", "Supported", "NeedStatus", "SupportBin"]].head(10))


   EmployeeID  TotalSupport Supported NeedStatus SupportBin
0  2000058476           0.0        No        Yes      NoSup
1  2000067360           0.0        No         No      NoSup
2  2000077103        5000.0       Yes         No        <5K
3  2000107627           0.0        No         No      NoSup
4  2000037534       39711.0       Yes        Yes       >20K
5  2000037570       21015.0       Yes        Yes       >20K
6  2000037804           0.0        No        Yes      NoSup
7  2000059165           0.0        No        Yes      NoSup
8  1000011230           0.0        No        Yes      NoSup
9  1000011462        1500.0       Yes        Yes        <5K


In [9]:
# Per support bucket: number of rows, number of rows with Graduated == "Yes",
# and the share of such rows.
# SupportBin is categorical, so `observed` is passed explicitly. observed=False
# keeps every bucket in the result (even one with no rows), pins the current
# behaviour across pandas versions, and silences the FutureWarning raised when
# the argument is omitted.
support_bin_stats = (
    sheet1.groupby("SupportBin", observed=False)
    .agg(
        Count=("SupportBin", "size"),
        Graduated_Count=("Graduated", lambda x: (x == "Yes").sum()),
        Graduation_Rate=("Graduated", lambda x: (x == "Yes").mean()),
    )
    .sort_index()
)

print(support_bin_stats)


            Count  Graduated_Count  Graduation_Rate
SupportBin                                         
NoSup        8485             4237         0.499352
<5K          4106             1164         0.283488
5K-10K       2385              781         0.327463
10K-15K      2118              874         0.412653
15K-20K      1538              939         0.610533
>20K         7328             4860         0.663210


  sheet1.groupby("SupportBin")


In [11]:
from pathlib import Path

# Output folder for the enriched table.
# NOTE(review): BASE is rebound here from the raw-data folder used when the
# workbook was loaded. Re-running the load cell after this one would look for
# the workbook in this folder.
# NOTE(review): " Feature Engineering" starts with a space. Confirm this
# matches the real folder name on Drive.
BASE = Path("/content/drive/MyDrive/CP_UMBC/ Feature Engineering/Aid_Features")

# Create the folder (and any missing parents) so the writes below do not fail
# when it does not exist yet; this is a no-op when it already exists.
BASE.mkdir(parents=True, exist_ok=True)

csv_path = BASE / "aid_second.csv"
parquet_path = BASE / "aid_second.parquet"

# Write the same frame in both formats, without the index.
sheet1.to_csv(csv_path, index=False)
sheet1.to_parquet(parquet_path, index=False)

print(" Saved enriched sheet1:")
print("CSV    :", csv_path)
print("PARQUET:", parquet_path)
print("Shape  :", sheet1.shape)


 Saved enriched sheet1:
CSV    : /content/drive/MyDrive/CP_UMBC/ Feature Engineering/Aid_Features/aid_second.csv
PARQUET: /content/drive/MyDrive/CP_UMBC/ Feature Engineering/Aid_Features/aid_second.parquet
Shape  : (25960, 135)
