In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [2]:
# globals
seed = 42

In [None]:
# instantiate resources json
with open("./resources/student_info.json", "r") as file:
    resources = json.load(file)

# instantiate list of categorical features
categorical = resources['categorical']

# instantiate list of numerical features
numerical = resources['numerical']

# instantiate list of boolean features
booleans = resources['boolean'].astype('bool')

# instantiate list of full feature set
features = categorical + booleans + numerical

In [4]:
# instantiate categorical and numerical dataframes
df = pd.read_csv('./data/hsls_17_student_pets_sr_v1_0.csv', usecols=features)
df_categorical = df[categorical]
df_numerical = df[numerical]
df_boolean = df[booleans]

In [5]:
# feature descriptions
print('Feature Descriptions:\n')
for feature, description in resources['descriptions'].items():
    print(f"{feature}: {description}")

Feature Descriptions:

X1TXMTSCOR: X1 Mathematics standardized theta score
X2TXMTSCOR: X2 Mathematics standardized theta score
X1PAREDU: X1 Parents'/guardians' highest level of education
X2PAREDU: X2 Parents'/guardians' highest level of education
X1MTHINT: X1 Scale of student's interest in fall 2009 math course
X2MTHINT_R: X2 Scale of student's interest in fall 2009 math course (REVISED)
X1SCIINT: X1 Scale of student's interest in fall 2009 science course
X2SCIINT: X2 Scale of student's interest in fall 2009 science course
X2BEHAVEIN: X2 Scale of school motivation
X1SES: X1 Socio-economic status composite
X4X2SES: X4 Revised X2 Socio-economic status composite
X3TGPAHIMTH: X3 GPA - highest level mathematics course taken
X3TGPAHISCI: X3 GPA - highest level science course taken
X3TGPASTEM: X3 GPA for STEM courses
X3TGPAACAD: X3 GPA for academic courses
X5GPAALL: Postsecondary Transcript: GPA at all known institutions attended
A1TCHRABSENT: A1 E17D Teacher absenteeism is a problem at this 

In [6]:
# feature questions (where applicable)
print('Survey Questions:\n')
for feature, question in resources['questions'].items():
    print(f"{feature}: \n{question}\n")

Survey Questions:

X2PAREDU: 
1 = "Less than high school"
2 = "High school diploma or GED or alterntive HS credential"
3 = "Certificate/diploma from school providing occupational training"
4 = "Associate's degree"
5 = "Bachelor's degree"
6 = "Master's degree"
7 = "Ph.D/M.D/Law/other high lvl prof degree"
-8 = "Unit non-response"

X1PAREDU: 
1 = "Less than high school"
2 = "High school diploma or GED"
3 = "Associate's degree"
4 = "Bachelor's degree"
5 = "Master's degree"
7 = "Ph.D/M.D/Law/other high lvl prof degree"
-8 = "Unit non-response"
-9 = "Missing"

S2LATESCH: 
1 = "Never"
2 = "1-2 times"
3 = "3-6 times"
4 = "7-9 times"
5 = "10 or more times"
-8 = "Unit non-response"
-9 = "Missing"

S2ABSENT: 
1 = "Never"
2 = "1-2 times"
3 = "3-6 times"
4 = "7-9 times"
5 = "10 or more times"
-8 = "Unit non-response"
-9 = "Missing"

S2WOHWDN: 
1 = "Never"
2 = "1-2 times"
3 = "3-6 times"
4 = "7-9 times"
5 = "10 or more times"
-8 = "Unit non-response"
-9 = "Missing"

S2WOPAPER: 
1 = "Never"
2 = "1-2

In [7]:
# validation: number of values less than 0.0 in the X5GPAALL categorical column (post-secondary GPA)
print((df_categorical['X5GPAALL'] < 0.0).sum())

10915


In [8]:
# define target feature GPA range categories

# Convert the 'X5GPAALL' column to numeric, coercing errors to NaN
df_categorical.loc[:, 'X5GPAALL'] = pd.to_numeric(df_categorical['X5GPAALL'], errors='coerce')

# Define the bins (edges of the intervals)
bins = [0.0, 1.9, 2.9, 3.4, 4.0]

# Define the labels for the corresponding bins
labels = ['failing', 'average', 'good', 'excellent']

# Use pd.cut to categorize the data, setting values less than 0.0 to NaN beforehand
df_categorical.loc[:, 'X5GPAALL'] = pd.cut(
    df_categorical.loc[:, 'X5GPAALL'].where(df_categorical['X5GPAALL'] >= 0.0),
    bins=bins,
    labels=labels,
    right=True,  # Intervals are (bin_left, bin_right], adjust as needed
    include_lowest=True  # Include the lowest bin edge (0.0 in this case)
)

# Validate results
print(f'{df_categorical["X5GPAALL"].value_counts()}\n')
print(f'NaN X5GPAALL values:\n {df_categorical["X5GPAALL"].isnull().sum()}')

X5GPAALL
average      4107
good         3164
excellent    2899
failing      2418
Name: count, dtype: int64

NaN X5GPAALL values:
 10915


Length: 23503
Categories (4, object): ['failing' < 'average' < 'good' < 'excellent']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_categorical.loc[:, 'X5GPAALL'] = pd.cut(
