In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Load the formatted records; low_memory=False makes pandas parse the file in
# a single pass so column dtypes are inferred from the whole column at once.
init_dta = pd.read_csv(
    "formatted_data.csv",
    low_memory=False,
)

In [3]:
### Code from Varma to convert SATs
# Columns holding test scores / GPA that must be numeric for the steps below
score_columns = ['ACTE', 'ACTM', 'ACTR', 'ACTS', 'EACT', 'SAT-ERW', 'SATM', 'SAT_TOTAL', 'HS GPA']

# Coerce every score column to a numeric dtype in one column-wise pass;
# entries that cannot be parsed become NaN (errors='coerce').
init_dta[score_columns] = init_dta[score_columns].apply(pd.to_numeric, errors='coerce')

# Lookup table: ACT composite score -> equivalent SAT total
act_to_sat_conversion = {
    36: 1590, 35: 1540, 34: 1500, 33: 1460,
    32: 1430, 31: 1400, 30: 1370, 29: 1340,
    28: 1310, 27: 1280, 26: 1240, 25: 1210,
    24: 1180, 23: 1140, 22: 1110, 21: 1080,
    20: 1040, 19: 1010, 18: 970, 17: 930,
    16: 890, 15: 850, 14: 800, 13: 760,
    12: 710, 11: 670, 10: 630, 9: 590,
}


def convert_act_to_sat(eact_score):
    """Translate an ACT composite score into its SAT-total equivalent.

    Parameters
    ----------
    eact_score : scalar
        ACT composite score; may be missing (NaN/None).

    Returns
    -------
    int or float
        The SAT total from ``act_to_sat_conversion`` for the score truncated
        to an integer, or ``np.nan`` when the score is missing or has no
        entry in the table.
    """
    if not pd.isna(eact_score):
        # int() truncates, so e.g. 25.0 (float after to_numeric) matches key 25
        return act_to_sat_conversion.get(int(eact_score), np.nan)
    return np.nan

# Seed the combined score column with the reported SAT total
init_dta['Converted_SAT'] = init_dta['SAT_TOTAL']

# Rows that have an ACT composite (EACT) but no SAT total are the ones to back-fill
mask = init_dta['EACT'].notna() & init_dta['SAT_TOTAL'].isna()

# Back-fill those rows with the SAT equivalent of the ACT composite
init_dta.loc[mask, 'Converted_SAT'] = init_dta.loc[mask, 'EACT'].apply(convert_act_to_sat)

# Handle remaining missing values:
# keep only rows that have both a Converted_SAT score and an HS GPA
datawithsat = init_dta.dropna(axis=0, how='any', subset=['Converted_SAT', 'HS GPA'])

In [4]:
# Preview the first rows that remain after dropping records without Converted_SAT / HS GPA
datawithsat.head()

Unnamed: 0,Pidm,Semester,Admit_Level,Admit_College,Lastest_Major,Trump_Race,Trump_Race_Desc,MULTI,Race,NEW_ETHNICITY,...,SAT-ERW,SATM,SAT_TOTAL,Final GPA,Semester GPA,Semester Grades,Semester Points,Classes,CRN,Converted_SAT
4,794,201508,UG,SA,Pre-Nursing,W,White,,W White,Not Hispanic or Latino,...,,,,3.98,4.0,"['A+', 'A+', 'A+']","[4.0, 4.0, 4.0]","['NUR3805', 'NUR3078', 'NUR4895']","[83267.0, 89943.0, 94441.0]",1210.0
5,794,201601,UG,SA,Pre-Nursing,W,White,,W White,Not Hispanic or Latino,...,,,,3.98,4.0,"['A+', 'A+']","[4.0, 4.0]","['NUR4828C', 'NUR4128']","[22277.0, 22671.0]",1210.0
6,794,201605,UG,SA,Pre-Nursing,W,White,,W White,Not Hispanic or Latino,...,,,,3.98,4.0,"['A+', 'A+']","[4.0, 4.0]","['NUR4169C', 'NUR4795']","[55333.0, 56745.0]",1210.0
7,794,201608,UG,SA,Pre-Nursing,W,White,,W White,Not Hispanic or Latino,...,,,,3.98,4.0,"['A+', 'A+']","[4.0, 4.0]","['NUR4069', 'NSP4614']","[89488.0, 94546.0]",1210.0
8,794,201701,UG,SA,Pre-Nursing,W,White,,W White,Not Hispanic or Latino,...,,,,3.98,4.0,"['A+', 'A+']","[4.0, 4.0]","['NUR4634C', 'NSP4886']","[22741.0, 24245.0]",1210.0


In [5]:
# Take an explicit copy of the modelling columns so the assignments below
# modify an independent frame (no SettingWithCopyWarning, and datawithsat
# is left untouched).
features_df = datawithsat[['Pidm', 'Semester', 'HS GPA', 'Converted_SAT', 'Semester Points', 'Classes']].copy()

# The list-valued columns arrive from the CSV as strings such as
# "['NUR3805', 'NUR3078']"; parse them into real Python lists.
features_df['Classes'] = features_df['Classes'].apply(ast.literal_eval)
features_df['Semester Points'] = features_df['Semester Points'].apply(ast.literal_eval)
features_df.head()

Unnamed: 0,Pidm,Semester,HS GPA,Converted_SAT,Semester Points,Classes
4,794,201508,3.3,1210.0,"[4.0, 4.0, 4.0]","[NUR3805, NUR3078, NUR4895]"
5,794,201601,3.3,1210.0,"[4.0, 4.0]","[NUR4828C, NUR4128]"
6,794,201605,3.3,1210.0,"[4.0, 4.0]","[NUR4169C, NUR4795]"
7,794,201608,3.3,1210.0,"[4.0, 4.0]","[NUR4069, NSP4614]"
8,794,201701,3.3,1210.0,"[4.0, 4.0]","[NUR4634C, NSP4886]"


In [6]:
# #Lets try CHM2210

# features_df.loc[:, 'Classes'] = features_df['Classes'].apply(ast.literal_eval)
# features_df.loc[:, 'Semester Points'] = features_df['Semester Points'].apply(ast.literal_eval)

# when_taken = features_df[features_df['Classes'].apply(lambda x: 'CHM2210' in x)]
# when_taken = [(x[0], x[1]) for x in zip(when_taken['Pidm'], when_taken['Semester'])]
# when_taken

In [7]:
# prev_classes = []
# prev_grades = []
# for Pidm, Semester in when_taken:
#     all_classes = features_df[(features_df['Pidm'] == Pidm) & (features_df['Semester'] < Semester)]
#     prev_classes.append(sum(all_classes['Classes'], []))
#     prev_grades.append(sum(all_classes['Semester Points'], []))

In [8]:
# unique_classes = sorted(set(sum(prev_classes, [])))
# class_to_index = {classes: idx for idx, classes in enumerate(unique_classes)}




# encoded_rows = [vector_to_custom_hot(vector, values, unique_classes) for vector, values in zip(prev_classes, prev_grades)]

# df = pd.DataFrame(encoded_rows, columns=unique_classes)




In [9]:
# df

In [10]:
# df.to_csv('test.csv')

In [11]:
def vector_to_custom_hot(vector, values, unique_classes, indices):
    """Build a grade-weighted "n-hot" vector for one student.

    Parameters
    ----------
    vector : iterable
        Class identifiers the student took.
    values : iterable
        Grade points, paired position-by-position with ``vector``.
    unique_classes : sequence
        All encodable classes; only its length is used, to size the output.
    indices : mapping
        Maps each class identifier to its slot in the output vector.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(unique_classes)`` holding the grade
        points in the slot of each class taken and 0.0 elsewhere. If a class
        appears more than once, the last paired value wins.

    Raises
    ------
    KeyError
        If a class in ``vector`` has no entry in ``indices``.
    """
    custom_hot_vector = np.zeros(len(unique_classes), dtype=float)
    for class_name, grade_points in zip(vector, values):
        custom_hot_vector[indices[class_name]] = grade_points
    return custom_hot_vector


def build_class_vect(course, features_df, class_bound):
    """Encode the coursework taken before ``course`` as grade-point vectors.

    For every row of ``features_df`` whose ``Classes`` list contains
    ``course``, the classes and grade points from that student's rows with a
    strictly earlier ``Semester`` are collected. Classes that occur fewer than
    ``class_bound`` times across all collected histories are discarded, and
    each remaining history is encoded with ``vector_to_custom_hot``.

    Parameters
    ----------
    course : str
        Course identifier to look for in the ``Classes`` lists.
    features_df : pandas.DataFrame
        Must provide the columns ``Pidm``, ``Semester``, ``Classes`` (list of
        class identifiers) and ``Semester Points`` (list of grade points
        paired with ``Classes``).
    class_bound : int
        Minimum number of occurrences a prior class needs in order to be kept
        as a feature. Occurrences are counted per listing, so a class repeated
        by the same student counts more than once.

    Returns
    -------
    encoded_rows : list of numpy.ndarray
        One vector per (Pidm, Semester) in which ``course`` was taken, in
        ``features_df`` row order. A student with no retained prior classes
        gets an all-zero vector.
    unique_classes : list
        Sorted class identifiers naming the vector positions.
    """
    # Find all Pidms and Semesters where the course was taken.
    # astype(bool) keeps the mask boolean even when features_df has no rows.
    took_course = features_df['Classes'].apply(lambda classes: course in classes).astype(bool)
    when_taken = list(zip(features_df.loc[took_course, 'Pidm'],
                          features_df.loc[took_course, 'Semester']))

    # Collect the classes (and their grade points) from strictly earlier semesters.
    # Flattening with a comprehension avoids the quadratic cost of sum(lists, []).
    prev_classes = []
    prev_grades = []
    for pidm, semester in when_taken:
        earlier = features_df[(features_df['Pidm'] == pidm) & (features_df['Semester'] < semester)]
        prev_classes.append([name for row in earlier['Classes'] for name in row])
        prev_grades.append([points for row in earlier['Semester Points'] for points in row])

    # Count how often each prior class occurs across all collected histories.
    students_taken = {}
    for history in prev_classes:
        for class_name in history:
            students_taken[class_name] = students_taken.get(class_name, 0) + 1

    # Keep only classes that meet class_bound. Building the two lists from the
    # kept pairs (rather than unpacking zip(*pairs)) also works when every
    # class of a student is filtered out, leaving two empty lists.
    for i, (history, grades) in enumerate(zip(prev_classes, prev_grades)):
        kept = [
            (class_name, points)
            for class_name, points in zip(history, grades)
            if students_taken[class_name] >= class_bound
        ]
        prev_classes[i] = [class_name for class_name, _ in kept]
        prev_grades[i] = [points for _, points in kept]

    # Find all unique retained classes and assign each a vector position.
    unique_classes = sorted({class_name for history in prev_classes for class_name in history})
    class_to_index = {class_name: idx for idx, class_name in enumerate(unique_classes)}

    # Create the n-hot encoding per student, where n = grade points of each class.
    encoded_rows = [
        vector_to_custom_hot(history, grades, unique_classes, class_to_index)
        for history, grades in zip(prev_classes, prev_grades)
    ]
    return encoded_rows, unique_classes

In [None]:
# Encode the coursework taken before CHM2210, keeping only prior classes that
# occur at least 50 times, then save one row per student/semester to CSV.
encoded_vect, unique_classes = build_class_vect(
    course='CHM2210',
    features_df=features_df,
    class_bound=50,
)
df = pd.DataFrame(data=encoded_vect, columns=unique_classes)
df.to_csv('test2.csv')