In [1]:
import pandas as pd
import numpy as np

# Input files: the per-course student records and the grade-code lookup table.
STUDENT_RECORDS_CSV = "ML - Curricular Analytics - PIDM ONLY & Fixed Repeat IND.csv"
GRADE_LOOKUP_CSV = "parsed_grades.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
student_data = pd.read_csv(STUDENT_RECORDS_CSV, low_memory=False)
grades = pd.read_csv(GRADE_LOOKUP_CSV)

In [2]:
# Add a numeric GPA column

# Bring in each grade's Quality Points and whether it counts toward GPA.
# validate='many_to_one' makes pandas raise a MergeError if a grade code appears
# more than once in the lookup table; without it, a duplicated code would silently
# duplicate course rows and distort every downstream GPA and class list.
student_data = pd.merge(
    student_data,
    grades[['Code', 'Quality Points', 'Count in GPA?']],
    left_on='FINAL_GRADE',
    right_on='Code',
    how='left',
    suffixes=('', '_grades'),
    validate='many_to_one',
)

# Grades with no match in the lookup table get 0.0 quality points ...
student_data['Quality Points'] = student_data['Quality Points'].fillna(0.0)
# ... and are flagged as not counting toward GPA (a missing flag never equals 'Y').
student_data['Count_in_GPA'] = student_data['Count in GPA?'] == 'Y'

student_data.head()

Unnamed: 0,Pidm,Admit_Code,Admit_Desc,Admit_Term,Admit_Level,Admit_College,Admit_Major_Code,Major_Desc,Trump_Race,Trump_Race_Desc,...,Term,CRN,SUBJ,CRSE_NUMB,REPEAT_IND,FINAL_GRADE,Code,Quality Points,Count in GPA?,Count_in_GPA
0,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,202101.0,10803.0,LIS,3361,,A,A,4.0,Y,True
1,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,202101.0,16614.0,LIS,3353,,A-,A-,3.67,Y,True
2,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,202101.0,24972.0,COP,2030,,B+,B+,3.33,Y,True
3,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,202101.0,26205.0,CIS,4510,,A,A,4.0,Y,True
4,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,202105.0,54564.0,LIS,4365,,W,,0.0,,False


In [3]:
# Final and per-semester GPAs (unweighted: all classes are assumed to carry equal credits)

# Only grades flagged as counting toward GPA contribute; every other row becomes NaN,
# which mean() skips.
counts_toward_gpa = student_data['Count in GPA?'] == 'Y'
student_data['Valid_Grades'] = student_data['Quality Points'].where(counts_toward_gpa)

# Overall GPA: one mean per student, broadcast back onto each of that student's rows.
student_final_gpa = student_data.groupby('Pidm')['Valid_Grades'].mean().reset_index()
student_data = student_data.merge(
    student_final_gpa.rename(columns={'Valid_Grades': 'Final GPA'}),
    on='Pidm',
    how='left',
)

# Per-term GPA: same idea, keyed on (student, term).
student_semester_gpa = student_data.groupby(['Pidm', 'Term'])['Valid_Grades'].mean().reset_index()
student_data = student_data.merge(
    student_semester_gpa.rename(columns={'Valid_Grades': 'Semester GPA'}),
    on=['Pidm', 'Term'],
    how='left',
)

# The lookup/helper columns are no longer needed once both GPAs are attached.
helper_columns = ['Code', 'Count in GPA?', 'Count_in_GPA', 'Valid_Grades']
student_data = student_data.drop(columns=helper_columns)

student_data.head()

Unnamed: 0,Pidm,Admit_Code,Admit_Desc,Admit_Term,Admit_Level,Admit_College,Admit_Major_Code,Major_Desc,Trump_Race,Trump_Race_Desc,...,SAT_TOTAL,Term,CRN,SUBJ,CRSE_NUMB,REPEAT_IND,FINAL_GRADE,Quality Points,Final GPA,Semester GPA
0,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,202101.0,10803.0,LIS,3361,,A,4.0,3.75,3.75
1,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,202101.0,16614.0,LIS,3353,,A-,3.67,3.75,3.75
2,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,202101.0,24972.0,COP,2030,,B+,3.33,3.75,3.75
3,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,202101.0,26205.0,CIS,4510,,A,4.0,3.75,3.75
4,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,202105.0,54564.0,LIS,4365,,W,0.0,3.75,


In [4]:
# Per-semester lists of each student's classes, letter grades and quality points

# Class identifier = subject prefix followed by the course number (SUBJ + CRSE_NUMB).
course_codes = student_data['SUBJ'].add(student_data['CRSE_NUMB'])
student_data['class'] = course_codes.astype(str)

# One row per (student, term); each listed column is collected into a Python list.
listed_columns = ['FINAL_GRADE', 'Quality Points', 'class']
semester_classes = (
    student_data
    .groupby(['Pidm', 'Term'])
    .agg(dict.fromkeys(listed_columns, list))
    .reset_index()
)

semester_classes.head()

Unnamed: 0,Pidm,Term,FINAL_GRADE,Quality Points,class
0,285,202101.0,"[A, A-, B+, A]","[4.0, 3.67, 3.33, 4.0]","[LIS3361, LIS3353, COP2030, CIS4510]"
1,285,202105.0,[W],[0.0],[LIS4365]
2,432,201908.0,"[IF, F, F]","[0.0, 0.0, 0.0]","[MUS4930, CCJ3117, IDS4934]"
3,705,202208.0,"[A, A+, A, A, A, A]","[4.0, 4.0, 4.0, 4.0, 4.0, 4.0]","[MCB2000, MCB2000L, HUN2201, BSC2085, BSC2085L..."
4,794,201508.0,"[A+, A+, A+]","[4.0, 4.0, 4.0]","[NUR3805, NUR3078, NUR4895]"


In [5]:
# Remove the per-course columns from the main frame before it is collapsed further.
per_course_columns = ['CRN', 'SUBJ', 'CRSE_NUMB', 'REPEAT_IND', 'FINAL_GRADE', 'class']
student_data = student_data.drop(columns=per_course_columns)
student_data.shape

(2677409, 31)

In [6]:
# Preview the first rows of the student frame at this stage of the pipeline.
student_data.head()

Unnamed: 0,Pidm,Admit_Code,Admit_Desc,Admit_Term,Admit_Level,Admit_College,Admit_Major_Code,Major_Desc,Trump_Race,Trump_Race_Desc,...,ACTR,ACTS,EACT,SAT-ERW,SATM,SAT_TOTAL,Term,Quality Points,Final GPA,Semester GPA
0,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,-,-,-,-,-,202101.0,4.0,3.75,3.75
1,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,-,-,-,-,-,202101.0,3.67,3.75,3.75
2,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,-,-,-,-,-,202101.0,3.33,3.75,3.75
3,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,-,-,-,-,-,202101.0,4.0,3.75,3.75
4,285,FS,Former USF Student Returning,202101,UG,SA,BGS,General Studies,W,White,...,-,-,-,-,-,-,202105.0,0.0,3.75,


In [7]:
# Collapse the frame to one row per (student, term).
# Each retained column takes the group's first non-null value ('first' aggregation).
first_value_columns = [
    'Admit_Code',
    'Admit_Level',
    'Admit_College',
    'Admit_Major_Code',
    'Major_Desc',
    'Trump_Race',
    'Trump_Race_Desc',
    'MULTI',
    'Race',
    'NEW_ETHNICITY',
    'GENDER_Code',
    'GENDER',
    'CITZ_IND',
    'CITZ_CODE',
    'CITZ_DESC',
    'Final_GPA',
    'ACTE',
    'ACTM',
    'ACTR',
    'ACTS',
    'EACT',
    'SAT-ERW',
    'SATM',
    'SAT_TOTAL',
    'Final GPA',
    'Semester GPA',
]
student_data = (
    student_data
    .groupby(['Pidm', 'Term'])
    .agg(dict.fromkeys(first_value_columns, 'first'))
    .reset_index()
)

student_data.shape



(672159, 28)

In [8]:
# Attach the per-semester list columns, then switch to presentation-friendly names.
semester_lists = semester_classes[['Pidm', 'Term', 'FINAL_GRADE', 'Quality Points', 'class']]
display_names = {
    'Final_GPA': 'HS GPA',
    'Term': 'Semester',
    'FINAL_GRADE': 'Semester Grades',
    'Quality Points': 'Semester Points',
    'class': 'Classes',
}
student_data = (
    student_data
    .merge(semester_lists, on=['Pidm', 'Term'], how='left')
    .rename(columns=display_names)
)

student_data.head()

Unnamed: 0,Pidm,Semester,Admit_Code,Admit_Level,Admit_College,Admit_Major_Code,Major_Desc,Trump_Race,Trump_Race_Desc,MULTI,...,ACTS,EACT,SAT-ERW,SATM,SAT_TOTAL,Final GPA,Semester GPA,Semester Grades,Semester Points,Classes
0,285,202101.0,FS,UG,SA,BGS,General Studies,W,White,,...,-,-,-,-,-,3.75,3.75,"[A, A-, B+, A]","[4.0, 3.67, 3.33, 4.0]","[LIS3361, LIS3353, COP2030, CIS4510]"
1,285,202105.0,FS,UG,SA,BGS,General Studies,W,White,,...,-,-,-,-,-,3.75,,[W],[0.0],[LIS4365]
2,432,201908.0,FS,UG,SA,BGS,General Studies,B,Black or African American,Multi-Race,...,-,-,-,-,-,0.0,0.0,"[IF, F, F]","[0.0, 0.0, 0.0]","[MUS4930, CCJ3117, IDS4934]"
3,705,202208.0,SB,UG,NR,PNR,Pre-Nursing,W,White,,...,29,30,-,-,-,4.0,4.0,"[A, A+, A, A, A, A]","[4.0, 4.0, 4.0, 4.0, 4.0, 4.0]","[MCB2000, MCB2000L, HUN2201, BSC2085, BSC2085L..."
4,794,201508.0,SB,UG,SA,PNR,Pre-Nursing,W,White,,...,22,25,-,-,-,3.977241,4.0,"[A+, A+, A+]","[4.0, 4.0, 4.0]","[NUR3805, NUR3078, NUR4895]"


In [9]:
# Stringify every column, then restore proper types for the few columns that need them.
# NOTE(review): astype(str) appears to turn missing values into the literal text 'nan',
# which is then written to the CSV as-is — confirm downstream readers expect that.
student_data = student_data.astype(str).assign(**{
    'Pidm': lambda frame: frame['Pidm'].astype(int),
    'Final GPA': lambda frame: frame['Final GPA'].astype(float).round(2),
    'Semester GPA': lambda frame: frame['Semester GPA'].astype(float).round(2),
    # Go through float -> int -> str so the semester is stored as integer-formatted text.
    'Semester': lambda frame: frame['Semester'].astype(float).astype(int).astype(str),
})
student_data.to_csv("formatted_data.csv", index=False)

In [10]:
# Preview the per-(student, term) lists of grades, quality points and classes.
semester_classes.head()

Unnamed: 0,Pidm,Term,FINAL_GRADE,Quality Points,class
0,285,202101.0,"[A, A-, B+, A]","[4.0, 3.67, 3.33, 4.0]","[LIS3361, LIS3353, COP2030, CIS4510]"
1,285,202105.0,[W],[0.0],[LIS4365]
2,432,201908.0,"[IF, F, F]","[0.0, 0.0, 0.0]","[MUS4930, CCJ3117, IDS4934]"
3,705,202208.0,"[A, A+, A, A, A, A]","[4.0, 4.0, 4.0, 4.0, 4.0, 4.0]","[MCB2000, MCB2000L, HUN2201, BSC2085, BSC2085L..."
4,794,201508.0,"[A+, A+, A+]","[4.0, 4.0, 4.0]","[NUR3805, NUR3078, NUR4895]"
