In [10]:
import pandas as pd
import warnings

import numpy as np


#NOTE(review): this silences every warning for the rest of the session, including pandas
#warnings about chained assignment and deprecated behavior - consider narrowing the filter.
warnings.filterwarnings('ignore')
#show up to 65 columns and 140 rows when a DataFrame is displayed
pd.set_option('display.max_columns', 65)
pd.set_option('display.max_rows', 140)

In [11]:
#read in the raw dataset (the path is relative to the notebook's working directory)
grades = pd.read_csv('data_set_10_17.csv')

#a simple function to get info about the dataset
def gradestats(df):
    """Print row, student and graduate counts for a grades frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``STUDENT`` column and a ``DEG_CD`` column. A student
        counts as a graduate when at least one of their rows has a non-null
        ``DEG_CD``.

    Returns
    -------
    None
        The four figures (Rows, Total, Grads, Percent) are printed.
    """
    total = df["STUDENT"].nunique()
    # distinct students that have at least one row with a degree code
    grads = df.loc[df["DEG_CD"].notna(), "STUDENT"].nunique()
    # an empty frame has no students; report 0.0 instead of dividing by zero
    percent = np.round(grads / total * 100, 2) if total else 0.0
    # report the size of the frame that was passed in, not the notebook-global `grades`
    print('Rows:', len(df))
    print('Total:', total)
    print('Grads:', grads)
    print('Percent:', percent)

#baseline statistics for the dataset exactly as it was read in, before any filtering
gradestats(grades)

Rows: 30394
Total: 13065
Grads: 7622
Percent: 58.34


In [12]:
#Restrict the data to students whose initial enrollment year is between 2011 and 2020 inclusive.
#  - 2020 is the latest entry year that still allows for 8 semesters (4 yrs) of courses to be taken.
#  - Data from before 2011 may have issues, e.g. courses that are no longer offered, or people
#    who got a degree and returned years later.
entered_in_window = grades['ENTRY_CCYY'].between(2011, 2020)
grades = grades[entered_in_window]

print('After filtering for students who enrolled between 2011 and 2020')
gradestats(grades)

#Degree codes listed here are graduate degrees and get removed; all other degree codes are
#treated as undergraduate degrees and kept.
dropped_degrees = ['MRE', 'MCP', 'MED', 'MFA', 'MTR', 'MHC', 'MFS', 'MHO', 'MAA', 'DVM', 'MBS', 'MBA', 'MA', 'MEN', 'MFN', 'MAT', 'PHD', 'MS']

#Remove every row carrying a graduate degree code (rows without a degree code are kept).
is_graduate_degree = grades['DEG_CD'].isin(dropped_degrees)
grades = grades[~is_graduate_degree].copy()

print('After dropping graduate degrees')
gradestats(grades)

After filtering for students who enrolled between 2011 and 2020
Rows: 24408
Total: 9870
Grads: 7250
Percent: 73.45
After dropping graduate degrees
Rows: 23248
Total: 9860
Grads: 7240
Percent: 73.43


In [13]:
#Next, we format the semester in which the student entered the university, took the course, and obtained a degree
#We will encode the semester as the last two digits of the year + 0.5 for fall and 0 for spring or summer
#NOTE: this cell is not idempotent - it overwrites the year and semester columns in place, so running
#it a second time on the same frame adds the year to the semester code again.
pairs = [['ENTRY_CCYY','ENTRY_SEM_CD'],
         ['SEM_CCYY','SEM_CD'],
         ['SEM_CCYY.1','SEM_CD.1']]

for ccyy, sem in pairs:
    #get the last two digits of the year (modulo 100; the previous modulo 1000 kept three digits,
    #which only coincides with two digits for the years 2000-2099)
    grades[ccyy] = grades[ccyy] % 100

    #encode the semesters as 0.5 for fall and 0 for spring or summer (value=1 in the dataset)
    grades[sem] = grades[sem].replace({'F': 0.5, '1': 0, 'S': 0}) 

    #combine the year and semester to get a unique identifier for each semester
    grades[sem] = grades[sem] + grades[ccyy]

#Next, we create columns that compute the semester in which the course was taken and the semester in which the student graduated
#Both are counted from the entry semester: half a year is one semester, and the entry semester itself is number 1.
#create a new column named 'CRSE_SEM' that computes the semester in which the particular course was taken
grades['CRSE_SEM'] = (grades['SEM_CD'] - grades['ENTRY_SEM_CD'])*2 + 1

#create a new column named 'GRAD_SEM' that computes the semester in which the student graduated
#(null when the row has no degree semester)
grades['GRAD_SEM'] = (grades['SEM_CD.1'] - grades['ENTRY_SEM_CD'])*2 + 1

#get a list of students with more than one degree (more than one distinct graduation semester);
#kept for inspection - the filtering below no longer needs to loop over it
degs_per_student = grades.groupby('STUDENT')['GRAD_SEM'].nunique()
students_mult_degs = degs_per_student[degs_per_student > 1].index.tolist()

#Remove all rows corresponding to the later degrees.
#Every row is compared with its own student's earliest graduation semester in a single vectorized
#pass, instead of re-scanning and re-copying the whole frame once per multi-degree student.
first_degree_sem = grades.groupby('STUDENT')['GRAD_SEM'].transform('min')

#Rows with a null GRAD_SEM compare as False and are therefore kept, as before.
is_later_degree = grades['GRAD_SEM'] > first_degree_sem
grades = grades[~is_later_degree].copy()

#Remove all rows of students that graduated in fewer than 4 or more than 12 semesters.
#Rows with a null GRAD_SEM fail both comparisons and are left untouched.
grad_sem = grades['GRAD_SEM']
outside_grad_window = (grad_sem < 4) | (grad_sem > 12)
grades = grades[~outside_grad_window].copy()

#Remove all rows whose CRSE_SEM is greater than 12 or greater than GRAD_SEM
#(the latter means that the student took the course after they graduated).
crse_too_late = grades['CRSE_SEM'] > 12
crse_after_grad = grades['CRSE_SEM'] > grades['GRAD_SEM']
grades = grades[~(crse_too_late | crse_after_grad)].copy()

#check the value counts of CRSE_SEM and GRAD_SEM, and how many nulls each column holds
crse_sem_counts = grades['CRSE_SEM'].value_counts().sort_index()
grad_sem_counts = grades['GRAD_SEM'].value_counts().sort_index()
print(crse_sem_counts)
print(grad_sem_counts)
print('Number of null values in CRSE_SEM:', grades['CRSE_SEM'].isnull().sum())
print('Number of null values in GRAD_SEM:', grades['GRAD_SEM'].isnull().sum())

print('After filtering for students that graduated in 4 to 12 semesters, and removing second degrees')
gradestats(grades)

#In the next few lines, we edit matters relating to the CRSE column
#remove the 'X' appearing in any CRSE name (plain-text match, not a regular expression)
grades['CRSE'] = grades['CRSE'].str.replace('X', '', regex=False)

#Remove courses with low enrollment, or having course number less than 100
crse_counts = grades['CRSE'].value_counts()
min_enrollment = 50
#courses with min_enrollment rows or fewer are removed (note the <= comparison)
low_enrollment = crse_counts[crse_counts <= min_enrollment].index.to_list()

#Find the courses numbered below 100 from the data itself instead of relying only on the
#hard-coded '25' and '30'. Course names that are not plain numbers become NaN here and are
#therefore never matched by the "< 100" rule.
crse_numbers = pd.to_numeric(grades['CRSE'], errors='coerce')
below_100 = grades.loc[crse_numbers < 100, 'CRSE'].unique().tolist()

#'25' and '30' are kept explicitly so the removal list is a superset of the previous one
to_remove = low_enrollment + ['25','30'] + below_100
grades = grades[~grades.CRSE.isin(to_remove)]

print('After removing courses with low enrollment or numbered < 100')
gradestats(grades)

CRSE_SEM
1.0     5323
2.0     4733
3.0     3024
4.0     2750
5.0     1846
6.0     1820
7.0     1266
8.0     1106
9.0      290
10.0     190
11.0      62
12.0      34
Name: count, dtype: int64
GRAD_SEM
4.0      376
5.0      445
6.0     1367
7.0     1634
8.0     9361
9.0     2186
10.0    1891
11.0     534
12.0     371
Name: count, dtype: int64
Number of null values in CRSE_SEM: 0
Number of null values in GRAD_SEM: 4279
After filtering for students that graduated in 4 to 12 semesters, and removing second degrees
Rows: 22444
Total: 9642
Grads: 7033
Percent: 72.94
After removing courses with low enrollment or numbered < 100
Rows: 21460
Total: 9414
Grads: 6905
Percent: 73.35


In [14]:
#Next, we format the 'GRADE_CATGORY' column (spelled this way in the dataset) as follows:
#"ABOVE C-" and "Satisfactory/Pass" are replaced with a 1 and "C- OR BELOW" is replaced with a -1
grades['GRADE_CATGORY'] = grades['GRADE_CATGORY'].replace({'C- OR BELOW': -1, 'ABOVE C-': 1, 'Satisfactory/Pass': 1})

#We add one column for each course
courses = [str(x) for x in grades['CRSE'].unique()]

#Signed semester of each row: the product of 'GRADE_CATGORY' and 'CRSE_SEM' (so it ranges from -12 to 12).
#It does not depend on the course, so it is computed once instead of once per course.
signed_sem = grades['GRADE_CATGORY'] * grades['CRSE_SEM']

for x in courses:
    #every row starts at 0 in the column of course x ...
    grades[x] = 0

    #... and the rows that are about course x receive their signed semester.
    #A single .loc call writes into `grades` itself. The previous chained form
    #grades[x][mask] = ... assigns through an intermediate object, which pandas does not guarantee
    #to propagate to the frame (and which does nothing when Copy-on-Write is enabled).
    grades.loc[grades['CRSE'] == x, x] = signed_sem

In [15]:
#drop columns that are not needed (all of them must exist, otherwise drop raises a KeyError)
unneeded_cols = [
    'LEVEL',
    'OFFER_DEPT_ABRVN',
    'MAJOR_COLL',
    'DEG_DESCR',
    'DEG_DATEDB',
    'Workday Enrolled in Fall 2024',
    'Workday Enrolled in Fall 2024 Class Standing',
    'Workday Enrolled in Fall 2024 Primary Program of Study',
    'MAJOR_CURR',
    'MAJOR_CURR.2',
    'MAJOR_CURR.1',
    'CLSFN_YR.1',
    'LAST_RGST_TERM',
    'SEM_CD.1',
    'SECT',
    'CRSE',
    'GRADE_CATGORY',
    'CRSE_TITLE',
    'ENTRY_CCYY',
    'SEM_CCYY.1',
    'DEG_CD',
    'SEM_CD',
    'SEM_CCYY',
    'ENTRY_SEM_CD',
    'CLSFN_YR',
    'MAJOR_COLL.1',
]
grades = grades.drop(columns=unneeded_cols)

#check the columns that are left
finals_cols = grades.columns.tolist()
print(grades.columns,len(grades.columns))

grades

Index(['STUDENT', 'CRSE_SEM', 'GRAD_SEM', '265', '166', '165', '207', '140',
       '104', '385', '267', '317', '201', '150', '143', '105', '101', '266',
       '373', '341', '301', '414', '435', '365', '436', '492', '160', '314',
       '397', '495', '302', '151', '350', '490', '304', '240', '106', '181',
       '415', '497', '145', '342', '331'],
      dtype='object') 43


Unnamed: 0,STUDENT,CRSE_SEM,GRAD_SEM,265,166,165,207,140,104,385,267,317,201,150,143,105,101,266,373,341,301,414,435,365,436,492,160,314,397,495,302,151,350,490,304,240,106,181,415,497,145,342,331
0,0,3.0,8.0,-3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,3,1.0,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,4,1.0,8.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,4,2.0,8.0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,4,4.0,8.0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30374,13061,2.0,,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30375,13062,4.0,5.0,0,0,0,0,0,0,0,0,0,0,-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30376,13062,4.0,5.0,0,0,0,0,0,0,0,0,0,0,-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30377,13063,5.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0


In [16]:
#save the ungrouped dataset (not yet aggregated per student); index=False leaves the row labels out of the file
grades.to_csv('ungrouped_dataset.csv', index=False)

In [17]:
#drop the CRSE_SEM column
grades = grades.drop(columns=['CRSE_SEM'])

#replace all null values with 0 (so a missing GRAD_SEM becomes 0)
grades = grades.fillna(0)

def _largest_magnitude(col):
    """Return the entry of ``col`` whose absolute value is the greatest."""
    return col.loc[col.abs().idxmax()]

#group by student; for each column keep the value whose absolute value is the greatest
grades = grades.groupby('STUDENT').agg(_largest_magnitude)

#Add a column 'y' whose value is 0 if 'GRAD_SEM' is 0 (i.e. was null) or greater than 8, and 1 otherwise
grad_sem = grades['GRAD_SEM']
not_on_time = (grad_sem == 0) | (grad_sem > 8)
grades['y'] = (~not_on_time).astype('int64')

#Finally, in the columns corresponding to the courses, we replace the values with 0 if the absolute value
#is greater than 8 (because we are only focusing on 8 semesters until graduation)
#NOTE(review): this cut-off runs after the per-student aggregation, so a student whose largest-magnitude
#entry for a course comes from semester 9 or later ends up with 0 even when an earlier attempt exists -
#confirm that this is intended.
for x in courses:
    grades[x] = grades[x].mask(grades[x].abs() > 8, 0)

grades

Unnamed: 0_level_0,GRAD_SEM,265,166,165,207,140,104,385,267,317,201,150,143,105,101,266,373,341,301,414,435,365,436,492,160,314,397,495,302,151,350,490,304,240,106,181,415,497,145,342,331,y
STUDENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
0,8.0,-3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8.0,0,2,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,0.0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,7.0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13059,4.0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
13060,10.0,0,0,0,0,-1,0,0,0,0,0,-2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13061,0.0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13062,5.0,0,0,0,0,0,0,0,0,0,0,-4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [18]:
#save the grouped dataset; no index=False here, so the frame's index is written as the first column
grades.to_csv('grouped_dataset.csv')

In [19]:
#Course number -> abbreviated course title, in two groups.
#NOTE(review): the keys are integers, while the per-course DataFrame columns built earlier in this
#notebook are strings (e.g. '101') - convert with str() before using these numbers as column labels.
#NOTE(review): 403 and 481 do not appear among the course columns printed earlier in this notebook;
#confirm they are still wanted here.
math_crses_dict = {
    101: "ORIENTATION IN MATH",
    201: "INTRO TO PROOFS",
    240: "INVESTM&CREDIT MATH",
    301: "ABSTRACT ALGEBRA I",
    304: "COMBINATORICS",
    314: "GRAPH THEORY",
    350: "NUMBER THEORY",
    365: "COMPLEX VARIABLES",
    373: "INTR SCIENTF CMPTNG",
    385: "INTR PART DIFF EQUA",
    397: "IV MATH TCH SECDY",
    403: "ABSTRCT ALGEBRA II",
    414: "ANALYSIS I",
    415: "ANALYSIS II",
    435: "GEOMETRY I",
    436: "GEOMETRY II",
    481: "NUMRC MTHDS DIFF EQ",
    497: "TEACH SEC SCHL MATH",
}

gen_crses_dict = {
    105: "INTRO TO MATH IDEA",
    140: "COLLEGE ALGEBRA",
    143: "PREP FOR CALCULUS",
    145: "APPLD TRIGONOMETR",
    150: "DISC MATH BUS&SOC S",
    151: "BUS & SOC SCI CALC",
    160: "SURVEY OF CALCULUS",
    165: "CALCULUS I",
    166: "CALCULUS II",
    207: "MATRCES&LINEAR ALGB",
    265: "CALCULUS III",
    266: "ELEM DIFF EQUATNS",
    267: "DIFF EQ & TRANSFMS",
}

#Plain lists of the course numbers, in the same order as the dictionaries above.
#They are derived from the dictionaries so that each number is written down only once.
math_crses = list(math_crses_dict)
gen_crses = list(gen_crses_dict)


In [21]:
#report how many course numbers each of the two lists holds
print('Number of gen courses:', len(gen_crses))
print('Number of math courses:', len(math_crses))

Number of gen courses: 13
Number of math courses: 18
