In [173]:
import pandas as pd
import warnings

import numpy as np


# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation and chained-assignment warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Show up to 65 columns and 140 rows when a frame or series is displayed.
pd.set_option('display.max_columns', 65)
pd.set_option('display.max_rows', 140)

In [174]:
#read in the raw dataset (the path is relative to the notebook's working directory)
df = pd.read_csv('data_set_10_17.csv')

#a simple function to get info about the dataset
def gradestats(df):
    """Print row, student and graduate counts for ``df``.

    A student counts as a graduate when at least one of their rows has a
    non-null ``DEG_CD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``STUDENT`` and ``DEG_CD``.

    Returns
    -------
    None
        The four statistics are printed, followed by a blank line.
    """
    total = df["STUDENT"].nunique()

    # One boolean per student: does any of their rows carry a degree code?
    has_degree = df["DEG_CD"].notna().groupby(df["STUDENT"]).any()
    grads = int(has_degree.sum())

    # With no students the percentage is undefined; report NaN instead of
    # evaluating 0/0.
    percent = np.round(grads / total * 100, 2) if total else float("nan")

    print('Rows:', len(df))
    print('Total:', total)
    print('Grads:', grads)
    print('Percent:', percent)
    print()

# Baseline statistics for the raw data, before any filtering.
gradestats(df)

Rows: 30394
Total: 13065
Grads: 7622
Percent: 58.34



In [175]:
# Collect the column names of the raw frame and display them.
cols = list(df.columns)
cols

['STUDENT',
 'MAJOR_CURR',
 'MAJOR_COLL',
 'CLSFN_YR',
 'ENTRY_SEM_CD',
 'ENTRY_CCYY',
 'LEVEL',
 'SEM_CCYY',
 'SEM_CD',
 'OFFER_DEPT_ABRVN',
 'CRSE',
 'SECT',
 'GRADE_CATGORY',
 'CRSE_TITLE',
 'SEM_CCYY.1',
 'SEM_CD.1',
 'DEG_CD',
 'DEG_DATEDB',
 'MAJOR_CURR.1',
 'DEG_DESCR',
 'LAST_RGST_TERM',
 'CLSFN_YR.1',
 'MAJOR_CURR.2',
 'MAJOR_COLL.1',
 'Workday Enrolled in Fall 2024',
 'Workday Enrolled in Fall 2024 Class Standing',
 'Workday Enrolled in Fall 2024 Primary Program of Study']

In [176]:
# Keep only students whose first enrollment year lies in 2011-2020 (both ends
# included). 2020 is the latest entry year that still leaves room for
# 8 semesters (4 years) of courses; data from before 2011 may be unreliable,
# e.g. courses that are no longer offered or students who earned a degree and
# returned years later.
entered_in_window = df['ENTRY_CCYY'].between(2011, 2020)
df = df[entered_in_window]

print('After filtering for students who enrolled between 2011 and 2020')
gradestats(df)

# Degree codes treated as graduate degrees (per the original author's note);
# every other degree code is treated as undergraduate and kept.
dropped_degrees = ['MRE', 'MCP', 'MED', 'MFA', 'MTR', 'MHC', 'MFS', 'MHO', 'MAA', 'DVM', 'MBS', 'MBA', 'MA', 'MEN', 'MFN', 'MAT', 'PHD', 'MS']

# Remove the rows attached to a graduate degree. Rows with a missing DEG_CD
# never match the list, so they are kept.
has_grad_degree = df['DEG_CD'].isin(dropped_degrees)
df = df[~has_grad_degree]

print('After dropping graduate degrees')
gradestats(df)

After filtering for students who enrolled between 2011 and 2020
Rows: 24408
Total: 9870
Grads: 7250
Percent: 73.45

After dropping graduate degrees
Rows: 23248
Total: 9860
Grads: 7240
Percent: 73.43



In [177]:
# Columns that identify one course record: student, course number and term.
record_key = ['STUDENT', 'CRSE', 'SEM_CCYY', 'SEM_CD']
check = df[record_key]

# Count the rows that repeat an earlier (student, course, term) combination.
check.duplicated().sum()

np.int64(1431)

In [178]:
# The count above (1431) is the number of rows that repeat the same course,
# semester and student as an earlier row. Collect their index labels and
# remove them from 'df'; the first occurrence of each combination is kept.
duplicates = check.index[check.duplicated()]
df = df.drop(index=duplicates)

gradestats(df)

Rows: 21817
Total: 9860
Grads: 7240
Percent: 73.43



In [179]:
# Strip every literal 'X' from the course numbers.
df['CRSE'] = df['CRSE'].str.replace('X', '', regex=False)

# Number of rows per course number, most frequent first.
df['CRSE'].value_counts()

CRSE
165     2889
166     2399
265     1679
143     1603
140     1356
104     1228
150     1109
207      972
201      758
317      688
267      676
105      541
301      480
414      475
266      419
160      362
101      335
492      316
341      289
385      236
435      212
436      186
30       174
314      172
304      138
373      131
397      122
350      119
25       112
151      112
495      102
365      101
415       96
490       85
181       81
497       76
342       70
106       65
302       63
240       56
145       54
331       42
481       41
265H      38
202       37
407       35
142       34
166H      34
195       32
182       29
424       25
403       24
474       20
439       19
196       19
504       18
515       18
502       15
511       14
421       14
491       14
501       12
505       10
408       10
510       10
441        9
442        7
601        5
139        5
617        5
422        5
469        5
423        4
533        4
516        4
566        4
506    

In [180]:
# Print the distinct CRSE_TITLE values recorded for course numbers 302 and 403.
for x in ('302', '403'):
    titles = df.loc[df['CRSE'] == x, 'CRSE_TITLE'].unique()
    print(titles)

['ABSTRCT ALGEBRA II']
['INTMED ABSTRCT ALGB']


In [181]:
# 403 and 302 are meant to be the same course, so fold '403' into '302'.
df['CRSE'] = df['CRSE'].replace({'403': '302'})

In [182]:
# Courses removed from the analysis because their content changes from year
# to year or because they are no longer (or not regularly) offered.
# Keys are course numbers, values are the course titles.
drop_dict = {
    '10': 'HIGH SCHOOL ALGEBRA',
    '492': 'UNDERGRAD SEMINAR',
    '495': 'SPECIAL TOPICS',
    '490': 'INDEPENDENT STUDY',
    '106': 'DISCOVERING MATH',
    '181': 'LIFE SCI CALC&MDL I',
    '331': 'TOPOLOGY',
}

# Course number -> course title for every course that remains after data
# cleaning. Built from (number, title) pairs; insertion order is preserved.
crse_dict = dict([
    ('104', 'INTRO TO PROBABILTY'),
    ('105', 'INTRO TO MATH IDEA'),
    ('140', 'COLLEGE ALGEBRA'),
    ('143', 'PREP FOR CALCULUS'),
    ('145', 'APPLD TRIGONOMETR'),
    ('150', 'DISC MATH BUS&SOC S'),
    ('151', 'BUS & SOC SCI CALC'),
    ('160', 'SURVEY OF CALCULUS'),
    ('165', 'CALCULUS I'),
    ('166', 'CALCULUS II'),
    ('207', 'MATRCES&LINEAR ALGB'),
    ('265', 'CALCULUS III'),
    ('266', 'ELEM DIFF EQUATNS'),
    ('267', 'DIFF EQ & TRANSFMS'),
    ('101', 'ORIENTATION IN MATH'),
    ('201', 'INTRO TO PROOFS'),
    ('240', 'INVESTM&CREDIT MATH'),
    ('301', 'ABSTRACT ALGEBRA I'),
    ('302', 'ABSTRCT ALGEBRA II'),
    ('304', 'COMBINATORICS'),
    ('314', 'GRAPH THEORY'),
    ('317', 'THRY LINEAR ALGEBRA'),
    ('350', 'NUMBER THEORY'),
    ('341', 'INTR THY PROBAB&S I'),
    ('342', 'INTR THY PR&STAT II'),
    ('365', 'COMPLEX VARIABLES'),
    ('373', 'INTR SCIENTF CMPTNG'),
    ('385', 'INTR PART DIFF EQUA'),
    ('397', 'IV MATH TCH SECDY'),
    ('414', 'ANALYSIS I'),
    ('415', 'ANALYSIS II'),
    ('435', 'GEOMETRY I'),
    ('436', 'GEOMETRY II'),
    ('497', 'TEACH SEC SCHL MATH'),
])

In [183]:
# Encode the term in which the student entered the university, took the
# course, and obtained a degree. Each (year, semester code) pair becomes one
# number: the last two digits of the year, plus 0.5 for fall ('F') and 0 for
# spring ('S') or summer (code '1', per the original note).
pairs = [['ENTRY_CCYY','ENTRY_SEM_CD'],
         ['SEM_CCYY','SEM_CD'],
         ['SEM_CCYY.1','SEM_CD.1']]
sem_offsets = {'F': 0.5, '1': 0, 'S': 0}

# NOTE: the loop overwrites the year and semester columns in place, so this
# cell must run exactly once on freshly loaded data; running it a second time
# would add the year to an already-encoded term.
for ccyy, sem in pairs:
    # keep the last two digits of the year
    df[ccyy] = df[ccyy] % 100

    # Map the semester code to its offset and force a numeric dtype, so the
    # arithmetic below does not depend on how replace() handles object columns.
    # An unmapped code raises here instead of flowing through silently.
    df[sem] = pd.to_numeric(df[sem].replace(sem_offsets)) + df[ccyy]

# Terms are half a year apart, so doubling the difference counts semesters;
# the +1 makes the entry semester number 1.
# 'CRSE_SEM': semester (counted from entry) in which the course was taken
df['CRSE_SEM'] = (df['SEM_CD'] - df['ENTRY_SEM_CD'])*2 + 1

# 'GRAD_SEM': semester (counted from entry) in which the degree was obtained;
# NaN for rows without a degree term
df['GRAD_SEM'] = (df['SEM_CD.1'] - df['ENTRY_SEM_CD'])*2 + 1

# students with more than one distinct graduation semester (kept for inspection)
degs_per_student = df.groupby('STUDENT')['GRAD_SEM'].nunique()
students_mult_degs = degs_per_student[degs_per_student > 1].index.tolist()

# Keep only the rows tied to each student's earliest degree. Rows whose
# GRAD_SEM is NaN compare False against the minimum and are therefore kept.
first_grad_sem = df.groupby('STUDENT')['GRAD_SEM'].transform('min')
df = df[~(df['GRAD_SEM'] > first_grad_sem)]

print('After removing second degrees')
gradestats(df)

# Remove rows of students who graduated in fewer than 4 or more than 12
# semesters. Rows with a null GRAD_SEM fail both comparisons and are untouched.
grad_out_of_range = (df['GRAD_SEM'] < 4) | (df['GRAD_SEM'] > 12)
df = df[~grad_out_of_range]

print('After filtering for students that graduated in 4 to 12 semesters')
gradestats(df)

# Remove course rows taken after semester 8 or after the graduation semester.
# The latter means the course was taken after graduating, most likely while
# working towards a second degree.
# NOTE(review): rows with CRSE_SEM < 1 (course term before the entry term) are
# not filtered here — confirm whether such rows exist.
taken_too_late = (df['CRSE_SEM'] > 8) | (df['CRSE_SEM'] > df['GRAD_SEM'])
df = df[~taken_too_late]

print('After removing courses taken after graduation or after 8 semesters')
gradestats(df)

# Remove courses with low enrollment (min_enrollment or fewer remaining rows),
# the courses listed in drop_dict, and the sub-100 course numbers '25' and '30'.
crse_counts = df['CRSE'].value_counts()
min_enrollment = 50
to_remove = crse_counts[crse_counts <= min_enrollment].index.to_list() + list(drop_dict.keys()) + ['25', '30']
df = df[~df.CRSE.isin(to_remove)]

print('After removing courses with low enrollment or numbered < 100')
gradestats(df)

After removing second degrees
Rows: 21772
Total: 9860
Grads: 7240
Percent: 73.43

After filtering for students that graduated in 4 to 12 semesters
Rows: 21321
Total: 9671
Grads: 7051
Percent: 72.91

After removing courses taken after graduation or after 8 semesters
Rows: 20676
Total: 9545
Grads: 6961
Percent: 72.93

After removing courses with low enrollment or numbered < 100
Rows: 19205
Total: 9179
Grads: 6717
Percent: 73.18



In [184]:
# Sanity check: the course numbers left in CRSE must be exactly the keys of crse_dict.
courses = df['CRSE'].unique().tolist()
assert set(courses) == set(crse_dict)

In [185]:
# Recode 'GRADE_CATGORY' as a pass flag: "ABOVE C-" and "Satisfactory/Pass"
# become 1, "C- OR BELOW" becomes 0.
grade_codes = {'C- OR BELOW': 0,
               'ABOVE C-': 1,
               'Satisfactory/Pass': 1}
# Assign the result back to the frame instead of calling replace(inplace=True)
# on a selected column, which may act on a temporary copy. to_numeric
# guarantees a numeric column and raises if an unmapped category is left over.
df['GRADE_CATGORY'] = pd.to_numeric(df['GRADE_CATGORY'].replace(grade_codes))

# Pass flag times semester number: the semester number for a passed course,
# 0 for a course that was not passed.
passed_in_sem = df['GRADE_CATGORY'] * df['CRSE_SEM']

# Add one column per course. It starts at 0; on the rows that belong to that
# course it receives the value computed above. A 0 therefore means either
# "row is for another course" or "course not passed".
courses.sort()
for x in courses:
    df[x] = 0
    is_course = df['CRSE'] == x
    # single .loc assignment instead of chained indexing (df[x][mask] = ...)
    df.loc[is_course, x] = passed_in_sem[is_course]

In [186]:
# Columns that are no longer needed once the per-course columns exist.
unneeded = [
    'LEVEL',
    'OFFER_DEPT_ABRVN',
    'MAJOR_COLL',
    'DEG_DESCR',
    'DEG_DATEDB',
    'Workday Enrolled in Fall 2024',
    'Workday Enrolled in Fall 2024 Class Standing',
    'Workday Enrolled in Fall 2024 Primary Program of Study',
    'MAJOR_CURR',
    'MAJOR_CURR.2',
    'MAJOR_CURR.1',
    'CLSFN_YR.1',
    'LAST_RGST_TERM',
    'SECT',
    'CLSFN_YR',
    'MAJOR_COLL.1',
    'CRSE',
    'GRADE_CATGORY',
    'SEM_CCYY',
    'SEM_CD',
    'SEM_CCYY.1',
    'SEM_CD.1',
    'CRSE_TITLE',
    'ENTRY_CCYY',
    'ENTRY_SEM_CD',
    'DEG_CD',
    'CRSE_SEM',
]
df = df.drop(columns=unneeded)

# Show how many columns remain and what they are.
final_cols = list(df.columns)
print(len(final_cols))
final_cols

36


['STUDENT',
 'GRAD_SEM',
 '101',
 '104',
 '105',
 '140',
 '143',
 '145',
 '150',
 '151',
 '160',
 '165',
 '166',
 '201',
 '207',
 '240',
 '265',
 '266',
 '267',
 '301',
 '302',
 '304',
 '314',
 '317',
 '341',
 '342',
 '350',
 '365',
 '373',
 '385',
 '397',
 '414',
 '415',
 '435',
 '436',
 '497']

In [187]:
# Replace all null values with 0 (so a missing GRAD_SEM becomes 0).
df = df.fillna(0)

# Collapse to one row per student by taking the maximum of every column;
# STUDENT becomes the index.
df = df.groupby('STUDENT').max()

# Target column 'y': 1 when GRAD_SEM is in the range 1..8, 0 when GRAD_SEM is
# 0 or greater than 8. Built in one vectorised assignment rather than through
# chained indexing (df['y'][mask] = 1).
graduated_within_8 = (df['GRAD_SEM'] > 0) & (df['GRAD_SEM'] <= 8)
df['y'] = graduated_within_8.astype('int64')

df

Unnamed: 0_level_0,GRAD_SEM,101,104,105,140,143,145,150,151,160,165,166,201,207,240,265,266,267,301,302,304,314,317,341,342,350,365,373,385,397,414,415,435,436,497,y
STUDENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8.0,0,0,0,0,0,0,0,0,0,1,2,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,0.0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,7.0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13059,4.0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
13060,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13061,0.0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13062,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [188]:
# Prerequisite tables: course number -> list of courses that come before it.
# Each list also includes the prerequisites of the prerequisites.

# Courses shared by everything from 166/207 onwards.
through_calc1 = ('165', '143', '140')

gen_prereqs = {
    '143': ['140'],
    '160': ['145'],
    '165': ['143', '140'],
    '166': list(through_calc1),
    '207': list(through_calc1),
    '265': ['166', *through_calc1],
    '266': ['265', '166', *through_calc1],
    '267': ['265', '166', *through_calc1],
}

# Courses whose only listed prerequisite is 201.
after_proofs = ['304', '314', '317', '350', '414', '435', '436']
math_prereqs = {course: ['201'] for course in after_proofs}
# The entry for '407' (['317', '201']) is intentionally left out, as in the
# original table where it was commented out.
math_prereqs.update({
    '301': ['317', '201'],
    '415': ['414', '201'],
    '302': ['301', '317', '201'],
    '397': ['301', '317', '201'],
})

# Combined table: general courses first, then the math-major courses.
prereqs = dict(gen_prereqs)
prereqs.update(math_prereqs)
prereqs


{'143': ['140'],
 '160': ['145'],
 '165': ['143', '140'],
 '166': ['165', '143', '140'],
 '207': ['165', '143', '140'],
 '265': ['166', '165', '143', '140'],
 '266': ['265', '166', '165', '143', '140'],
 '267': ['265', '166', '165', '143', '140'],
 '304': ['201'],
 '314': ['201'],
 '317': ['201'],
 '350': ['201'],
 '414': ['201'],
 '435': ['201'],
 '436': ['201'],
 '301': ['317', '201'],
 '415': ['414', '201'],
 '302': ['301', '317', '201'],
 '397': ['301', '317', '201']}

In [189]:
# Look for oddities in the data: students whose value in a prerequisite column
# is larger than their value in the column of the course that requires it,
# i.e. the prerequisite shows up in a later semester than the course itself.
#
# For every course in prereqs and every prerequisite of that course, collect
# the students with df[course] > 0 and df[course] < df[prereq]. Pairs with at
# least one such student are stored under the key (course, prereq).
prereq_snafu = {}
for course, required in prereqs.items():
    course_sem = df[course]
    for prereq in required:
        out_of_order = (course_sem > 0) & (course_sem < df[prereq])
        offenders = df.index[out_of_order].to_list()
        if offenders:
            prereq_snafu[(course, prereq)] = offenders

# One row per (course, prereq) pair, with the list of affected students ...
prereq_snafu_df = pd.DataFrame(list(prereq_snafu.items()), columns=['course,prereq', 'students'])
# ... and the number of students in that list.
prereq_snafu_df['count'] = prereq_snafu_df['students'].map(len)

In [190]:
# Display the (course, prerequisite) pairs together with the affected students and their count.
prereq_snafu_df

Unnamed: 0,"course,prereq",students,count
0,"(143, 140)","[586, 5188, 5512, 8116, 10651, 11624]",6
1,"(165, 143)","[1760, 4296, 9597, 12865]",4
2,"(165, 140)","[193, 389, 1008, 2921, 3195, 8286, 9561, 10581...",10
3,"(166, 165)","[926, 1403, 2001, 3680, 3698, 4477, 5355, 7701...",9
4,"(166, 143)","[1760, 3177, 4296, 10674, 12865]",5
5,"(166, 140)","[389, 3177, 10581, 10674]",4
6,"(207, 165)",[10721],1
7,"(207, 143)",[4296],1
8,"(207, 140)",[389],1
9,"(265, 166)","[3031, 3786, 5318, 6418, 11290]",5


In [192]:
# Flatten the per-pair student lists into one sorted list. Duplicates are kept
# on purpose: a student appears once for every (course, prereq) pair they are
# listed under. Iterating the lists (instead of summing them) also yields an
# empty list when there are no pairs at all.
snafu_students = sorted(
    student
    for students in prereq_snafu_df['students']
    for student in students
)
print(len(snafu_students))
snafu_students

161


[86,
 193,
 193,
 193,
 389,
 389,
 389,
 586,
 629,
 723,
 926,
 1008,
 1107,
 1107,
 1107,
 1107,
 1403,
 1428,
 1470,
 1532,
 1759,
 1759,
 1759,
 1760,
 1760,
 1760,
 1794,
 1824,
 2001,
 2001,
 2399,
 2574,
 2574,
 2574,
 2652,
 2814,
 2921,
 2951,
 2951,
 2951,
 3031,
 3031,
 3177,
 3177,
 3177,
 3195,
 3283,
 3382,
 3583,
 3583,
 3588,
 3680,
 3698,
 3698,
 3786,
 3787,
 3836,
 4086,
 4125,
 4296,
 4296,
 4296,
 4384,
 4384,
 4477,
 4477,
 4749,
 4775,
 5087,
 5128,
 5128,
 5153,
 5188,
 5255,
 5318,
 5318,
 5355,
 5355,
 5512,
 5587,
 6343,
 6418,
 6419,
 6560,
 6935,
 7009,
 7025,
 7025,
 7357,
 7357,
 7670,
 7701,
 7701,
 7819,
 7827,
 7969,
 8092,
 8108,
 8108,
 8108,
 8108,
 8108,
 8108,
 8116,
 8278,
 8286,
 8313,
 8313,
 8313,
 8356,
 8494,
 8518,
 8518,
 8529,
 8692,
 8878,
 9080,
 9318,
 9385,
 9446,
 9451,
 9561,
 9597,
 9644,
 9686,
 9686,
 9686,
 9686,
 9736,
 9885,
 9909,
 10289,
 10581,
 10581,
 10581,
 10651,
 10674,
 10674,
 10674,
 10721,
 11043,
 11043,
 11143,

In [199]:
# Reduce the list to distinct students (ascending) and show their rows.
snafu_students = sorted(set(snafu_students))
df.loc[snafu_students]

Unnamed: 0_level_0,GRAD_SEM,101,104,105,140,143,145,150,151,160,165,166,201,207,240,265,266,267,301,302,304,314,317,341,342,350,365,373,385,397,414,415,435,436,497,y
STUDENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
86,4.0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,3,0,0,4,1,2,0,0,0,0,1,4,4,0,3,2,0,1
193,8.0,5,0,0,2,0,0,0,0,0,1,2,6,0,0,3,0,4,6,8,0,0,5,6,0,0,0,7,0,0,5,0,7,0,0,1
389,0.0,0,0,0,7,0,0,1,0,0,2,3,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
586,8.0,0,0,0,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
629,10.0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
723,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,5,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
926,9.0,3,0,0,0,0,0,0,0,0,3,2,0,0,0,3,0,6,0,0,0,8,0,0,0,0,0,0,0,0,8,0,0,0,0,0
1008,8.0,0,0,0,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1107,8.0,0,0,0,0,0,0,0,0,0,1,2,7,5,0,3,4,0,6,0,7,6,6,0,0,0,0,0,0,0,6,8,0,0,0,1
1403,9.0,0,0,0,0,0,0,0,0,0,5,2,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [200]:
# Number of rows selected for the flagged students.
df.loc[snafu_students].shape[0]

112

In [147]:
# Re-read the raw file and keep the flagged students' rows, restricted to the
# columns needed to inspect course term, entry term, degree term and grade.
inspect_cols = ['STUDENT', 'CRSE', 'SEM_CCYY', 'SEM_CD', 'ENTRY_CCYY','ENTRY_SEM_CD','SEM_CCYY.1','SEM_CD.1','GRADE_CATGORY']
temp = pd.read_csv('data_set_10_17.csv')
is_flagged = temp['STUDENT'].isin(snafu_students)
temp = temp.loc[is_flagged, inspect_cols]
temp

Unnamed: 0,STUDENT,CRSE,SEM_CCYY,SEM_CD,ENTRY_CCYY,ENTRY_SEM_CD,SEM_CCYY.1,SEM_CD.1,GRADE_CATGORY
173,86,101,2019,F,2019,F,2021.0,S,Satisfactory/Pass
174,86,101,2019,F,2019,F,2022.0,S,Satisfactory/Pass
175,86,201,2019,F,2019,F,2021.0,S,C- OR BELOW
176,86,201,2019,F,2019,F,2022.0,S,C- OR BELOW
177,86,317,2019,F,2019,F,2021.0,S,ABOVE C-
...,...,...,...,...,...,...,...,...,...
30171,12980,317,2017,S,2015,F,2019.0,S,C- OR BELOW
30172,12980,301,2018,F,2015,F,2019.0,S,ABOVE C-
30173,12980,397,2018,S,2015,F,2019.0,S,ABOVE C-
30174,12980,436,2018,S,2015,F,2019.0,S,ABOVE C-


In [151]:
# Renumber the rows from 0 (the old index is discarded) and preview the first 40.
temp = temp.reset_index(drop=True)
temp.head(40)

Unnamed: 0,STUDENT,CRSE,SEM_CCYY,SEM_CD,ENTRY_CCYY,ENTRY_SEM_CD,SEM_CCYY.1,SEM_CD.1,GRADE_CATGORY
0,86,101,2019,F,2019,F,2021.0,S,Satisfactory/Pass
1,86,101,2019,F,2019,F,2022.0,S,Satisfactory/Pass
2,86,201,2019,F,2019,F,2021.0,S,C- OR BELOW
3,86,201,2019,F,2019,F,2022.0,S,C- OR BELOW
4,86,317,2019,F,2019,F,2021.0,S,ABOVE C-
5,86,317,2019,F,2019,F,2022.0,S,ABOVE C-
6,86,385,2019,F,2019,F,2021.0,S,ABOVE C-
7,86,385,2019,F,2019,F,2022.0,S,ABOVE C-
8,86,301,2020,F,2019,F,2021.0,S,ABOVE C-
9,86,301,2020,F,2019,F,2022.0,S,ABOVE C-


In [None]:
# Raw rows of the students recorded under the pair (course '317', prerequisite '201').
pair_students = prereq_snafu[('317', '201')]
check = temp[temp['STUDENT'].isin(pair_students)]


Unnamed: 0,STUDENT,CRSE,SEM_CCYY,SEM_CD,ENTRY_CCYY,ENTRY_SEM_CD,SEM_CCYY.1,SEM_CD.1,GRADE_CATGORY
0,86,101,2019,F,2019,F,2021.0,S,Satisfactory/Pass
1,86,101,2019,F,2019,F,2022.0,S,Satisfactory/Pass
2,86,201,2019,F,2019,F,2021.0,S,C- OR BELOW
3,86,201,2019,F,2019,F,2022.0,S,C- OR BELOW
4,86,317,2019,F,2019,F,2021.0,S,ABOVE C-
5,86,317,2019,F,2019,F,2022.0,S,ABOVE C-
6,86,385,2019,F,2019,F,2021.0,S,ABOVE C-
7,86,385,2019,F,2019,F,2022.0,S,ABOVE C-
8,86,301,2020,F,2019,F,2021.0,S,ABOVE C-
9,86,301,2020,F,2019,F,2022.0,S,ABOVE C-


In [148]:
# Preview the first 20 rows of temp.
temp.head(20)

Unnamed: 0,STUDENT,CRSE,SEM_CCYY,SEM_CD,ENTRY_CCYY,ENTRY_SEM_CD,SEM_CCYY.1,SEM_CD.1,GRADE_CATGORY
173,86,101,2019,F,2019,F,2021.0,S,Satisfactory/Pass
174,86,101,2019,F,2019,F,2022.0,S,Satisfactory/Pass
175,86,201,2019,F,2019,F,2021.0,S,C- OR BELOW
176,86,201,2019,F,2019,F,2022.0,S,C- OR BELOW
177,86,317,2019,F,2019,F,2021.0,S,ABOVE C-
178,86,317,2019,F,2019,F,2022.0,S,ABOVE C-
179,86,385,2019,F,2019,F,2021.0,S,ABOVE C-
180,86,385,2019,F,2019,F,2022.0,S,ABOVE C-
181,86,301,2020,F,2019,F,2021.0,S,ABOVE C-
182,86,301,2020,F,2019,F,2022.0,S,ABOVE C-


In [126]:
# Display df.
# NOTE(review): the output saved with this cell has several rows per student
# and no 'y' column, so it appears to come from an earlier state of df —
# consider re-running or deleting this cell.
df

Unnamed: 0,STUDENT,GRAD_SEM,101,104,105,140,143,145,150,151,160,165,166,201,207,240,265,266,267,301,302,304,314,317,341,342,350,365,373,385,397,414,415,435,436,497
0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,3,,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,4,8.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,4,8.0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,4,8.0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30373,13060,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30374,13061,,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30375,13062,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30377,13063,9.0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [109]:
# Display the prerequisite-order table.
# NOTE(review): the output saved with this cell is an empty table followed by
# a TypeError traceback, which looks like a leftover from an earlier run —
# consider re-running or deleting this cell.
prereq_snafu_df

Unnamed: 0,"course,prereq",students,count


TypeError: agg function failed [how->max,dtype->object]

In [None]:
#save the grouped dataset to the notebook's working directory
# (df is expected to be the per-student frame here; its index is written as the first CSV column)
df.to_csv('final_dataset_Nov_9.csv')