In [1]:
# Import data handling libraries
import pandas as pd
import numpy as np

In [2]:
# Load the latest raw dataset from the working directory
raw_data_file = 'data_set_11_17.csv'
df = pd.read_csv(raw_data_file)

In [3]:
# Cleaning starts by discarding the columns that are not used anywhere below
column_remove = [
    'MAJOR_COLL',
    'CLSFN_YR',
    'LEVEL',
    'OFFER_DEPT_ABRVN',
    'SECT',
    'CRSE_TITLE',
    'DEG_DATEDB',
    'DEG_DESCR',
    'LAST_RGST_TERM',
    'CLSFN_YR.1',
    'MAJOR_COLL.1',
    'Workday Enrolled in Fall 2024',
    'Workday Enrolled in Fall 2024 Class Standing',
    'Workday Enrolled in Fall 2024 Primary Program of Study',
]
df = df.drop(column_remove, axis=1)

In [4]:
# Recode the reported grade categories as signed indicators:
#   'C- OR BELOW'                   -> -1
#   'ABOVE C-', 'Satisfactory/Pass' ->  1
# Any category that is not listed in the mapping becomes NaN.
grade_recode = {
    'C- OR BELOW': -1,
    'ABOVE C-': 1,
    'Satisfactory/Pass': 1,
}
df['GRADE_CATGORY'] = df['GRADE_CATGORY'].map(grade_recode)

In [5]:
# Recode how years/semesters are reported, placing every term on one numeric timeline.
# A "full" semester is a 16-week term (Fall/Spring at Iowa State); the 8-week Summer
# session counts as a "half" semester, so each year spans 2.5 semesters.
# The graduation rate at Iowa State is based on whether a student receives a degree
# within 6 years, so a graduation is "successful" if it is completed within
# 2.5 * 6 = 15 semesters of enrolling.
# Since the course data begins in Spring 2014, only students that enrolled in
# Summer 2008 or later are considered.
# Summer 2008 students were present for half a semester, so Summer 2008 gets the value 0.5;
# Fall 2008 (a full semester later) gets 1.5, Spring 2009 gets 2.5, Summer 2009 gets 3,
# Fall 2009 gets 4, and so on.
# Semester offsets: Spring ('S') = 0, Summer ('1') = 0.5, Fall ('F') = 1.5
# Timeline value:   2.5 * (YEAR - 2008) + offset
# computed for enrollment, for when a course was taken, and for graduation (if present).

sem_recode = {'S': 0, '1': 0.5, 'F': 1.5}
sem_list = ['ENTRY_SEM_CD', 'SEM_CD', 'SEM_CD.1']
year_list = ['ENTRY_CCYY', 'SEM_CCYY', 'SEM_CCYY.1']
timeline_list = ['ENT_SEM', 'CRSE_SEM', 'GRAD_SEM']

# Replace each semester code by its numeric offset (unlisted codes become NaN)
for sem_col in sem_list:
    df[sem_col] = df[sem_col].map(sem_recode)

# Combine year and offset into the timeline value, one new column per event
for timeline_col, year_col, sem_col in zip(timeline_list, year_list, sem_list):
    df[timeline_col] = 2.5 * (df[year_col] - 2008) + df[sem_col]

In [6]:
# The raw year and semester columns are now redundant and can be dropped
year_sem_remove = [
    'ENTRY_CCYY', 'ENTRY_SEM_CD',
    'SEM_CCYY', 'SEM_CD',
    'SEM_CCYY.1', 'SEM_CD.1',
]
df = df.drop(year_sem_remove, axis=1)

In [7]:
# Baseline: number of distinct students, then number of rows, before any rows are removed
print(df["STUDENT"].nunique(), len(df), sep="\n")

13065
30392


In [8]:
# Remove any rows whose degree code is a graduate/professional degree.
# A single boolean mask is used instead of dropping index labels once per code:
# it scans the frame once and stays correct even if the index is not unique.
# Rows with a missing DEG_CD are kept (isin is False for NaN).
degree_remove = ['MS', 'MED', 'MAT', 'MEN', 'MFN', 'MBS', 'PHD', 'MFS', 'MHC', 'MAA', 'MBA', 'MFA', 'MA', 'MTR', 'MHO', 'DVM', 'MCP','MRE']
df = df[~df['DEG_CD'].isin(degree_remove)]

In [9]:
# Students/rows remaining after removing graduate degrees
print(df["STUDENT"].nunique(), len(df), sep="\n")

13052
29150


In [10]:
# The degree code column has served its purpose; drop it
df = df.drop(columns=['DEG_CD'])

In [11]:
# Keep only students that enrolled in Summer 2008 (timeline value 0.5) or later;
# rows with a missing ENT_SEM are removed as well
df = df.loc[df['ENT_SEM'].ge(0.5)]

In [12]:
# Students/rows remaining after the entry-semester filter
print(df["STUDENT"].nunique(), len(df), sep="\n")

13000
28990


In [13]:
# The course data ends in Summer 2024, so the only students treated as observed for the
# full 15-semester window are those who enrolled strictly before Spring 2019
# (2.5 * (2019 - 2008) + 0 = 27.5 on our timeline).
# Drop rows for students who enrolled in Spring 2019 or later and have no graduation semester.
enrolled_too_late = df['ENT_SEM'] >= 27.5
no_graduation = df['GRAD_SEM'].isnull()
df = df[~(enrolled_too_late & no_graduation)]

In [14]:
# Students/rows remaining after removing late enrollees without a graduation record
print(df["STUDENT"].nunique(), len(df), sep="\n")

9422
22734


In [15]:
# Remove students that graduated before Spring 2014 (2.5 * (2014 - 2008) + 0 = 15):
# the course data begins that semester, so an earlier graduation is not associated with it.
# Rows without a graduation semester are kept (NaN < 15 evaluates to False).
graduated_too_early = df['GRAD_SEM'] < 15
df = df[~graduated_too_early]

In [16]:
# Students/rows remaining after removing pre-Spring-2014 graduations
print(df["STUDENT"].nunique(), len(df), sep="\n")

9413
22669


In [17]:
# Remove courses taken after graduating; students with no graduation semester keep all courses
taken_by_graduation = df['CRSE_SEM'].le(df['GRAD_SEM'])
never_graduated = df['GRAD_SEM'].isnull()
df = df.loc[taken_by_graduation | never_graduated]

In [18]:
# Students/rows remaining after removing post-graduation courses
print(df["STUDENT"].nunique(), len(df), sep="\n")

9393
22565


In [19]:
# Sanity check - a course must be taken no earlier than enrollment, and enrollment
# must not come after graduation (when a graduation semester is present)
course_after_entry = df['ENT_SEM'].le(df['CRSE_SEM'])
entry_before_grad = df['ENT_SEM'].le(df['GRAD_SEM']) | df['GRAD_SEM'].isnull()
df = df.loc[course_after_entry & entry_before_grad]

In [20]:
# Students/rows check - the sanity filter should not have removed anything
print(df["STUDENT"].nunique(), len(df), sep="\n")

9393
22565


In [21]:
# Next we remove courses taken outside the 15-semester window.
# First, re-express 'CRSE_SEM' relative to 'ENT_SEM'.
# The + 0.5 is needed because the starting point, Summer 2008, has the value 0.5.
df['CRSE_SEM'] = df['CRSE_SEM'].sub(df['ENT_SEM']).add(0.5)

# Then keep only the courses taken within the first 15 semesters after enrolling
df = df.loc[df['CRSE_SEM'].le(15)]

In [22]:
# Students/rows remaining after restricting courses to the 15-semester window
# (both counts can drop here, since rows are removed)
print(df["STUDENT"].nunique(), len(df), sep="\n")

9350
22366


In [23]:
# Add a graduation indicator: re-express 'GRAD_SEM' relative to 'ENT_SEM' (same + 0.5
# offset as for courses), then flag 1 if graduation falls within 15 semesters, else 0.
# Students with no graduation semester get 0.
df['GRAD_SEM'] = df['GRAD_SEM'].sub(df['ENT_SEM']).add(0.5)
graduated_in_window = df['GRAD_SEM'].le(15)
df['GRADUATE'] = np.where(graduated_in_window, 1, 0)

In [24]:
# 'ENT_SEM' and 'GRAD_SEM' are no longer needed once 'GRADUATE' exists
sem_remove = ['ENT_SEM', 'GRAD_SEM']
df = df.drop(sem_remove, axis=1)

In [25]:
# Next goal: create a column holding each student's 'first' major and remove the
# repeated information caused by students with multiple degrees.
# First, count the distinct (non-missing) MAJOR_CURR.1 values per student; a value
# above 1 indicates a student with multiple degrees.
df['NUM_DEG'] = df.groupby('STUDENT')['MAJOR_CURR.1'].transform('nunique')

# Second, flag the rows whose MAJOR_CURR.2 matches some MAJOR_CURR.1 entry of the same student
def _second_major_among_degree_majors(student_rows):
    return student_rows['MAJOR_CURR.2'].isin(student_rows['MAJOR_CURR.1'])

df['DEG_FLAG'] = df.groupby('STUDENT', group_keys=False).apply(
    _second_major_among_degree_majors,
    include_groups=False,
)

# Next, we create a column that either lists the 'first' major of the student
# or holds a 0 if the student has multiple degrees and the row carries the
# information of a second/third/etc. degree.

def major_detect(row):
    """Return the 'first' major for a row, or 0 when the row is redundant.

    `row` must provide the keys 'NUM_DEG', 'DEG_FLAG', 'MAJOR_CURR',
    'MAJOR_CURR.1' and 'MAJOR_CURR.2'.

    - NUM_DEG == 0: return MAJOR_CURR.2.
    - Return MAJOR_CURR.1 when NUM_DEG == 1, or MAJOR_CURR.2 equals MAJOR_CURR.1,
      or DEG_FLAG is False and MAJOR_CURR equals MAJOR_CURR.1.
    - Otherwise return 0, marking the row for removal.
    """
    if row['NUM_DEG'] == 0:
        return row['MAJOR_CURR.2']

    keeps_degree_major = (
        row['NUM_DEG'] == 1
        or row['MAJOR_CURR.2'] == row['MAJOR_CURR.1']
        or (row['DEG_FLAG'] == False and row['MAJOR_CURR'] == row['MAJOR_CURR.1'])
    )
    return row['MAJOR_CURR.1'] if keeps_degree_major else 0

# Compute the 'first' major row by row
df['MAJOR'] = df.apply(major_detect, axis=1)

# Rows marked with 0 in 'MAJOR' are redundant and are filtered out
is_redundant_row = df['MAJOR'] == 0
df = df[~is_redundant_row]

In [26]:
# Students/rows check - the number of students should not have changed, only the rows
print(df["STUDENT"].nunique(), len(df), sep="\n")

9350
20964


In [27]:
# MAJOR_CURR, MAJOR_CURR.1, MAJOR_CURR.2, NUM_DEG and DEG_FLAG are no longer needed
deg_info_remove = [
    'MAJOR_CURR',
    'MAJOR_CURR.1',
    'MAJOR_CURR.2',
    'NUM_DEG',
    'DEG_FLAG',
]
df = df.drop(deg_info_remove, axis=1)

In [28]:
# Remove courses that are not in the current catalog, have variable content, are not
# 'content courses', are not math department courses, are supplemental courses, or are
# graduate courses.
# A single boolean mask replaces one index-label drop per course number: the frame is
# scanned once and the result stays correct even if the index is not unique.
course_remove = ['25', '30', 
                 '101', '106', '106X', '139X', '181', '182', '195', '196', 
                 '202', '202X', '241X', '268', '290', '297', 
                 '331', '341', '342', '392X', '397', '398', 
                 '408X', '421', '439', '474', '474X', '490', '490H', '491', '492', '495', '495X', '497', '498', 
                 '501', '502', '503', '503X', '504', '505', '506X', '507', '510', '511', '515',  '516', '518X', 
                 '519', '520', '525', '533', '535', '545', '554', '561', '562', '565', '566', '567', '581', 
                 '601', '603X', '605', '608', '610', '617', '618', '619X', '624', '631', '633', '642', '656', '666']
df = df[~df['CRSE'].isin(course_remove)]

In [29]:
# Students/rows remaining after removing the excluded courses
print(df["STUDENT"].nunique(), len(df), sep="\n")

9018
18684


In [30]:
# Strip the X (experimental) and H (honors section) markers from the course numbers
df['CRSE'] = df['CRSE'].str.replace('X', '').str.replace('H', '')

In [31]:
# Relabel 403 as 302 (abstract algebra 2) and 142 as 145 (trigonometry)
course_relabel = {'403': '302', '142': '145'}
df['CRSE'] = df['CRSE'].replace(course_relabel)

In [32]:
# Next we eliminate repeated attempts at a course, keeping the most recent successful
# attempt (or the most recent attempt if none was successful).
# First, fold the semester into the grade: 'GRADE_CATGORY' becomes
# 'GRADE_CATGORY' * 'CRSE_SEM', i.e. a signed semester value in steps of 0.5.
signed_semester = df['GRADE_CATGORY'] * df['CRSE_SEM']
df['GRADE_CATGORY'] = signed_semester

# Second, turn that into a priority: an unsuccessful attempt (negative value) is ranked
# by its absolute value, a successful one by its value + 15. Assuming CRSE_SEM is at
# most 15 (as filtered earlier in the notebook), every success outranks every failure,
# and within each kind a later semester outranks an earlier one.
df['GRADE_RANK'] = np.where(signed_semester < 0, signed_semester.abs(), signed_semester + 15)

# Third, rank the attempts of each student at each course, best attempt = rank 1
# (ties are broken by order of appearance)
attempts = df.groupby(['STUDENT', 'CRSE'])['GRADE_RANK']
df['CRSE_RANK'] = attempts.rank(method='first', ascending=False)

# Rows with a 'CRSE_RANK' above 1 are superseded attempts and are removed
is_superseded = df['CRSE_RANK'] > 1
df = df[~is_superseded]

In [33]:
# Students/rows check - removing repeated attempts should not change the number of students
print(df["STUDENT"].nunique(), len(df), sep="\n")

9018
18075


In [34]:
# Confirm that no (student, course) pair appears more than once; expected result is 0
dup_check = df.loc[:, ['STUDENT', 'CRSE']]
dup_check.duplicated().sum()

np.int64(0)

In [35]:
# CRSE_SEM, GRADE_RANK and CRSE_RANK are no longer needed; drop them in one call
rank_remove = ['CRSE_SEM', 'GRADE_RANK', 'CRSE_RANK']
df = df.drop(columns=rank_remove)

In [36]:
# Final cleaning task: remove courses with "low" enrollment.
# Count how many times each course appears (value_counts sorts in descending order),
# then show the cumulative share of rows covered, i.e. cumsum divided by the total.
crse_value = df['CRSE'].value_counts()
crse_value.cumsum().div(crse_value.sum())

CRSE
165    0.134108
166    0.249959
265    0.337815
143    0.413721
140    0.485588
104    0.552476
150    0.611286
207    0.659806
201    0.698976
317    0.735657
267    0.770899
105    0.799834
414    0.826445
301    0.852559
266    0.875242
160    0.894827
385    0.907386
435    0.919281
436    0.930124
314    0.939917
304    0.947441
373    0.954689
350    0.961715
151    0.968022
415    0.973942
365    0.979751
302    0.984509
145    0.989046
240    0.992365
481    0.994855
407    0.996846
424    0.998230
441    0.998838
442    0.999336
422    0.999613
423    0.999834
469    1.000000
Name: count, dtype: float64

In [37]:
# In light of the previous cell, we cut every course listed after 145 in that table,
# because the courses up to and including 145 already account for about 99% of the rows.
# A single boolean mask replaces one index-label drop per course number: the frame is
# scanned once and the result stays correct even if the index is not unique.
final_cut = ['240','481','407', '424', '441', '442', '422', '423', '469']
df = df[~df['CRSE'].isin(final_cut)]

In [38]:
# Students/rows remaining after removing the low-enrollment courses
print(df["STUDENT"].nunique(), len(df), sep="\n")

9014
17877


In [39]:
# Add one column per course. On a row for that course the column holds the row's
# 'GRADE_CATGORY' value (already the signed semester value computed earlier; no
# rounding is applied here); on every other row it holds 0 (course not taken).
courses = sorted(df['CRSE'].unique().tolist())
for course in courses:
    is_this_course = df['CRSE'] == course
    df[course] = np.where(is_this_course, df['GRADE_CATGORY'], 0)

In [40]:
# CRSE and GRADE_CATGORY are now encoded in the per-course columns; drop them in one call
crse_info_remove = ['CRSE', 'GRADE_CATGORY']
df = df.drop(columns=crse_info_remove)

In [41]:
# We now group by student, keeping the first row of any non-numeric (e.g. str) column and
# choosing the value that is largest in absolute value for numeric columns

def custom_agg(group):
    """Collapse all rows of one student into a single row.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to one student.

    Returns
    -------
    pandas.Series
        One entry per column of `group`:
        - non-numeric columns: the value in the first row;
        - numeric columns: the non-missing value with the largest absolute value
          (the first such value on ties), or NaN if every value is missing.
    """
    result = {}
    for col in group.columns:
        column = group[col]
        # Decide by "is numeric" rather than "is object", so that string columns stored
        # with a dedicated string dtype are not sent through abs().
        if not pd.api.types.is_numeric_dtype(column):
            result[col] = column.iloc[0]
            continue
        # Missing values must be removed first: comparisons with NaN are always False,
        # so max(..., key=abs) would otherwise depend on where the NaN sits in the group.
        observed = column.dropna()
        result[col] = max(observed, key=abs) if len(observed) else np.nan
    return pd.Series(result)

# Collapse the frame to one row per student (the result is indexed by STUDENT)
rows_by_student = df.groupby('STUDENT')
df = rows_by_student.apply(custom_agg, include_groups=False)

In [42]:
# Final step - recode the 'MAJOR' column so it is readable.
# The mapping below sends each major code to a display name; several codes may
# share the same display name.
# Fixed misspelled labels: 'Chemisty' -> 'Chemistry',
# 'Comunication Studies' -> 'Communication Studies'.

maj_recode = {
    'A E' : 'Agricultural Engineering',
    'A ECL' : 'Animal Ecology',
    'A M D' : 'Apparel, Merchandising, and Design',
    'A TR': 'Athletic Training',
    'ACCT' : 'Accounting',
    'ACSCI' : 'Actuarial Science',
    'ADVRT' : 'Advertising',
    'AER E': 'Aerospace Engineering',
    'AG B' : 'Agricultural Business',
    'AG ST' : 'Agricultural Studies',
    'AGBIO' : 'Agricultural Biochemistry',
    'AGLSE' : 'Agricultural and Life Sciences Education',
    'AGRON' : 'Agronomy',
    'AGRPS' : 'Agricultural and Rural Policy Studies',
    'AN S' : 'Animal Science',
    'ANTHR' : 'Anthropology',
    'ARC' : 'Architecture',
    'ARTDN' : 'Art and Design',
    'ARTGR' : 'Graphic Design',
    'ARTID' : 'Interior Design',
    'ARTIS' : 'Integrated Studio Arts',
    'AST' : 'Agricultural Systems Technology',
    'BBA' : 'Business Administration',
    'BCBIO' : 'Bioinformatics and Computational Biology',
    'BIOCA' : 'Biochemistry',
    'BIOCH' : 'Biochemistry',
    'BIOL' : 'Biology',
    'BIOLA' : 'Biology',
    'BIOPH' : 'Biophysics',
    'BPM I' : 'Biological/Medical Illustration',
    'BSE' : 'Biological Systems Engineering',
    'BU AN' : 'Business Analytics',
    'BUS U' : 'Business',
    'BUSEC' : 'Business Economics',
    'C E' : 'Civil Engineering',
    'C J' : 'Criminal Justice',
    'C R P' : 'Community & Regional Planning',
    'CH E' : 'Chemical Engineering',
    'CH FS' : 'Child, Adult and Family Services',
    'CHEM' : 'Chemistry',
    'CJ ST' : 'Criminal Justice Studies',
    'CLSCI' : 'Climate Science',
    'COM S' : 'Computer Science',
    'COMST' : 'Communication Studies',
    'CON E' : 'Construction Engineering',
    'CPR E' : 'Computer Engineering',
    'CS H' : 'Computer Science',
    'CUFSA' : 'Culinary Food Science',
    'CUFSH' : 'Culinary Food Science',
    'CYB E' : 'Cyber Security Engineering',
    'CYBSC' : 'Cyber Security',
    'DIETA' : 'Dietetics',
    'DIETH' : 'Dietetics',
    'DS' : 'Data Science',
    'DSGN' : 'Undeclared',
    'E CPE' : 'Electrical and Computer Engineering',
    'E E' : 'Electrical Engineering',
    'EA SC' : 'Earth Science',
    'ECE' : 'Early Childhood Education',
    'ECON' : 'Economics',
    'EL ED' : 'Elementary Education',
    'ENGL' : 'English',
    'ENGR' : 'Engineering',
    'ENGRS' : 'Engineering',
    'ENSCA' : 'Environmental Science',
    'ENSCS' : 'Environmental Science',
    'ENTSP' : 'Entrepreneurship',
    'EVENT' : 'Event Management',
    'F C P' : 'Financial Counseling and Planning',
    'FCEDS' : 'Family and Consumer Sciences',
    'FIN' : 'Finance',
    'FOR' : 'Forestry',
    'FS A' : 'Food Science',
    'FS H' : 'Food Science',
    'GE AT' : 'Geological and Atmospheric Sciences',
    'GEN' : 'Genetics',
    'GEN S' : 'Genetics',
    'GENPV' : 'General Preveterinary Medicine',
    'GEOL' : 'Geology',
    'GLOBE' : 'Global Resource Systems',
    'HD FS' : 'Human Development and Family Studies',
    'HIST' : 'History',
    'HORT' : 'Horticulture',
    'HRM' : 'Human Resource Management',
    'HSP M' : 'Hospitality Management',
    'I DES' : 'Interdisciplinary Design',
    'I E' : 'Industrial Engineering',
    'I TEC' : 'Industrial Technology',
    'IND D' : 'Industrial Design',
    'INDIS' : 'Interdisciplinary Studies',
    'ISBA' : 'Information Systems and Business Analytics',
    'JL MC' : 'Journalism and Mass Communication',
    'KIN H' : 'Kinesiology and Health',
    'L A' : 'Landscape Architecture',
    'L ST' : 'Liberal Arts and Sciences',
    'LAS' : 'Liberal Arts and Sciences',
    'LAS S' : 'Liberal Arts and Sciences',
    'LING' : 'Linguistics',
    'M E' : 'Mechanical Engineering',
    'MAT E' : 'Materials Engineering',
    'MATH' : 'Mathematics',
    'MGMT' : 'Management',
    'MICR' : 'Microbiology',
    'MIS' : 'Management Information Systems',
    'MKT' : 'Marketing',
    'MTEOR' : 'Meteorology',
    'MU BA' : 'Music',
    'MU BM' : 'Music',
    'NRS A' : 'Nursing',
    'NRS H' : 'Nursing',
    'NS A' : 'Nutritional Science',
    'NS H' : 'Nutritional Science',
    'OPEN' : 'Open Option',
    'P ARC' : 'Architecture',
    'P ATR' : 'Athletic Training',
    'P BUS' : 'Business',
    'P CS' : 'Computer Science',
    'P DTH' : 'Dietetics',
    'P GR' : 'Graphic Design',
    'P H P' : 'Preprofessional Health Programs',
    'P ID' : 'Interior Design',
    'P IND' : 'Industrial Design',
    'P LAW' : 'Preparation for Law',
    'P LST' : 'Liberal Studies',
    'P MED' : 'Preparation for Human Medicine',
    'P R' : 'Public Relations',
    'PBPMI' : 'Biological/Medical Illustration',
    'PERF' : 'Performing Arts',
    'PHIL' : 'Philosophy',
    'PHYS' : 'Physics',
    'POL S' : 'Political Science',
    'PSYCH' : 'Psychology',
    'RELIG' : 'Religious Studies',
    'S E' : 'Systems Engineering',
    'SCM' : 'Supply Chain Management',
    'SOC' : 'Sociology',
    'SP CM' : 'Speech Communication',
    'STAT' : 'Statistics',
    'TCOMM' : 'Technical Communication',
    'UNDEC' : 'Undeclared',
    'V M' : 'Veterinary Medicine',
    'W S' : 'Women’s and Gender Studies',
    'WGS' : 'Women’s and Gender Studies',
    'WLC' : 'World Languages and Cultures',
}
# Replace each major code by its display name (codes missing from maj_recode become NaN)
major_codes = df['MAJOR']
df['MAJOR'] = major_codes.map(maj_recode)

In [43]:
# Write the full cleaned dataset to CSV (the STUDENT index is written as the first column)
df.to_csv(path_or_buf='full_dataset.csv')

In [44]:
# Next we create three more dataframes:
#   df_gen_crse - only the 'general population' course columns are kept
#   df_maj_crse - only the 'math major' course columns are kept
#   df_math     - only the rows of students whose MAJOR is 'Mathematics'
gen_math_crse = [
    '104', '105', '140', '143', '145', '150', '151',
    '160', '165', '166', '207', '265', '266', '267',
]
math_maj_crse = [
    '201', '301', '302', '304', '314', '317', '350',
    '365', '373', '385', '414', '415', '435', '436',
]

df_gen_crse = df.drop(math_maj_crse, axis=1)
df_maj_crse = df.drop(gen_math_crse, axis=1)
df_math = df.loc[df['MAJOR'].eq('Mathematics')]

In [45]:
# Remove from df_gen_crse the students without any grade information in the general
# courses: the sum of absolute values over those columns is 0 exactly when all are 0
gen_activity = df[gen_math_crse].abs().sum(axis=1)
df_gen_crse = df_gen_crse[gen_activity != 0]

# Same for df_maj_crse, using the math major courses
maj_activity = df[math_maj_crse].abs().sum(axis=1)
df_maj_crse = df_maj_crse[maj_activity != 0]

In [46]:
# Write df_gen_crse and df_maj_crse to their CSV files
for frame, csv_path in ((df_gen_crse, 'gen_dataset.csv'), (df_maj_crse, 'maj_dataset.csv')):
    frame.to_csv(csv_path)

In [47]:
# From df_math, drop the courses that are not requirements for the major, together
# with the MAJOR column (it is constant in this frame and therefore not needed)
math_not_needed = ['MAJOR', '104', '105', '140', '143', '145', '150', '151', '160', '207']
df_math = df_math.drop(math_not_needed, axis=1)

# Save df_math
df_math.to_csv('math_dataset.csv')