In [1]:
# Import the usual
import pandas as pd
import numpy as np

In [2]:
# Read in the new raw dataset
df = pd.read_csv('data_set_11_15.csv')

In [3]:
# Get list of columns
# df.columns.tolist()

In [4]:
# Remove unneeded columns
column_remove =['MAJOR_COLL', 'CLSFN_YR', 'LEVEL', 'OFFER_DEPT_ABRVN', 'SECT', 'CRSE_TITLE', 'DEG_DATEDB', 'DEG_DESCR', 
                'LAST_RGST_TERM','CLSFN_YR.1', 'MAJOR_COLL.1', 'Workday Enrolled in Fall 2024', 
                'Workday Enrolled in Fall 2024 Class Standing', 'Workday Enrolled in Fall 2024 Primary Program of Study']
for x in column_remove:
    df = df.drop(x,axis=1)

In [5]:
# Recode semesters as follows: Spring = 0.1, Summer = 0.2, Fall = 0.3
sem_list = ['ENTRY_SEM_CD', 'SEM_CD', 'SEM_CD.1']
sem_recode = {'S': 0.1, '1': 0.2, 'F': 0.3}
for x in sem_list:
    df[x] = df[x].map(pd.Series(sem_recode))

In [6]:
# Create new columns to assign unique numbers to entry term, term course was taken, and graduation term (if applicable)
# Number will be XX.Y where XX is the last two digits of the year and Y is the semester
df['ENT_SEM'] = (df['ENTRY_CCYY'] % 1000) + df['ENTRY_SEM_CD']
df['CRSE_SEM'] = (df['SEM_CCYY'] % 1000) + df['SEM_CD']
df['GRAD_SEM'] = (df['SEM_CCYY.1'] % 1000) + df['SEM_CD.1']

In [7]:
# Remove the year and semester columns after recoding
year_sem_remove = ['ENTRY_CCYY','ENTRY_SEM_CD','SEM_CCYY','SEM_CD','SEM_CCYY.1','SEM_CD.1']
for x in year_sem_remove:
    df = df.drop(x,axis=1)

In [8]:
# Get list of grade types
# df['GRADE_CATGORY'].unique()

In [9]:
# Recode grades as follows: C- or below = -1, Above C- and Satisfactory/Pass = 1
grade_recode = {'C- OR BELOW':-1,'ABOVE C-':1,'Satisfactory/Pass':1}
df['GRADE_CATGORY'] = df['GRADE_CATGORY'].map(pd.Series(grade_recode))

In [10]:
# Get list of degrees
#df['DEG_CD'].unique()

In [11]:
# Get the initial number of students
print(df["STUDENT"].nunique())

13065


In [12]:
# Remove any graduate degress
degree_remove = ['MS', 'MED', 'MAT', 'MEN', 'MFN', 'MBS', 'PHD', 'MFS', 'MHC', 'MAA', 'MBA', 'MFA', 'MA', 'MTR', 'MHO', 'DVM', 'MCP','MRE']
for x in degree_remove:
    df = df.drop(df[df['DEG_CD'] == x].index)

In [13]:
# Number of students check
print(df["STUDENT"].nunique())

13052


In [14]:
# Remove the degree column as we no longer need it
df = df.drop('DEG_CD',axis=1)

In [15]:
# Remove students that enrolled before Summer 2010
df = df[ df['ENT_SEM'] >= 10.2] 

In [16]:
# Number of students check
print(df["STUDENT"].nunique())

12973


In [17]:
# Remove students that enrolled after Fall 2020
df = df[ (df['ENT_SEM'] < 21)] 

In [18]:
# Number of students check
print(df["STUDENT"].nunique())

9958


In [19]:
# Remove students that graduated before 2014
df = df[(df['GRAD_SEM'] >= 14.1) | (df['GRAD_SEM'].isnull())]

In [20]:
# Number of students check
print(df["STUDENT"].nunique())

9955


In [21]:
# Remove courses taken after graduating
df = df[  ( df['CRSE_SEM'] <= df['GRAD_SEM']  ) |  (df['GRAD_SEM'].isnull())  ] 

In [22]:
# Number of students/rows check
print(df["STUDENT"].nunique())
print(len(df))

9935
23340


In [23]:
# Sanity check - make sure courses are taken after enrollment and enrollment occurs before graduation
df = df[  ( df['ENT_SEM'] <= df['CRSE_SEM']  )  & (  (df['ENT_SEM'] <= df['GRAD_SEM']) |   (df['GRAD_SEM'].isnull())   )  ]

In [24]:
# Number of students/rows check - nothing should change
print(df["STUDENT"].nunique())
print(len(df))

9935
23340


In [25]:
# Next we want to remove courses taken above the ``at most 8 full semesters attended'' window 
# and students that graduated below the ``at least 6 full semesters attended'' window
# To accurately guage this, we first recode ENT_SEM, CRSE_SEM, and GRAD_SEM

ent_recode = {10.2 : 1, 10.3: 1,
                11.1 : 2, 11.2 : 3, 11.3 : 3, 
                12.1 : 4, 12.2 : 5, 12.3 : 5,
                13.1 : 6, 13.2 : 7, 13.3 : 7,
                14.1 : 8, 14.2 : 9, 14.3 : 9, 
                15.1 : 10, 15.2 : 11, 15.3 : 11,
                16.1 : 12, 16.2 : 13, 16.3 : 13,
                17.1 : 14, 17.2 : 15, 17.3 : 15,
                18.1 : 16, 18.2 : 17, 18.3 : 17,
                19.1 : 18, 19.2 : 19, 19.3 : 19,
                20.1 : 20, 20.2 : 21, 20.3: 21
               }

crse_recode =  {14.1 : 9, 14.2 : 9.5, 14.3 : 10, 
                15.1 : 11, 15.2 : 11.5, 15.3 : 12,
                16.1 : 13, 16.2 : 13.5, 16.3 : 14,
                17.1 : 15, 17.2 : 15.5, 17.3 : 16,
                18.1 : 17, 18.2 : 17.5, 18.3 : 18,
                19.1 : 19, 19.2 : 19.5, 19.3 : 20,
                20.1 : 21, 20.2 : 21.5, 20.3: 22,
                21.1 : 23, 21.2 : 23.5, 21.3 : 24,
                22.1 : 25, 22.2 : 25.5, 22.3 : 26,
                23.1 : 27, 23.2 : 27.5, 23.3 : 28,
                24.1 : 29, 24.2 : 29.5
               }

grad_recode = {14.1 : 9, 14.2 : 10, 14.3 : 10, 
                15.1 : 11, 15.2 : 12, 15.3 : 12,
                16.1 : 13, 16.2 : 14, 16.3 : 14,
                17.1 : 15, 17.2 : 16, 17.3 : 16,
                18.1 : 17, 18.2 : 18, 18.3 : 18,
                19.1 : 19, 19.2 : 20, 19.3 : 20,
                20.1 : 21, 20.2 : 22, 20.3: 22,
                21.1 : 23, 21.2 : 24, 21.3 : 24,
                22.1 : 25, 22.2 : 26, 22.3 : 26,
                23.1 : 27, 23.2 : 28, 23.3 : 28,
                24.1 : 29, 24.2 : 30
               }

# There was probably a clever way to do this, but whatever it works

df['ENT_SEM'] = df['ENT_SEM'].map(pd.Series(ent_recode))
df['CRSE_SEM'] = df['CRSE_SEM'].map(pd.Series(crse_recode))
df['GRAD_SEM'] = df['GRAD_SEM'].map(pd.Series(grad_recode))

# Update the course semester and the grad semester to now be relative to the entry semester
df['CRSE_SEM'] = df['CRSE_SEM'] - df['ENT_SEM']
df['GRAD_SEM'] = df['GRAD_SEM'] - df['ENT_SEM']

# Remove courses taken after 8 full semesters
df = df[ df['CRSE_SEM'] < 8.1  ]

# Remove students that graduated before 6 full semesters
df = df [ (df['GRAD_SEM'] >= 6) | (df['GRAD_SEM'].isnull()) ]

In [26]:
# Number of students/rows check
print(df["STUDENT"].nunique())
print(len(df))

9336
21494


In [27]:
# Remove the ENT_SEM column as it is no longer needed
df.drop('ENT_SEM',axis=1,inplace=True)

Unnamed: 0,STUDENT,MAJOR_CURR,CRSE,GRADE_CATGORY,MAJOR_CURR.1,MAJOR_CURR.2,CRSE_SEM,GRAD_SEM
0,0,CHEM,265,-1,CHEM,CHEM,3.0,8.0
5,3,LAS S,166,1,,LAS S,0.5,
6,4,COM S,165,1,COM S,COM S,1.0,8.0
7,4,COM S,166,1,COM S,COM S,2.0,8.0
8,4,COM S,207,1,COM S,COM S,4.0,8.0
...,...,...,...,...,...,...,...,...
30370,13060,PSYCH,140,-1,ENGL,HIST,1.0,10.0
30371,13060,PSYCH,150,-1,ENGL,HIST,2.5,10.0
30372,13061,PSYCH,150,1,,PSYCH,2.0,
30375,13063,ADVRT,151,1,MKT,MKT,5.0,9.0


In [28]:
# Next goal is to remove the repeated information due to students with multiple degrees
# First, create a column that indicates the students with multiple degrees
df['NUM_DEG'] = df.groupby('STUDENT')['MAJOR_CURR.1'].transform('nunique')

# Second, we flag those students where MAJOR_CURR.2 matches some entry in MAJOR_CURR.1
df['DEG_FLAG'] = df.groupby('STUDENT',group_keys=False).apply(lambda x: x['MAJOR_CURR.2'].isin(x['MAJOR_CURR.1']),include_groups=False)

# Next, we will create a column that either lists the `first' major/degree of the student
# or has a 0 if the student has multiple degrees and the row is information regarding the second/third/fourth

def major_detect(row):
    if (row['NUM_DEG'] == 0):
        val = row['MAJOR_CURR.2']
    elif (row['NUM_DEG'] == 1):
        val = row['MAJOR_CURR.1']
    elif (row['MAJOR_CURR.2'] == row['MAJOR_CURR.1']):
        val = row['MAJOR_CURR.1']
    elif (row['DEG_FLAG'] == False) & (row['MAJOR_CURR'] == row['MAJOR_CURR.1']):
        val = row['MAJOR_CURR.1']
    else:
        val = 0
    return val

df['MD'] = df.apply(major_detect,axis=1)

# Those with a 0 in 'MD' are safe for removal
df = df.drop(df[df['MD'] == 0].index)

In [29]:
# Number of students/rows check - note number of students should not have changed
print(df["STUDENT"].nunique())
print(len(df))

9336
20231


In [31]:
# Remove the columns MAJOR_CURR, MAJOR_CURR.1, MAJOR_CURR.2, NUM_DEG, and DEG_FLAG as they are no longer needed
deg_info_remove = ['MAJOR_CURR','MAJOR_CURR.1','MAJOR_CURR.2','NUM_DEG','DEG_FLAG']
for x in deg_info_remove:
    df = df.drop(x,axis=1)

In [32]:
# Get list of courses
# crse_list = df['CRSE'].unique()
# np.sort(crse_list)

In [33]:
# Remove courses that are not in the current catalog, have variable content, are not `content courses',
# are not math department courses, are supplemental courses, and are graduate courses.
course_remove = ['25', '30', 
                 '101', '106', '106X', '139X', '181', '182', '195', '196', 
                 '202', '202X', '241X', '268', '290', '297', 
                 '331', '341', '342', '392X', '397', '398', 
                 '408X', '421', '439', '474', '474X', '490', '490H', '491', '492', '495', '495X', '497', 
                 '501', '502', '503X', '504', '505', '506X', '507', '510', '511', '515',  '516', '518X', 
                 '519', '520', '525', '533', '535', '545', '554', '561', '562', '565', '566', '567', '581', 
                 '601', '603X', '605', '608', '610', '617', '618', '619X', '624', '631', '633', '642', '656', '666']
for x in course_remove:
    df = df.drop(df[df['CRSE'] == x].index)

In [34]:
# Number of students check
print(df["STUDENT"].nunique())

9006


In [35]:
# Remove X (experimental) and H (honors section)
df['CRSE'] = df['CRSE'].str.replace('X', '')
df['CRSE'] = df['CRSE'].str.replace('H', '')

In [36]:
# Relabel 403 as 302 (abstract algebra 2) and 142 as 145 (trigonmetry)
df['CRSE'] = df['CRSE'].replace('403', '302')
df['CRSE'] = df['CRSE'].replace('142', '145')

In [37]:
# Check that there are no more duplicated rows
dup_check = df[['STUDENT','CRSE', 'CRSE_SEM']]
dup_check.duplicated().sum()

np.int64(0)

In [38]:
# Final cleaning task is to remove courses with ``low'' enrollment
# We will determine this by getting a count of the number of times a course appears
# Put it in decesending order, then divide the cumsum by the sum
crse_value = df['CRSE'].value_counts()
crse_value.cumsum()/crse_value.sum()

CRSE
165    0.154496
166    0.283279
265    0.373191
143    0.459046
140    0.527906
104    0.585746
150    0.641996
207    0.690241
201    0.728838
267    0.764748
317    0.798958
105    0.826480
301    0.849671
266    0.871491
414    0.893037
160    0.911952
385    0.923300
435    0.933224
436    0.942599
314    0.950987
304    0.956963
350    0.962829
373    0.968586
151    0.973629
365    0.978399
415    0.983114
145    0.987774
302    0.991667
240    0.994353
481    0.996327
407    0.997478
424    0.998575
441    0.999013
442    0.999342
469    0.999561
422    0.999781
423    1.000000
Name: count, dtype: float64

In [39]:
# In light of the previous cell, we will cut out every course above 302 because at that point we have accounted for 99% of the data
final_cut = ['240','481','407', '424', '441', '442', '469', '422','423']
for x in final_cut:
    df = df.drop(df[df['CRSE'] == x].index)

In [40]:
# Final number of students check
print(df["STUDENT"].nunique())

9002


In [41]:
# We add one column for each course and enter either a 0 (if course is not taken) or 
# the value of 'GRADE_CATGORY' * 'CRSE_SEM' (this will range from -8 to 8)
courses = df['CRSE'].unique().tolist()
courses.sort()
for x in courses:
    df[x] = np.where(df['CRSE'] == x, df['GRADE_CATGORY'] * df['CRSE_SEM'], 0) 

In [45]:
# Remove the CRSE, GRADE_CATGORY, and CRSE_SEM columns as they are no longer needed
crse_info_remove = ['CRSE','GRADE_CATGORY', 'CRSE_SEM']
for x in crse_info_remove:
    df = df.drop(x,axis=1)

In [47]:
# We add the final column indicated if a student gradudated within 8 full semesters (0 = no, 1 = yes)
df['GRAD'] = np.where(df['GRAD_SEM'] <= 8, 1, 0)

Unnamed: 0,STUDENT,GRAD_SEM,MD,104,105,140,143,145,150,151,...,317,350,365,373,385,414,415,435,436,GRAD
0,0,8.0,CHEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,3,,LAS S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,4,8.0,COM S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,4,8.0,COM S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,4,8.0,COM S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30370,13060,10.0,ENGL,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
30371,13060,10.0,ENGL,0.0,0.0,0.0,0.0,0.0,-2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
30372,13061,,PSYCH,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
30375,13063,9.0,MKT,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [48]:
# Remove the GRAD_SEM colum as it is no longer needed
df.drop('GRAD_SEM',axis=1,inplace=True)

In [52]:
# We now group by student, choosing the max value in each column
df = df.groupby('STUDENT').agg({
    'value_str': 'first',
    'value_int': abs().idmax() 
}).reset_index()

TypeError: abs() takes exactly one argument (0 given)

In [43]:
df.to_csv('lee_final_dataset_11_16.csv')