In [14]:
import spacy
import re
import pandas as pd

# Load pre-trained SpaCy NER model
nlp = spacy.load("en_core_web_md")

# Sample data
data = {'Course Code': ['MECH 128', 'CSE 101', 'MATH 402', 'BIO 50A', 'ENG 99'],
        'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
        'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
df = pd.DataFrame(data)

# Function to parse course code using enhanced logic
def parse_course_code(course_code):
    if not course_code:
        return None, None, 'Unknown'
    
    doc = nlp(course_code)
    dept, number = None, None

    # Identify department and course number using dependency parsing
    for token in doc:
        if token.dep_ == "ROOT" and re.match(r"[A-Z]+", token.text):
            dept = token.text
        elif token.dep_ == "compound" and re.match(r"[A-Z]+", token.text):
            dept = token.text
        elif token.text.isdigit() or (token.text[:-1].isdigit() and token.text[-1].isalpha()):
            number = token.text
    
    # Fallback using regex for no spaces and hyphen-separated codes
    if not dept or not number:
        match = re.match(r"([A-Z]+)-?(\d+[A-Za-z]?)", course_code)
        if match:
            dept, number = match.groups()
    
    # Extract numeric part for level determination
    if number:
        try:
            numeric_part = int(''.join(filter(str.isdigit, number)))
            number = numeric_part
        except ValueError:
            number = None

    if number:
        if 100 <= number < 200:
            level = 'Introductory'
        elif 200 <= number < 300:
            level = 'Intermediate'
        elif 300 <= number < 400:
            level = 'Advanced'
        else:
            level = 'Graduate'
    else:
        level = 'Unknown'
    
    return dept, number, level

# Apply parsing to dataframe
df[['Department', 'Course Number', 'Level']] = df['Course Code'].apply(parse_course_code).apply(pd.Series)
print(df)

  Course Code Grades Credits Department  Course Number         Level
0    MECH 128     A+     3.0       MECH            128  Introductory
1     CSE 101     98     4.0        CSE            101  Introductory
2    MATH 402    4.3     3.0       MATH            402      Graduate
3     BIO 50A     B-     1.5        BIO             50      Graduate
4      ENG 99     C+     2.0        ENG             99      Graduate


In [15]:
# Define test cases
test_cases = [
    {'description': 'Standard Course Codes', 'data': [
        {'Course Code': ['MECH 128', 'CSE 101', 'MATH 402', 'BIO 50A', 'ENG 99'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    {'description': 'No Spaces in Course Codes', 'data': [
        {'Course Code': ['MECH128', 'CSE101', 'MATH402', 'BIO50A', 'ENG99'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    {'description': 'Hyphen-separated Course Codes', 'data': [
        {'Course Code': ['MECH-128', 'CSE-101', 'MATH-402', 'BIO-50A', 'ENG-99'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    {'description': 'Reversed Course Code Order', 'data': [
        {'Course Code': ['128 MECH', '101 CSE', '402 MATH', '50A BIO', '99 ENG'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    {'description': 'Mixed Course Codes and Titles', 'data': [
        {'Course Code': ['MECH 128 Mechanical Engineering', 'Introduction to Computing CSE 101', 
                         'Advanced Calculus MATH 402', 'BIO 50A Biology', 'ENG 99 English Literature'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    {'description': 'Course Titles Only', 'data': [
        {'Course Code': ['Mechanical Engineering', 'Introduction to Computing', 
                         'Advanced Calculus', 'Biology', 'English Literature'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    {'description': 'Mixed Tokens with Hyphens', 'data': [
        {'Course Code': ['MECH-128-Mechanical', 'CSE-101-Computing', 'MATH-402-Calculus', 
                         'BIO-50A-Biology', 'ENG-99-English'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    {'description': 'Mixed Alphanumeric Course Codes', 'data': [
        {'Course Code': ['MECH128A', 'CSE101B', 'MATH402C', 'BIO50D', 'ENG99E'],
         'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
         'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    ]},
    # {'description': 'Empty Course Code Entries', 'data': [
    #     {'Course Code': ['MECH 128', '', 'MATH 402', None, 'ENG 99'],
    #      'Grades': ['A+', '98', '4.3', 'B-', 'C+'],
    #      'Credits': ['3.0', '4.0', '3.0', '1.5', '2.0']}
    # ]}
]

# Run test cases
for test in test_cases:
    print(f"Test Case: {test['description']}")
    for data in test['data']:
        df = pd.DataFrame(data)
        df[['Department', 'Course Number', 'Level']] = df['Course Code'].apply(parse_course_code).apply(pd.Series)
        print(df)
        print()

Test Case: Standard Course Codes
  Course Code Grades Credits Department  Course Number         Level
0    MECH 128     A+     3.0       MECH            128  Introductory
1     CSE 101     98     4.0        CSE            101  Introductory
2    MATH 402    4.3     3.0       MATH            402      Graduate
3     BIO 50A     B-     1.5        BIO             50      Graduate
4      ENG 99     C+     2.0        ENG             99      Graduate

Test Case: No Spaces in Course Codes
  Course Code Grades Credits Department  Course Number         Level
0     MECH128     A+     3.0       MECH            128  Introductory
1      CSE101     98     4.0        CSE            101  Introductory
2     MATH402    4.3     3.0       MATH            402      Graduate
3      BIO50A     B-     1.5        BIO             50      Graduate
4       ENG99     C+     2.0        ENG             99      Graduate

Test Case: Hyphen-separated Course Codes
  Course Code Grades Credits Department  Course Number     

In [16]:
# Define new test cases
new_test_cases = [
    {'description': 'Standard Course Codes with Descriptions', 'data': [
        {'Course Code': ['CJ 341 Police Meth & Org', 'FSC 440 Drug Identification', 'FSC 440L Drug Id Lab', 
                         'FSC 442 Arson And Explosive', 'FSC 497CA Field Study', 'PHY 111 General Physics I', 
                         'PHY 111L Gen Phy I Lab'],
         'Grades': ['B', 'B', 'C', 'A', 'A', 'C', 'B'],
         'Credits': ['3.00', '3.00', '1.00', '3.00', '3.00', '3.00', '1.00']}
    ]},
    {'description': 'Course Codes with Special Characters', 'data': [
        {'Course Code': ['020ANGNI1 Analyse Générale', '020CHGNI1 Chimie Générale', '020GCCNI1 Le Génie Civil au Service de la Communauté',
                         '020MADNI1 Mathématiques Discrètes', '020MC1NI1 Mécanique 1', '020SPHNI1 Signaux Physiques'],
         'Grades': ['A', 'A-', 'B+', 'B', 'C', 'B+'],
         'Credits': ['6.0', '4.0', '4.0', '6.0', '6.0', '6.0']}
    ]},
    {'description': 'Alphanumeric Course Codes', 'data': [
        {'Course Code': ['ANT100Y1 Intro Anthropology', 'CSC108H1 Intro to Comp Prog', 'ECO100Y1 Intro Economics', 'MAT137Y1 Calculus!'],
         'Grades': ['IPR', 'A-', 'IPR', 'IPR'],
         'Credits': ['1.00', '0.50', '1.00', '1.00']}
    ]}
]

# Run new test cases
for test in new_test_cases:
    print(f"Test Case: {test['description']}")
    for data in test['data']:
        df = pd.DataFrame(data)
        df[['Department', 'Course Number', 'Level']] = df['Course Code'].apply(parse_course_code).apply(pd.Series)
        print(df)
        print()

Test Case: Standard Course Codes with Descriptions
                   Course Code Grades Credits      Department  Course Number  \
0     CJ 341 Police Meth & Org      B    3.00            Meth          341.0   
1  FSC 440 Drug Identification      B    3.00  Identification          440.0   
2         FSC 440L Drug Id Lab      C    1.00             Lab          440.0   
3  FSC 442 Arson And Explosive      A    3.00           Arson          442.0   
4        FSC 497CA Field Study      A    3.00           Study            NaN   
5    PHY 111 General Physics I      C    3.00               I          111.0   
6       PHY 111L Gen Phy I Lab      B    1.00             Lab          111.0   

          Level  
0      Advanced  
1      Graduate  
2      Graduate  
3      Graduate  
4       Unknown  
5  Introductory  
6  Introductory  

Test Case: Course Codes with Special Characters
                                         Course Code Grades Credits  \
0                         020ANGNI1 Analyse 