In [7]:
import pandas as pd
import re

# Read the cleaned course table from disk into a DataFrame
df = pd.read_csv('df_clean.csv')

# Report which columns were loaded and how many rows the table holds
print(list(df.columns))
print(df.shape[0])

# Preview the first five rows using the notebook's rich table rendering
display(df.head(5))

['url', 'title', 'volume', 'education', 'content', 'learning_outcome', 'literature', 'recommended_prereq', 'teaching_methods', 'workload', 'feedback_form', 'signup', 'exam_html', 'language', 'course_code', 'ects', 'level', 'duration', 'placement', 'schedule', 'capacity', 'study_board', 'department', 'faculty', 'course_coordinators', 'last_modified', 'term']
792


Unnamed: 0,url,title,volume,education,content,learning_outcome,literature,recommended_prereq,teaching_methods,workload,...,duration,placement,schedule,capacity,study_board,department,faculty,course_coordinators,last_modified,term
0,https://kurser.ku.dk/course/ndak15006u/2024-2025,NDAK15006U Advanced Computer Systems (ACS),Volume 2024/2025,MSc Programme in Computer Science,The overall purpose of this course is to offer...,Knowledge to Describe the design of transactio...,See Absalon when the course is set up.,- Basic principles of operating systems and/or...,Lectures and exercise sessions.,"{'Lectures': 42, 'Theory exercises': 70, 'Proj...",...,1 block,Block 2,A,No limitation – unless you register in the lat...,['Study Board of Mathematics and Computer Scie...,['Department of Computer Science'],['Faculty of Science'],['Panagiotis Karras (4-7566706645696e33707a336...,Saved on the 05-07-2024,Volume 2024/2025_Block 2
1,https://kurser.ku.dk/course/ndaa09023u/2024-2025,NDAA09023U Advanced Algorithms and Data Struct...,Volume 2024/2025,MSc Programme in Computer Science MSc Programm...,Algorithms is about finding scalable solutions...,Knowledge of Graph algorithms such as max flow...,See Absalon when the course is set up.,It is assumed that the students are familiar w...,A mix of lectures and exercises.,"{'Lectures': 36, 'Preparation': 82, 'Theory ex...",...,1 block,Block 2,C,No limitation – unless you register in the lat...,['Study Board of Mathematics and Computer Scie...,['Department of Computer Science'],['Faculty of Science'],['Amir Yehudayoff (4-6975816d486c7136737d366c7...,Saved on the 20-01-2025,Volume 2024/2025_Block 2
2,https://kurser.ku.dk/course/ndaa09023u/2025-2026,NDAA09023U Advanced Algorithms and Data Struct...,Volume 2025/2026,MSc Programme in Computer Science MSc Programm...,Algorithms is about finding scalable solutions...,Knowledge of Graph algorithms such as max flow...,See Absalon when the course is set up.,It is assumed that the students are familiar w...,A mix of lectures and exercises.,"{'Lectures': 36, 'Preparation': 82, 'Theory ex...",...,1 block,Block 2,C,No limitation – unless you register in the lat...,['Study Board of Mathematics and Computer Scie...,['Department of Computer Science'],['Faculty of Science'],['Jacob Holm (4-6d646b7243676c316e7831676e)'],Saved on the 24-02-2025,Volume 2025/2026_Block 2
3,https://kurser.ku.dk/course/ndak15006u/2025-2026,NDAK15006U Advanced Computer Systems (ACS),Volume 2025/2026,MSc Programme in Computer Science,The overall purpose of this course is to offer...,Knowledge to Describe the design of transactio...,See Absalon when the course is set up.,- Basic principles of operating systems and/or...,Lectures and exercise sessions.,"{'Lectures': 42, 'Theory exercises': 70, 'Proj...",...,1 block,Block 2,A,No limitation – unless you register in the lat...,['Study Board of Mathematics and Computer Scie...,['Department of Computer Science'],['Faculty of Science'],['Panagiotis Karras (4-72636d6342666b306d77306...,Saved on the 24-02-2025,Volume 2025/2026_Block 2
4,https://kurser.ku.dk/course/ndaa09013u/2025-2026,NDAA09013U Advanced Programming (AP),Volume 2025/2026,MSc Programme in Computer Science,The purpose of this course is to provide pract...,"At course completion, the successful student w...",See Absalon when the course is set up.,Programming ability in at least two substantia...,"Lectures, mandatory assignments, exercise labs.","{'Lectures': 29, 'Preparation': 124, 'Practica...",...,1 block,Block 1,A,No limitation – unless you register in the lat...,['Study Board of Mathematics and Computer Scie...,['Department of Computer Science'],['Faculty of Science'],['Troels Henriksen (5-71847871835074793e7b853e...,Saved on the 01-05-2025,Volume 2025/2026_Block 1


In [8]:
def split_coordinator(row):
    match = re.match(r"\['(.*)\s\((.*)\)'\]", str(row))
    if match:
        return pd.Series({'course_coordinator_name': match.group(1), 'course_coordinator_id': match.group(2)})
    else:
        return pd.Series({'course_coordinator_name': None, 'course_coordinator_id': None})

# Run the parser over every coordinator cell; the parsed pieces land in split_df
split_df = df['course_coordinators'].apply(split_coordinator)

# Copy both parsed columns onto df, name first and id second
for coordinator_column in ('course_coordinator_name', 'course_coordinator_id'):
    df[coordinator_column] = split_df[coordinator_column]

# Preview the two derived columns, then write the enriched table to disk
display(df.loc[:, ['course_coordinator_name', 'course_coordinator_id']].head())
df.to_csv('coordinator_split.csv', index=False)

Unnamed: 0,course_coordinator_name,course_coordinator_id
0,Panagiotis Karras,4-7566706645696e33707a336970
1,Amir Yehudayoff,4-6975816d486c7136737d366c73
2,Jacob Holm,4-6d646b7243676c316e7831676e
3,Panagiotis Karras,4-72636d6342666b306d7730666d
4,Troels Henriksen,5-71847871835074793e7b853e747b


In [9]:
def unique_code_kuc_all_columns(df, code_column='course_code'):
    """
    Return one row per distinct course code, keeping every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column named by ``code_column``.
    code_column : str, default 'course_code'
        Column whose values identify a course. Rows are considered
        duplicates when they share the same value in this column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the first occurrence of each code, in the
        original row order, with the index renumbered from 0. The input
        frame is not modified.

    Raises
    ------
    KeyError
        If ``code_column`` is not a column of ``df``.
    """
    # keep='first' is the pandas default; spelled out because which duplicate
    # survives determines the values shown for every other column.
    deduplicated = df.drop_duplicates(subset=[code_column], keep='first')
    return deduplicated.reset_index(drop=True)

# Build the one-row-per-course table, show the coordinator names it keeps,
# and check the Python type of the first name.
unique_df = unique_code_kuc_all_columns(df)
display(unique_df['course_coordinator_name'])
print(type(unique_df.loc[0, 'course_coordinator_name']))

0                                     Panagiotis Karras
1                                       Amir Yehudayoff
2                                      Troels Henriksen
3                                         Stefan Sommer
4                               Joanna Emilia Bergström
                            ...                        
68                                  Daniel Lee Ashbrook
69                              Joanna Emilia Bergström
70    Serge Belongie (10-763165686f72716a6c6843676c3...
71                                        Maria Maistro
72                                       Kasper Hornbæk
Name: course_coordinator_name, Length: 73, dtype: object

<class 'str'>


In [10]:
# Write the de-duplicated course table to CSV, leaving out the DataFrame index
unique_df.to_csv('unique_courses.csv', index=False)

In [11]:
# Print the character length of the longest 'content' entry in unique_df
print(unique_df['content'].str.len().max())

3231
