In [1]:
# Optional one-time setup: uncomment to install tabula-py (source of the `read_pdf` import in this notebook).
#%pip install tabula-py

In [2]:
import pandas as pd
import numpy as np
import re
from tabula import read_pdf

## 1. Load Data Source

#### Origin: Data Science Course Catalog

In [3]:
# Extract the tables from every page of the module catalog PDF.
data_science_file = read_pdf(
    "../data/origin/MK_MMDS_2022_23_neue_PO_29082022.pdf",
    pages="all",
)

# Work on a copy so the extraction result itself stays untouched downstream.
data_science_file_copy = data_science_file.copy()

# Number of extracted tables.
len(data_science_file)

59

In [4]:
# Keep only the tables that have more than two rows; shorter ones are discarded.
# NOTE: despite its name, `df` is a list of DataFrames, indexed as df[i] below.
df = [table for table in data_science_file_copy if len(table) > 2]
len(df)

26

## 2. Data Preprocessing

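Data frame cleaning, applied to each category table `df[x]`:
1. Remove the columns labelled ['Language', 'ECTS', 'Page'] from `df[x]`
2. Promote the first row to header and drop the two leading rows
3. Insert the module category as a new feature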

### Fundamentals Courses

In [5]:
# Build the "Fundamentals" course table from extracted table df[1].
fundamentals_raw = df[1]

# Drop the columns carrying these raw (pre-promotion) labels.
fundamentals_unused = ['Language', 'ECTS', 'Page', 'Unnamed: 2']
data_fundamentals = fundamentals_raw.loc[
    :, ~fundamentals_raw.columns.isin(fundamentals_unused)
].copy()

# The first row holds the real header; promote it to column labels.
data_fundamentals.columns = data_fundamentals.iloc[0]

# Skip the two leading rows (row 0 is the promoted header).
# The explicit copy makes the result an independent frame, so adding a column
# below does not write to a slice (which can raise SettingWithCopyWarning).
data_fundamentals = data_fundamentals.iloc[2:].copy()

# The module-code column has no header text in the PDF; give it a name.
data_fundamentals.columns = data_fundamentals.columns.fillna('module_code')

data_fundamentals['moduleCategory'] = 'Fundamentals'
data_fundamentals

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory
2,CS 450,Programming Course,HWS,E,6,Fundamentals
3,CS 460,Database Technology,FSS,E,6,Fundamentals
4,CS 470,Python for Data Scientists,FSS,E,6,Fundamentals
5,,Multivariate Analyses,HWS,E,6,Fundamentals
6,,Tutorial Multivariate Analyses,HWS,E,2,Fundamentals
7,,Empirische Methoden der Politik-\rwissenschaft,HWS,G/E,6,Fundamentals


### Data Management Courses

In [6]:
# Build the "Data Management" course table from extracted table df[8].
management_raw = df[8]

# Drop the columns carrying these raw (pre-promotion) labels.
management_unused = ['Language', 'ECTS', 'Page']
data_management = management_raw.loc[
    :, ~management_raw.columns.isin(management_unused)
].copy()

# The first row holds the real header; promote it to column labels.
data_management.columns = data_management.iloc[0]

# Skip the two leading rows (row 0 is the promoted header).
# The explicit copy makes the result an independent frame, so adding a column
# below does not write to a slice (which can raise SettingWithCopyWarning).
data_management = data_management.iloc[2:].copy()

# The module-code column has no header text in the PDF; give it a name.
data_management.columns = data_management.columns.fillna('module_code')

data_management['moduleCategory'] = 'Data Management'
data_management

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory
2,AC 651,Additional Course – Data Manage-\rment,HWS/FSS,E,AC\r651*,Data Management
3,CS 500,Advanced Software Engineering,HWS,E,6,Data Management
4,CS 530,Database Systems II,FSS,E,6,Data Management
5,CS 550,Algorithmics,FSS/HWS,E,6,Data Management
6,CS 560,Large Scale Data Management,HWS,E,6,Data Management
7,CS 600,Model-driven Development,HWS,E,6,Data Management
8,CS 662**,Types and Programming Languages,HWS,E,6,Data Management
9,IS 540**,Management of Enterprise\rSystems,HWS,E,6,Data Management
10,IS 556**,Public Blockchains,FSS,E,3,Data Management
11,IE 630,Query Optimization,FSS,E,6,Data Management


### Data Analytics Courses

In [7]:
# Build the "Data Analytics" course table from extracted table df[10].
analytics_raw = df[10]

# Drop the columns carrying these raw (pre-promotion) labels.
analytics_unused = ['Language', 'ECTS', 'Page']
data_analytics = analytics_raw.loc[
    :, ~analytics_raw.columns.isin(analytics_unused)
].copy()

# The first row holds the real header; promote it to column labels.
data_analytics.columns = data_analytics.iloc[0]

# Skip the two leading rows (row 0 is the promoted header).
# The explicit copy makes the result an independent frame, so adding a column
# below does not write to a slice (which can raise SettingWithCopyWarning).
data_analytics = data_analytics.iloc[2:].copy()

# The module-code column has no header text in the PDF; give it a name.
data_analytics.columns = data_analytics.columns.fillna('module_code')

data_analytics['moduleCategory'] = 'Data Analytics'
data_analytics

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory
2,AC 652,Additional Course – Data Analytics\rMethods,HWS/FSS,E,AC\r652*,Data Analytics
3,DA 110,Computational Analysis of\rCommunication,HWS,E,6,Data Analytics
4,IE 500,Data Mining I,HWS/FSS,E,6,Data Analytics
5,IE 560,Decision Support,HWS,E,6,Data Analytics
6,IE 661,Text Analytics,HWS,E,6,Data Analytics
7,IE 671,Web Mining,FSS,E,3,Data Analytics
8,IE 672,Data Mining II,FSS,E,6,Data Analytics
9,IE 675b,Machine Learning,HWS,E,9,Data Analytics
10,IE 676,Network Analysis replaced by IS\r622 (FSS),HWS,E,6,Data Analytics
11,IE 678,Deep Learning,FSS,E,6,Data Analytics


### Responsible Data Science Courses

In [8]:
# Build the "Responsible Data Science" course table from extracted table df[15].
responsible_raw = df[15]

# Drop the columns carrying these raw (pre-promotion) labels.
responsible_unused = ['Language', 'ECTS', 'Page']
data_responsible = responsible_raw.loc[
    :, ~responsible_raw.columns.isin(responsible_unused)
].copy()

# The first row holds the real header; promote it to column labels.
data_responsible.columns = data_responsible.iloc[0]

# Skip the two leading rows (row 0 is the promoted header).
# The explicit copy makes the result an independent frame, so adding a column
# below does not write to a slice (which can raise SettingWithCopyWarning).
data_responsible = data_responsible.iloc[2:].copy()

# The module-code column has no header text in the PDF; give it a name.
data_responsible.columns = data_responsible.columns.fillna('module_code')

data_responsible['moduleCategory'] = 'Responsible Data Science'
data_responsible

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory
2,CS 652,Data Security and Privacy,FSS,E,6,Responsible Data Science
3,,Legal and Ethical Aspects of Privacy,HWS,E,3,Responsible Data Science
4,CS 718,AI and Data Science in Fiction and\rSociety,HWS,E,4,Responsible Data Science


### Consolidate Courses

In [9]:
# Stack the four category tables into one frame with a fresh 0..n-1 index.
final_frame = pd.concat(
    [data_fundamentals, data_management, data_analytics, data_responsible],
    ignore_index=True,
    axis=0,
)

# Line breaks from the PDF show up as '\r' inside cell text.
# Each column gets its own mask, so a break in 'ECTS' is repaired even when the
# module name on that row has none. na=False keeps missing or non-string cells
# out of the mask instead of producing an invalid boolean indexer.
features = ['Name of Module', 'ECTS']
for col in features:
    has_break = final_frame[col].str.contains('\r', regex=False, na=False)
    final_frame.loc[has_break, col] = (
        final_frame.loc[has_break, col]
        # A word hyphenated at a line break ("Manage-\rment") is re-joined.
        .str.replace('-\r', '', regex=False)
        # Any other line break becomes a single space.
        .str.replace('\r', ' ', regex=False)
    )

# Clean module_code: trim whitespace first so trailing footnote asterisks are
# actually at the end of the string, then drop them and any space left behind.
final_frame['module_code'] = (
    final_frame['module_code'].str.strip().str.rstrip('*').str.rstrip()
)

final_frame

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory
0,CS 450,Programming Course,HWS,E,6,Fundamentals
1,CS 460,Database Technology,FSS,E,6,Fundamentals
2,CS 470,Python for Data Scientists,FSS,E,6,Fundamentals
3,,Multivariate Analyses,HWS,E,6,Fundamentals
4,,Tutorial Multivariate Analyses,HWS,E,2,Fundamentals
5,,Empirische Methoden der Politikwissenschaft,HWS,G/E,6,Fundamentals
6,AC 651,Additional Course – Data Management,HWS/FSS,E,AC 651*,Data Management
7,CS 500,Advanced Software Engineering,HWS,E,6,Data Management
8,CS 530,Database Systems II,FSS,E,6,Data Management
9,CS 550,Algorithmics,FSS/HWS,E,6,Data Management


## 3. Save MMDS_Courses

In [10]:
# Write the consolidated course table to CSV without the row index.
output_path = '../data/processed/mmds_courses.csv'
final_frame.to_csv(output_path, index=False)