# EduTrain curriculum
(environment: FAIR-OER, Python 3.8.18)

The following imports provide the functionality used by the rest of the code.

In [47]:
#libraries
import pandas as pd
import json
import os
import re
import csv
import yaml
from pathlib import Path
import random
import pdftotext
import pandas as pd
import ast
from googletrans import Translator
from langdetect import detect, LangDetectException
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import torch.nn.functional as F
import rdflib
from rdflib import Namespace, Graph, URIRef, Literal, XSD, RDF, RDFS, OWL, DCTERMS
import networkx as nx
from pyvis.network import Network
import textwrap

## Data acquisition 

In [None]:
# Paths: raw inputs and the CSV files derived from them
# EO4GEO JSON export that is read and flattened below
eo4geo_json = r"B:\LMS\ETKG\Data\EO4GEO\EO4GEO - cleaned.json"
# CSV written from the flattened EO4GEO relations
eo4geo_csv = r"B:\LMS\ETKG\Data\EO4GEO\EO4GEO - cleaned.csv"
# Directory that is searched recursively for EarthLab Markdown files
earthlab_dir = r"B:\LMS\Courses\earthlab"
# CSV written from the EarthLab Markdown metadata
earthlab_csv = r"B:\LMS\ETKG\Data\EarthLab\earthlab.csv"

## Data preprocessing
The data sources are:
1. EO4GEO
2. EarthLab
3. TUM

### EO4GEO

In [None]:
# Flatten the EO4GEO JSON export into one CSV row per (entry, relation) pair.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's locale encoding (e.g. cp1252 on Windows).
with open(eo4geo_json, 'r', encoding='utf-8') as file:
    data = json.load(file)

rows = []
# Only the entries are needed; the top-level keys of the JSON object are unused.
for entry in data.values():
    # Entries without a 'relations' key contribute no rows at all.
    if 'relations' not in entry:
        continue
    for relation in entry['relations']:
        rows.append({
            # Missing fields fall back to an empty string.
            'id': entry.get('id', ''),
            'name': entry.get('name', ''),
            'relation_name': relation.get('name', ''),
            'relation_source': relation.get('source', ''),
            'relation_target': relation.get('target', ''),
            # The skill list is collapsed into one comma-separated cell.
            'skills': ", ".join(entry.get('skills', [])),
            'uri': entry.get('uri', ''),
        })

df = pd.DataFrame(rows)
df.to_csv(eo4geo_csv, index=False)

### EarthLab

In [None]:
def earthLab(md_path):
    """Read one Markdown lesson and return its front matter as a dict.

    The text between the first two '---' markers is parsed as YAML. Two keys
    are then added to the parsed mapping:

    * 'description'   - the text between the first "<div class=" and the
      next "</div>" (stripped). Whatever follows "<div class=" on the opening
      tag is part of that text.
    * 'prerequisites' - the text between the first "What You Need" and the
      next "</div>" (stripped), or the string "none" when the marker is absent.

    Raises IndexError when the file has no '---' or no "<div class=" marker.
    NOTE(review): if the YAML chunk is not a mapping, the key assignment fails;
    the caller is presumably expected to treat any exception as "not a lesson".
    NOTE(review): yaml.FullLoader is used; confirm the files are trusted.
    """
    with open(md_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # Chunk 0 is whatever precedes the first '---'; chunk 1 is the front matter.
    front_matter = text.split('---')[1]
    record = yaml.load(front_matter, Loader=yaml.FullLoader)

    after_first_div = text.split("<div class=")[1]
    summary = after_first_div.split("</div>")[0].strip()

    marker = "What You Need"
    if marker in text:
        after_marker = text.split(marker)[1]
        needs = after_marker.split("</div>")[0].strip()
    else:
        needs = "none"

    record['description'] = summary
    record['prerequisites'] = needs
    return record

In [None]:
def main(directory_path, csv_path):
    """Collect metadata from every '*.md' file below a directory into one CSV.

    Each Markdown file found recursively under ``directory_path`` is passed to
    ``earthLab``. Files for which it raises are reported on stdout and
    skipped; falsy results are dropped silently. The CSV header is the sorted
    union of all metadata keys, and dict values are stored as JSON text.

    Returns the list of metadata dicts, or None (after printing a notice and
    without writing a file) when nothing could be processed.
    """
    collected = []
    for candidate in Path(directory_path).rglob('*.md'):
        try:
            parsed = earthLab(candidate)
        except Exception as e:
            print(f"Skipping {candidate}: This is not an educational file. Error: {e}")
            continue
        if parsed:
            collected.append(parsed)

    if not collected:
        print("No valid Markdown files processed.")
        return

    # Files may carry different keys, so the header is the union of all of them.
    columns = sorted(set().union(*(record.keys() for record in collected)))

    with open(csv_path, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        writer.writerows(
            {
                field: (json.dumps(content) if isinstance(content, dict) else content)
                for field, content in record.items()
            }
            for record in collected
        )
    return collected

In [None]:
# Convert the Markdown files found under earthlab_dir into earthlab_csv
main(earthlab_dir, earthlab_csv)

### TUM

In [None]:
def pdftocsv(pdf_path, start=1, end=None, header=0, footer=0, keywords=None):
    """Extract keyword-delimited sections from a PDF into a CSV file.

    Two files are written next to the PDF, named by replacing '.pdf' with
    '.txt' and '.csv'. The text file holds the selected pages with the header
    and footer lines removed; the CSV holds the extracted sections.

    Args:
        pdf_path: Path of the PDF as a string.
        start: First page to read, 1-based.
        end: Last page to read (inclusive); a falsy value means "to the end".
        header: Number of lines dropped from the top of every page.
        footer: Number of lines dropped from the bottom of every page.
        keywords: Mapping ``start_marker -> end_marker``. For every occurrence
            of ``start_marker`` the text up to the next ``end_marker`` becomes
            one cell in the column named ``start_marker``. Defaults to no
            keywords, which produces an empty table.

    Returns:
        The DataFrame that was written to the CSV. Shorter columns are padded
        with 'N/A' so that all columns have the same length.
    """
    # None replaces the former mutable default ``{}``; behaviour is the same.
    keywords = {} if keywords is None else keywords

    text_path = pdf_path.replace('.pdf', '.txt')
    csv_path = pdf_path.replace('.pdf', '.csv')

    with open(pdf_path, 'rb') as pdf_file:
        pdf = pdftotext.PDF(pdf_file)
    end = end or len(pdf)

    # Strip header/footer lines page by page. ``-footer or None`` keeps the
    # slice open-ended when footer == 0 (``lines[x:-0]`` would be empty).
    pages = []
    for page_number in range(start - 1, end):
        lines = pdf[page_number].splitlines()
        pages.append('\n'.join(lines[header:-footer or None]) + '\n')
    content = ''.join(pages)

    # The text file is still written as a by-product. The sections are taken
    # from the in-memory text rather than re-reading the file, because the
    # former read-back used the locale encoding for a file written as UTF-8.
    with open(text_path, 'w', encoding='utf-8') as pdf_content:
        pdf_content.write(content)

    data = {}
    for keyword_one, keyword_two in keywords.items():
        # Element 0 of the split is the text before the first marker.
        sections = content.split(keyword_one)[1:]
        data[keyword_one] = [section.split(keyword_two)[0] for section in sections]

    # Pad every column to the longest one so the DataFrame is rectangular.
    max_length = max((len(matches) for matches in data.values()), default=0)
    for matches in data.values():
        matches.extend(['N/A'] * (max_length - len(matches)))

    df = pd.DataFrame(data)
    df.to_csv(csv_path, index=False)
    return df

In [None]:
# Run the PDF -> CSV extraction once per row of the summary table.
# Columns used: pdf_path, start, end, header, footer, keywords.
data_summary = pd.read_csv(r"B:\Publications\FAIR OER\02-Data\00-Raw\data_summary.csv")
for index, row in data_summary.iterrows():
    pdf_path = row['pdf_path']
    # Numeric cells arrive as NumPy scalars (floats when the column has gaps);
    # range() and slicing inside pdftocsv need plain integers.
    start = int(row['start'])
    # An empty 'end' cell is read as NaN, which is truthy and would bypass
    # pdftocsv's "end or len(pdf)" fallback, so pass None instead.
    end = int(row['end']) if pd.notna(row['end']) else None
    header = int(row['header'])
    footer = int(row['footer'])
    # The keywords cell holds a Python dict literal.
    keywords = ast.literal_eval(row['keywords'])
    pdftocsv(pdf_path, start, end, header, footer, keywords)

In [None]:
# Stack the per-PDF CSV files (same path as each PDF, '.csv' instead of '.pdf')
# into a single table with a fresh running index.
per_pdf_frames = [
    pd.read_csv(pdf_path.replace('.pdf', '.csv'))
    for pdf_path in data_summary['pdf_path']
]
courses = pd.concat(per_pdf_frames, ignore_index=True)
courses.to_csv(r"B:\Publications\FAIR OER\02-Data\01-Processed\courses.csv", index=False)

In [None]:
def DEtoEN(csv_path):
    """Translate the German text cells of a CSV file to English.

    Every string cell is passed to ``langdetect.detect``; cells detected as
    'de' are replaced by the ``googletrans`` translation. All other cells are
    left untouched. The result is written next to the input with the suffix
    '_en.csv' and is also returned.

    Args:
        csv_path: Path of the input CSV as a string.

    Returns:
        The DataFrame with German cells translated.
    """
    output_path = csv_path.replace('.csv', '_en.csv')
    df = pd.read_csv(csv_path)
    translator = Translator()
    n_rows, n_cols = df.shape
    for i in range(n_rows):
        for j in range(n_cols):
            text = df.iat[i, j]
            # Language detection only works on text. Numeric cells pass a
            # pd.notna() check but make detect() raise, so test for str.
            if not isinstance(text, str):
                continue
            try:
                lang = detect(text)
            except LangDetectException:
                print(f"Cannot detect language for text: {text[:30]}...")
                continue
            if lang == 'de':
                df.iat[i, j] = translator.translate(text, src='de', dest='en').text
    df.to_csv(output_path, index=False)
    return df

In [None]:
# Translate courses_clean.csv; DEtoEN writes the result next to it with the suffix '_en.csv'
DEtoEN(r'B:\Publications\FAIR OER\02-Data\01-Processed\courses_clean.csv')

In [None]:
def normalization(csv_path):
    """Normalise the text cells of a CSV file.

    For every string cell: newlines and tabs become spaces, the text is
    lower-cased, and every character that is neither a word character nor
    whitespace is removed. Non-string cells are left as they are. The result
    is written next to the input with the suffix '_norm.csv'.

    Args:
        csv_path: Path of the input CSV as a string.

    Returns:
        The normalised DataFrame.
    """
    normalized_path = csv_path.replace('.csv', '_norm.csv')
    df = pd.read_csv(csv_path)

    def _clean(value):
        # Numbers and NaN pass through unchanged.
        if isinstance(value, str):
            return value.replace('\n', ' ').replace('\t', ' ').lower()
        return value

    # Column-wise Series.map instead of DataFrame.applymap: applymap is
    # deprecated in recent pandas, while Series.map exists in all versions.
    df = df.apply(lambda column: column.map(_clean))
    df = df.replace({r'[^\w\s]': ''}, regex=True)

    df.to_csv(normalized_path, index=False)
    return df

In [None]:
# Normalise the translated table; normalization writes the result with the suffix '_norm.csv'
normalization(r'B:\Publications\FAIR OER\02-Data\01-Processed\courses_clean_en.csv')

In [None]:
def merge_columns(csv_path, column_list, prerequisite_column):
    """Build a two-column table ('course', 'prerequisites') from a CSV file.

    'course' is the newline-joined string form of the columns at the
    positions in ``column_list``; 'prerequisites' is the column at position
    ``prerequisite_column``, copied unchanged. The table is written next to
    the input with the suffix '_merged.csv' and is also returned.
    """
    source = pd.read_csv(csv_path)

    # Cast to str first so that non-text cells can be joined as well.
    as_text = source.iloc[:, column_list].astype(str)

    merged = pd.DataFrame()
    merged['course'] = as_text.apply(lambda record: '\n'.join(record), axis=1)
    merged['prerequisites'] = source.iloc[:, prerequisite_column]

    merged.to_csv(csv_path.replace('.csv', '_merged.csv'), index=False)
    return merged

In [None]:
# Columns 0, 2 and 3 are joined into 'course'; column 1 is kept as 'prerequisites'
merge_columns(r'B:\Publications\FAIR OER\02-Data\01-Processed\courses_clean_en_norm.csv', [0, 2, 3], 1)

In [None]:
# Hugging Face checkpoints for which a similarity matrix is computed
checkpoints = ["sentence-transformers/all-mpnet-base-v2", "sentence-transformers/all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L12-v2"]

data = pd.read_csv(r'B:\Publications\FAIR OER\02-Data\01-Processed\courses_clean_en_norm_merged.csv')
# Missing text becomes an empty string so the tokenizer always receives a str.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment and does not reliably update `data`.
data['course'] = data['course'].fillna('')
data['prerequisites'] = data['prerequisites'].fillna('')

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    ``model_output[0]`` is taken as the token embeddings. ``attention_mask``
    is given a trailing axis and expanded to the embedding shape, so masked
    positions (mask value 0) contribute nothing to the sum. The sum over
    axis 1 is divided by the mask total, clamped to at least 1e-9 so a fully
    masked sequence gives zeros instead of a division by zero.

    NOTE(review): presumably embeddings are (batch, tokens, dim) and the mask
    is (batch, tokens) - confirm against the model in use.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_counts

def _embed_texts(texts, tokenizer, model):
    """Return one mean-pooled, L2-normalised embedding tensor per text.

    Each text is tokenised on its own (padded/truncated to 512 tokens), run
    through ``model`` without gradient tracking and pooled with
    ``mean_pooling``. Pooling right after the forward pass means the full
    model outputs are not kept in memory for the whole data set.
    """
    pooled = []
    for text in texts:
        encoded = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            output = model(**encoded)
        mean_pooled = mean_pooling(output, encoded['attention_mask'])
        pooled.append(F.normalize(mean_pooled, p=2, dim=1))
    return pooled


# For every checkpoint: embed all course texts and all prerequisite texts,
# then store the prerequisite-by-course cosine similarity matrix as CSV.
for checkpoint in checkpoints:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    model.eval()
    course = data['course'].tolist()
    prerequisite = data['prerequisites'].tolist()

    course_pooled_embeddings = _embed_texts(course, tokenizer, model)
    prerequisite_pooled_embeddings = _embed_texts(prerequisite, tokenizer, model)

    # Row i = prerequisite text of entry i, column j = course text of entry j.
    similarity_scores = np.zeros((len(prerequisite_pooled_embeddings), len(course_pooled_embeddings)))
    for i, p in enumerate(prerequisite_pooled_embeddings):
        for j, c in enumerate(course_pooled_embeddings):
            similarity = F.cosine_similarity(p.squeeze(), c.squeeze(), dim=0)
            similarity_scores[i, j] = similarity.item()

    # Written once per checkpoint after the matrix is complete; the previous
    # version rewrote the whole file for every single matrix cell.
    similarity_df = pd.DataFrame(similarity_scores)
    similarity_df.to_csv(rf'B:\Publications\FAIR OER\02-Data\01-Processed\similarity_{checkpoint.replace("/", "-")}.csv', index=False)

In [None]:
# similarity_scores is the matrix left over from the last checkpoint of the loop;
# it is saved again under a fixed name for the following cells.
similarity_df = pd.DataFrame(similarity_scores)
similarity_df.to_csv(r"B:\Workspace\FAIR OER\Codes\final\similarity_title.csv", index=False)

In [None]:
# Reload the similarity matrix and the normalised course table
df=pd.read_csv(r"B:\Workspace\FAIR OER\Codes\final\similarity_title.csv")
courses=pd.read_csv(r"B:\Workspace\FAIR OER\Codes\final\courses_clean_en_norm.csv")
# Trailing expression: displays the table in the notebook
courses

In [None]:
# Keep only similarities of at least 0.7; everything else becomes NaN.
df = df.where(df >= 0.7)
# Blank the diagonal (row i against column i). This is done on an explicit
# copy of the data: df.values is not guaranteed to be a writable view, so
# filling it in place may have no effect or fail.
values = df.to_numpy(copy=True)
np.fill_diagonal(values, np.nan)
df = pd.DataFrame(values, index=df.index, columns=df.columns)
df

In [None]:
# Replace every remaining (positive) score in column j by the first-column
# value of row j in `courses`.
matches = (df > 0).to_numpy()  # NaN compares False, so blanked cells are skipped
# Cast to object first: the columns are numeric, and writing strings into
# them cell by cell is an incompatible-dtype assignment.
df = df.astype(object)
for j in range(df.shape[1]):
    # Only look up the label when the column has a match, as the
    # cell-by-cell version did.
    if matches[:, j].any():
        df.iloc[matches[:, j], j] = courses.iloc[j, 0]

In [None]:
# Place the match columns to the right of the course table (aligned on the index) and save
final = pd.concat([courses, df], axis=1)
final.to_csv(r"B:\Workspace\FAIR OER\Codes\final\final.csv", index=False)

## Data analysis

In [23]:
# Table that is turned into RDF below, one row per subject
df = pd.read_csv(r"B:\LMS\ETKG\Data\ETKG-v2.csv")

# Namespaces used to build subject and predicate URIs
etkg = Namespace('http://etkg.nfdi4earth.de/')
schema = Namespace('http://schema.org/')
lrmi = Namespace('http://purl.org/dcx/lrmi-terms/')

g = Graph()

# Register prefixes on the graph for serialization
g.bind('etkg', etkg)
g.bind('xsd', XSD)
g.bind('dcterms', DCTERMS)
g.bind('schema', schema)
g.bind('lrmi', lrmi)

def create_uri_component(code):
    """Return ``code`` with every run of whitespace replaced by a single '-'.

    Leading and trailing whitespace is not stripped; it turns into a leading
    or trailing '-' as well.
    """
    pieces = re.split(r'\s+', code)
    return '-'.join(pieces)

def add_row_to_graph(g, row):
    """Add the triples describing one table row to graph ``g``.

    The subject URI is built in the ``etkg`` namespace from the row's 'code'.
    The row is typed as schema:LearningResource and gets string literals for
    'name', 'description', 'cognitive domain' and 'difficulty'. Non-null
    'prerequisite1'..'prerequisite6' cells become schema:competencyRequired
    links to other etkg URIs, and non-null 'learning resource1'..
    'learning resource4' cells become dcterms:source URIs.

    All of these columns must be present in ``row``.
    """
    subject = etkg[create_uri_component(str(row['code']))]

    g.add((subject, RDF.type, schema.LearningResource))

    # (predicate, column) pairs stored as xsd:string literals; both level
    # columns share the schema:educationalLevel predicate.
    literal_columns = (
        (DCTERMS.title, 'name'),
        (lrmi.learningObjective, 'description'),
        (schema.educationalLevel, 'cognitive domain'),
        (schema.educationalLevel, 'difficulty'),
    )
    for predicate, column in literal_columns:
        g.add((subject, predicate, Literal(str(row[column]), datatype=XSD.string)))

    for number in range(1, 7):
        required = row[f'prerequisite{number}']
        if pd.notnull(required):
            target = URIRef(etkg[create_uri_component(str(required))])
            g.add((subject, schema.competencyRequired, target))

    for number in range(1, 5):
        resource = row[f'learning resource{number}']
        if pd.notnull(resource):
            g.add((subject, DCTERMS.source, URIRef(str(resource))))

# Add the triples of every table row to the graph
for index, row in df.iterrows():
    add_row_to_graph(g, row)


# Write the graph as Turtle; as the last expression, the return value is shown as cell output
g.serialize(destination=r"B:\LMS\ETKG\Data\ETKG-v2.ttl", format='turtle')


<Graph identifier=Nd7b04782b31e4ebc9b97c784b77477db (<class 'rdflib.graph.Graph'>)>

In [28]:
# Start from a fresh graph and load a Turtle file
# NOTE(review): this reads ETKG-test.ttl, not the ETKG-v2.ttl written earlier - confirm this is intended
g = Graph()
g.parse(r"B:\LMS\ETKG\Codes\ETKG-test.ttl", format="ttl")

# len(g) is the number of triples in the graph
print(f"Number of triples: {len(g)}")

Number of triples: 397


In [107]:
# Namespaces needed to recognise the predicates of interest
dcterms = rdflib.Namespace("http://purl.org/dc/terms/")
schema = rdflib.Namespace("http://schema.org/")
etkg = rdflib.Namespace("http://etkg.nfdi4earth.de/")

nodes = {}   # subject URI -> title
edges = []   # (subject URI, required URI) pairs

# First pass: every subject that has a dcterms:title becomes a node.
for subj, pred, obj in g:
    if pred == dcterms.title:
        nodes[str(subj)] = str(obj)
        print(f"Node added: {subj} with title {obj}")

# Second pass: every schema:competencyRequired triple becomes an edge.
for subj, pred, obj in g:
    if pred == schema.competencyRequired:
        edges.append((str(subj), str(obj)))
        print(f"Edge added: {subj} -> {obj}")

# Directed graph restricted to edges whose two ends both have a title.
G = nx.DiGraph()
G.add_nodes_from((uri, {'title': label}) for uri, label in nodes.items())
G.add_edges_from((src, dst) for src, dst in edges if src in nodes and dst in nodes)

node_titles = dict(G.nodes(data='title'))

# Interactive pyvis view, saved as a stand-alone HTML file.
net = Network(notebook=True, cdn_resources='in_line')

net.toggle_physics(True)
net.show_buttons(filter_=['physics'])

for uri, label in node_titles.items():
    net.add_node(uri, label=label, title=label, shape='dot', size=10)

for src, dst in edges:
    if src in node_titles and dst in node_titles:
        net.add_edge(src, dst)

net.save_graph("graph-test.html")

Node added: http://etkg.nfdi4earth.de/EL216 with title Fundamentals of pandas
Node added: http://etkg.nfdi4earth.de/EL242 with title Analyze raster data in Python
Node added: http://etkg.nfdi4earth.de/EL190 with title Introduction to open reproducible science
Node added: http://etkg.nfdi4earth.de/TUM4 with title Fundamentals of spatial data mining and machine learning 
Node added: http://etkg.nfdi4earth.de/TUM2 with title Fundamentals of big spatial data 
Node added: http://etkg.nfdi4earth.de/EL265 with title Introduction to Light Detection and Ranging (Lidar)data
Node added: http://etkg.nfdi4earth.de/DLS1GD with title Introduction to spatial data
Node added: http://etkg.nfdi4earth.de/EL290 with title Work with raster data in Python
Node added: http://etkg.nfdi4earth.de/EL127 with title Introduction to using Git for scientific projects
Node added: http://etkg.nfdi4earth.de/EL200 with title Introduction to Python
Node added: http://etkg.nfdi4earth.de/DLS1DM3-2 with title Introduction to

In [115]:
# Full-page version of the network with styled nodes and labelled edges.
net = Network(height='100vh', width='100%')


def _node_style():
    # Fresh dicts on every call so that nodes never share option objects.
    return dict(color={'background': '#bfd6de', 'border': '#003f60'}, borderWidth=2, font={'color': '#003f60', 'size': 11}, size=37, shape='circle')


def _edge_style():
    # Fresh dicts on every call so that edges never share option objects.
    return dict(color='#003f60', width=2, title='Prerequisite', label='Prerequisite', arrows='to', font={'align': 'top'})


for uri, label in node_titles.items():
    # Wrap the title at 19 characters per line for the node label.
    caption = textwrap.fill(label, width=19)
    net.add_node(uri, label=caption, title=caption, **_node_style())

for src, dst in edges:
    # Only draw edges whose two ends are known nodes.
    if src in node_titles and dst in node_titles:
        net.add_edge(src, dst, **_edge_style())

net.toggle_drag_nodes(True)
net.toggle_physics(True)
net.show_buttons(filter_=['physics'])
net.save_graph("graph-test.html")

In [129]:
# Predicate URI -> name of the field that collects its objects.
predicate_fields = {
    'http://purl.org/dcx/lrmi-terms/learningObjective': 'learning_objectives',
    'http://purl.org/dc/terms/source': 'sources',
    'http://schema.org/competencyRequired': 'prerequisites',
    'http://schema.org/educationalLevel': 'level',
}

# subject URI -> {field: [object strings]}; subjects appear only if they
# have at least one of the predicates above.
additional_info = {}
for subj, pred, obj in g:
    field = predicate_fields.get(str(pred))
    if field is None:
        continue
    record = additional_info.setdefault(
        str(subj),
        {name: [] for name in predicate_fields.values()},
    )
    record[field].append(str(obj))

# Collapse every list into one comma-separated string for display.
additional_info = {
    uri: {name: ', '.join(collected) for name, collected in record.items()}
    for uri, record in additional_info.items()
}

In [134]:
# Final view: same layout as before, but the hover text of each node shows
# the details collected in additional_info.
net = Network(height='100vh', width='100%')

# Used for nodes that have no entry in additional_info (read-only).
missing_info = {
    'learning_objectives': 'N/A',
    'sources': 'N/A',
    'prerequisites': 'N/A',
    'level': 'N/A'
}

for uri, label in nodes.items():
    caption = textwrap.fill(label, width=19)
    details = additional_info.get(uri, missing_info)
    # Plain multi-line text; every line carries a four-space indent.
    info_html = (
        "\n    Title: \n"
        f"    {label}\n"
        "    Learning Objectives:\n"
        f"    {details['learning_objectives']}\n"
        "    Sources:\n"
        f"    {details['sources']}\n"
        "    Prerequisites: \n"
        f"    {details['prerequisites']}\n"
        "    Level: \n"
        f"    {details['level']}\n"
        "    "
    )
    net.add_node(
        uri,
        label=caption,
        title=info_html,
        color={'background': '#bfd6de', 'border': '#003f60'},
        borderWidth=2,
        font={'color': '#003f60', 'size': 11},
        size=37,
        shape='circle',
    )

for src, dst in edges:
    # Only draw edges whose two ends are known nodes.
    if src in nodes and dst in nodes:
        net.add_edge(
            src,
            dst,
            color='#003f60',
            width=2,
            title='Prerequisite',
            label='Prerequisite',
            arrows='to',
            font={'align': 'top'},
        )

net.toggle_physics(True)
net.show_buttons(filter_=['physics'])
net.save_graph("graph-test.html")