In [1]:
# Import necessary libraries
import os
import pandas as pd
import json
from git import Repo

# Clone the GitHub repository
git_url = "https://github.com/jd-coderepos/llms4subjects.git"
repo_dir = "llms4subjects"

if not os.path.exists(repo_dir):
    print("Cloning repository...")
    Repo.clone_from(git_url, repo_dir)
else:
    print("Repository already cloned.")

# Path to the target folder
data_folders = ["Article", "Book", "Conference", "Report", "Thesis"]
#data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", "tib-core-subjects", "data", "train", "Book", "en")

# Verify the folder exists
#if not os.path.exists(data_folder):
#    raise FileNotFoundError(f"The folder {data_folder} does not exist.")


Cloning repository...


In [2]:
def get_raw_dict_book(book, file_name, folder):
  raw = {}
  data = {}
  label = {}
  for item in book["@graph"]:
    if "title" in item:
      data["file_name"] = file_name
      data["folder"] = folder
      data["title"] = item["title"]
      data["abstract"] = item["abstract"]
      if ("creator" in item):
        gnd_creator = []
        if (isinstance(item["creator"], list)):
          gnd_creator = item["creator"]
        else:
          gnd_creator = [item["creator"]]
        name_creator = []
        for gnd in gnd_creator:
          for finder in book["@graph"]:
            if "@id" in finder and finder["@id"] == gnd:
              name_creator.append(finder["sameAs"])
        data["creator"] = name_creator
      if ("publisher" in item):
        data["publisher"] = item["publisher"]
      if ("@id" in item["dcterms:subject"]):
        item["dcterms:subject"] = [item["dcterms:subject"]]
      label["dcterms:subject"] = [x["@id"] for x in item["dcterms:subject"]]
      dcterms_name = []
      for dcterms in label["dcterms:subject"]:
        for finder in book["@graph"]:
          if "@id" in finder and finder["@id"] == dcterms:
            dcterms_name.append(finder["sameAs"])
      label["dcterms:subject_name"] = dcterms_name
      raw.update(data)
      raw.update(label)
  return raw


In [4]:

# Iterate through all JSON-LD files in the folder
langs = ["de","en"]
core_all = ["tib-core-subjects","all-subjects"]
for lang in langs:
  all_data = []
  for fold in core_all:
    for data_type in ["train", "dev"]:
      for folder in data_folders :
        data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", fold, "data", data_type, folder, lang)
        if not os.path.exists(data_folder):
          raise FileNotFoundError(f"The folder {data_folder} does not exist.")
        for i, file_name in enumerate(os.listdir(data_folder)):
            if file_name.endswith(".jsonld"):
                if (i % 100 == 0):
                  print(f"Processing file {i}: {file_name}")
                file_path = os.path.join(data_folder, file_name)
                with open(file_path, "r", encoding="utf-8") as f:
                    try:
                        # Load JSON-LD data
                        json_data = json.load(f)
                        preprocessed_data = get_raw_dict_book(json_data, file_name, folder)
                        all_data.append(preprocessed_data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding {file_name}: {e}")
      dataframe = pd.DataFrame(all_data)
      output_file = f"tibkat_{lang}_{fold}_{data_type}.csv"
      dataframe.to_csv(output_file, index=False)

      print(f"Data saved to {output_file}")


Processing file 0: 3A1762285517.jsonld
Processing file 0: 3A39302699X.jsonld
Processing file 100: 3A1609194810.jsonld
Processing file 200: 3A390316334.jsonld
Processing file 300: 3A1645123413.jsonld
Processing file 400: 3A805384154.jsonld
Processing file 500: 3A365064793.jsonld
Processing file 600: 3A1654800503.jsonld
Processing file 700: 3A372975062.jsonld
Processing file 800: 3A1697216501.jsonld
Processing file 900: 3A877250960.jsonld
Processing file 1000: 3A1733582819.jsonld
Processing file 1100: 3A23415151X.jsonld
Processing file 1200: 3A386118108.jsonld
Processing file 1300: 3A175493398X.jsonld
Processing file 1400: 3A1047010763.jsonld
Processing file 1500: 3A137902522.jsonld
Processing file 1600: 3A1811678351.jsonld
Processing file 1700: 3A01682380X.jsonld
Processing file 1800: 3A1656172682.jsonld
Processing file 1900: 3A1650766467.jsonld
Processing file 2000: 3A1689107863.jsonld
Processing file 2100: 3A625362977.jsonld
Processing file 2200: 3A1657298752.jsonld
Processing file 23

In [5]:
df_train = pd.read_csv("tibkat_en_all-subjects_train.csv")
df_train.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A1831633086.jsonld,Article,Chapter 44 Multiple Market Systems and the Cla...,"Theoretically, it is not the case that markets...",[],North Holland,"['gnd:4252654-1', 'gnd:4066528-8', 'gnd:412447...","['Experimentelle Wirtschaftsforschung', 'Wirts..."
1,3A1735134163.jsonld,Article,Adapted Process Model for Manufacturing Within...,The paper concentrates on the external elimina...,[],Springer,['gnd:4329079-6'],['Fertigungstechnik']
2,3A729995739.jsonld,Article,Funding regulations and risk sharing,This paper provides a description of the risk ...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4068097-6', 'gnd:407378...","['Zeitschrift', 'Zukunft', 'Kreditmarkt']"
3,3A1831633485.jsonld,Article,Chapter 4 Zero-Intelligence Robots and the Dou...,This chapter illustrates the behavior of Zero-...,[],North Holland,"['gnd:4139716-2', 'gnd:4124477-1', 'gnd:401599...","['Methodologie', 'Wirtschaftsforschung', 'Expe..."
4,3A730008444.jsonld,Article,Migration to and from Russia and South-East Eu...,Whereas Germany and Austria were the main dest...,[],OECD Publishing,"['gnd:4113450-3', 'gnd:4066399-1', 'gnd:406649...","['Entwicklung', 'Wirtschaft', 'Wirtschaftspoli..."


In [6]:
df_dev = pd.read_csv("tibkat_en_all-subjects_dev.csv")
df_dev.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A1831633086.jsonld,Article,Chapter 44 Multiple Market Systems and the Cla...,"Theoretically, it is not the case that markets...",[],North Holland,"['gnd:4252654-1', 'gnd:4066528-8', 'gnd:412447...","['Experimentelle Wirtschaftsforschung', 'Wirts..."
1,3A1735134163.jsonld,Article,Adapted Process Model for Manufacturing Within...,The paper concentrates on the external elimina...,[],Springer,['gnd:4329079-6'],['Fertigungstechnik']
2,3A729995739.jsonld,Article,Funding regulations and risk sharing,This paper provides a description of the risk ...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4068097-6', 'gnd:407378...","['Zeitschrift', 'Zukunft', 'Kreditmarkt']"
3,3A1831633485.jsonld,Article,Chapter 4 Zero-Intelligence Robots and the Dou...,This chapter illustrates the behavior of Zero-...,[],North Holland,"['gnd:4139716-2', 'gnd:4124477-1', 'gnd:401599...","['Methodologie', 'Wirtschaftsforschung', 'Expe..."
4,3A730008444.jsonld,Article,Migration to and from Russia and South-East Eu...,Whereas Germany and Austria were the main dest...,[],OECD Publishing,"['gnd:4113450-3', 'gnd:4066399-1', 'gnd:406649...","['Entwicklung', 'Wirtschaft', 'Wirtschaftspoli..."


In [16]:

with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json", "r") as f:
    all_gnds = json.load(f)
all_gnds = pd.DataFrame(all_gnds)
all_gnds.head()



Unnamed: 0,Code,Classification Number,Classification Name,Name,Alternate Name,Related Subjects,Source,Definition,Source URL
0,gnd:4003694-7,0,Unspezifische Allgemeinwörter,Ausbreitung,[],[],Du.,,
1,gnd:4032393-6,0,Unspezifische Allgemeinwörter,Koordination,[Koordinierung],[],M 1.,Verknüpfe mit Anwendungsgebiet,
2,gnd:4038971-6,0,Unspezifische Allgemeinwörter,Methode,"[Methodik, Verfahren,Methode, Technik,Methode,...",[Methodologie],M,,
3,gnd:4043744-9,0,Unspezifische Allgemeinwörter,Ordnung,[],"[Unordnung, Ordnen]",M 1.,"Allgemeinbegriff, verknüpfe mit Anwendungsgebiet",
4,gnd:4048300-9,0,Unspezifische Allgemeinwörter,Rahmen,[],[],M,"Etwas, was einer Sache ein bestimmtes (äußere...",


In [35]:
from tqdm import tqdm
import pandas as pd

# Create a fast lookup dictionary
code_to_classification = all_gnds.set_index("Code")["Classification Number"].to_dict()

def get_classification_numbers(dc_subject):
    """Convert subject string into a list of classification numbers"""
    subjects = dc_subject.replace("[", "").replace("]", "").replace("'", "").split(",")
    return [code_to_classification.get(subject.strip(), None) for subject in subjects]

# Use apply() for vectorized processing
tqdm.pandas()
df_train["classification_numbers"] = df_train["dcterms:subject"].progress_apply(get_classification_numbers)

df_train.head()

100%|██████████| 64543/64543 [00:00<00:00, 131123.36it/s]


Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,classification_numbers
0,3A1831633086.jsonld,Article,Chapter 44 Multiple Market Systems and the Cla...,"Theoretically, it is not the case that markets...",[],North Holland,"['gnd:4252654-1', 'gnd:4066528-8', 'gnd:412447...","['Experimentelle Wirtschaftsforschung', 'Wirts...","[10.2aa, 10.2a, 10.2aa, 4.3, 18]"
1,3A1735134163.jsonld,Article,Adapted Process Model for Manufacturing Within...,The paper concentrates on the external elimina...,[],Springer,['gnd:4329079-6'],['Fertigungstechnik'],[31.8a]
2,3A729995739.jsonld,Article,Funding regulations and risk sharing,This paper provides a description of the risk ...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4068097-6', 'gnd:407378...","['Zeitschrift', 'Zukunft', 'Kreditmarkt']","[2.3, 1, 10.9c]"
3,3A1831633485.jsonld,Article,Chapter 4 Zero-Intelligence Robots and the Dou...,This chapter illustrates the behavior of Zero-...,[],North Holland,"['gnd:4139716-2', 'gnd:4124477-1', 'gnd:401599...","['Methodologie', 'Wirtschaftsforschung', 'Expe...","[4.3, 10.2aa, 18, 10.2aa, 10.2a]"
4,3A730008444.jsonld,Article,Migration to and from Russia and South-East Eu...,Whereas Germany and Austria were the main dest...,[],OECD Publishing,"['gnd:4113450-3', 'gnd:4066399-1', 'gnd:406649...","['Entwicklung', 'Wirtschaft', 'Wirtschaftspoli...","[1, 10.2a, 10.4]"


In [36]:
df_dev["classification_numbers"] = df_dev["dcterms:subject"].progress_apply(get_classification_numbers)

df_dev.head()

100%|██████████| 70588/70588 [00:00<00:00, 248542.90it/s]


Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,classification_numbers
0,3A1831633086.jsonld,Article,Chapter 44 Multiple Market Systems and the Cla...,"Theoretically, it is not the case that markets...",[],North Holland,"['gnd:4252654-1', 'gnd:4066528-8', 'gnd:412447...","['Experimentelle Wirtschaftsforschung', 'Wirts...","[10.2aa, 10.2a, 10.2aa, 4.3, 18]"
1,3A1735134163.jsonld,Article,Adapted Process Model for Manufacturing Within...,The paper concentrates on the external elimina...,[],Springer,['gnd:4329079-6'],['Fertigungstechnik'],[31.8a]
2,3A729995739.jsonld,Article,Funding regulations and risk sharing,This paper provides a description of the risk ...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4068097-6', 'gnd:407378...","['Zeitschrift', 'Zukunft', 'Kreditmarkt']","[2.3, 1, 10.9c]"
3,3A1831633485.jsonld,Article,Chapter 4 Zero-Intelligence Robots and the Dou...,This chapter illustrates the behavior of Zero-...,[],North Holland,"['gnd:4139716-2', 'gnd:4124477-1', 'gnd:401599...","['Methodologie', 'Wirtschaftsforschung', 'Expe...","[4.3, 10.2aa, 18, 10.2aa, 10.2a]"
4,3A730008444.jsonld,Article,Migration to and from Russia and South-East Eu...,Whereas Germany and Austria were the main dest...,[],OECD Publishing,"['gnd:4113450-3', 'gnd:4066399-1', 'gnd:406649...","['Entwicklung', 'Wirtschaft', 'Wirtschaftspoli...","[1, 10.2a, 10.4]"


In [38]:
gnd_descriptions = pd.read_csv("gnd_subject_names_descriptions.csv")
gnd_descriptions.head()

Unnamed: 0,Code,Name,TIB Core,German Description,English Description
0,0.0,Unspezifische Allgemeinwörter,True,'Unspezifische Allgemeinwörter' bezeichnen Wör...,'Unspezifische Allgemeinwörter' (non-specific ...
1,1.0,"Allgemeines, Interdisziplinäre Allgemeinwörter",True,"'Allgemeines, Interdisziplinäre Allgemeinwörte...","'General, Interdisciplinary General Knowledge'..."
2,2.1,"Schrift, Handschriftenkunde",False,"'Schrift, Handschriftenkunde' ist ein Fachgebi...","'Schrift, Handschriftenkunde' (Paleography and..."
3,2.2,"Buchwissenschaft, Buchhandel",False,Buchwissenschaft und Buchhandel sind interdisz...,Book studies and book trade are interdisciplin...
4,2.3,Presse,False,Das Fach 'Presse' befasst sich mit der Erforsc...,The subject 'Press' deals with the study and a...


In [37]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenizer and model
MODEL_NAME = "xlm-roberta-base"  # Can use "xlm-roberta-large" for better accuracy
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
model = XLMRobertaModel.from_pretrained(MODEL_NAME)
model.eval()  # Set model to evaluation mode


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine

In [39]:
def get_embedding(text):
    """Convert text into an embedding using XLM-RoBERTa"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=32, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation


# Embed all subjects
subjects = gnd_descriptions["English Description"].tolist()
subject_embeddings = np.array([get_embedding(subj) for subj in subjects])


In [43]:
import torch
import numpy as np
from tqdm import tqdm

batch_size = 32  # Adjust based on GPU memory

def batch_embed(texts, batch_size=32):
    embeddings = []
    num_batches = len(texts) // batch_size + int(len(texts) % batch_size != 0)  # Total batches

    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding Books", unit="batch"):
        batch_texts = texts[i:i+batch_size]
        batch_embeddings = get_embedding(batch_texts)  # Ensure `get_embedding` supports batch input
        embeddings.extend(batch_embeddings)

    return np.array(embeddings)

# Create a full-text column (if not already done)
df_train["full_text"] = df_train["title"] + " " + df_train["abstract"]

# Compute embeddings with progress tracking
book_embeddings = batch_embed(df_train["full_text"].tolist(), batch_size=batch_size)



Embedding Books:  19%|█▉        | 383/2017 [24:31<1:44:38,  3.84s/batch]


KeyboardInterrupt: 