In [1]:
# Import necessary libraries
import os
import pandas as pd
import json
from git import Repo

# Clone the GitHub repository
git_url = "https://github.com/jd-coderepos/llms4subjects.git"
repo_dir = "llms4subjects"

if not os.path.exists(repo_dir):
    print("Cloning repository...")
    Repo.clone_from(git_url, repo_dir)
else:
    print("Repository already cloned.")

# Path to the target folder
data_folders = ["Article", "Book", "Conference", "Report", "Thesis"]
#data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", "tib-core-subjects", "data", "train", "Book", "en")

# Verify the folder exists
#if not os.path.exists(data_folder):
#    raise FileNotFoundError(f"The folder {data_folder} does not exist.")


Cloning repository...


In [2]:
def get_raw_dict_book(book, file_name, folder):
  raw = {}
  data = {}
  label = {}
  for item in book["@graph"]:
    if "title" in item:
      data["file_name"] = file_name
      data["folder"] = folder
      data["title"] = item["title"]
      data["abstract"] = item["abstract"]
      if ("creator" in item):
        gnd_creator = []
        if (isinstance(item["creator"], list)):
          gnd_creator = item["creator"]
        else:
          gnd_creator = [item["creator"]]
        name_creator = []
        for gnd in gnd_creator:
          for finder in book["@graph"]:
            if "@id" in finder and finder["@id"] == gnd:
              name_creator.append(finder["sameAs"])
        data["creator"] = name_creator
      if ("publisher" in item):
        data["publisher"] = item["publisher"]
      if ("@id" in item["dcterms:subject"]):
        item["dcterms:subject"] = [item["dcterms:subject"]]
      label["dcterms:subject"] = [x["@id"] for x in item["dcterms:subject"]]
      dcterms_name = []
      for dcterms in label["dcterms:subject"]:
        for finder in book["@graph"]:
          if "@id" in finder and finder["@id"] == dcterms:
            dcterms_name.append(finder["sameAs"])
      label["dcterms:subject_name"] = dcterms_name
      raw.update(data)
      raw.update(label)
  return raw


In [3]:

# Iterate through all JSON-LD files in the folder
langs = ["de","en"]
core_all = ["tib-core-subjects","all-subjects"]
for lang in langs:
  all_data = []
  for fold in core_all:
    for data_type in ["train", "dev"]:
      for folder in data_folders :
        data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", fold, "data", data_type, folder, lang)
        if not os.path.exists(data_folder):
          raise FileNotFoundError(f"The folder {data_folder} does not exist.")
        for i, file_name in enumerate(os.listdir(data_folder)):
            if file_name.endswith(".jsonld"):
                if (i % 100 == 0):
                  print(f"Processing file {i}: {file_name}")
                file_path = os.path.join(data_folder, file_name)
                with open(file_path, "r", encoding="utf-8") as f:
                    try:
                        # Load JSON-LD data
                        json_data = json.load(f)
                        preprocessed_data = get_raw_dict_book(json_data, file_name, folder)
                        all_data.append(preprocessed_data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding {file_name}: {e}")
      dataframe = pd.DataFrame(all_data)
      output_file = f"tibkat_{lang}_{fold}_{data_type}.csv"
      dataframe.to_csv(output_file, index=False)

      print(f"Data saved to {output_file}")


Processing file 0: 3A1813386471.jsonld
Processing file 0: 3A1656748185.jsonld
Processing file 100: 3A168181790X.jsonld
Processing file 200: 3A1750020599.jsonld
Processing file 300: 3A1645346234.jsonld
Processing file 400: 3A1656447363.jsonld
Processing file 500: 3A1658607791.jsonld
Processing file 600: 3A1026847265.jsonld
Processing file 700: 3A52223867X.jsonld
Processing file 800: 3A1806587785.jsonld
Processing file 900: 3A522901069.jsonld
Processing file 1000: 3A1677102055.jsonld
Processing file 1100: 3A1645378624.jsonld
Processing file 1200: 3A1783617934.jsonld
Processing file 1300: 3A372473717.jsonld
Processing file 1400: 3A1658881427.jsonld
Processing file 1500: 3A1657198294.jsonld
Processing file 1600: 3A1691529443.jsonld
Processing file 1700: 3A27724823X.jsonld
Processing file 1800: 3A1651958025.jsonld
Processing file 1900: 3A386653097.jsonld
Processing file 2000: 3A1646025954.jsonld
Processing file 2100: 3A1772251046.jsonld
Processing file 2200: 3A1656168367.jsonld
Processing f

In [2]:
df_train = pd.read_csv("tibkat_en_all-subjects_train.csv")
df_train.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A1831649586.jsonld,Article,Chapter 19. Forecasting Binary Outcomes,Binary events are involved in many economic de...,[],Elsevier North Holland,"['gnd:4358095-6', 'gnd:4132280-0', 'gnd:405978...","['Prognoseverfahren', 'Ökonometrie', 'Theorie'..."
1,3A1831632608.jsonld,Article,Chapter 92 Endogenous Properties of Equilibriu...,This chapter discusses that experiments demons...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:401599...","['Methodologie', 'Experimentelle Wirtschaftsfo..."
2,3A1831652579.jsonld,Article,Chapter 10. Environmental Risk and Uncertainty,Environmental risks may comprise the most impo...,[],"Elsevier, North-Holland","['gnd:4186957-6', 'gnd:4050133-4', 'gnd:413559...","['Unsicherheit', 'Risikoverhalten', 'Risikothe..."
3,3A1831649969.jsonld,Article,Chapter 24. Market Structure in Multisector Ge...,We provide an overview of several approaches t...,[],"North-Holland, Elsevier","['gnd:4210294-7', 'gnd:4224214-9', 'gnd:406649...","['Allgemeines Gleichgewichtsmodell', 'Wirkungs..."
4,3A1831640694.jsonld,Article,Chapter 12. Heterogeneity and Networks,This chapter shows that networks can have larg...,[],North Holland,"['gnd:4148259-1', 'gnd:4132280-0', 'gnd:411273...","['Computersimulation', 'Ökonometrie', 'Wirtsch..."


In [3]:
df_dev = pd.read_csv("tibkat_en_all-subjects_dev.csv")
df_dev.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A1831649586.jsonld,Article,Chapter 19. Forecasting Binary Outcomes,Binary events are involved in many economic de...,[],Elsevier North Holland,"['gnd:4358095-6', 'gnd:4132280-0', 'gnd:405978...","['Prognoseverfahren', 'Ökonometrie', 'Theorie'..."
1,3A1831632608.jsonld,Article,Chapter 92 Endogenous Properties of Equilibriu...,This chapter discusses that experiments demons...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:401599...","['Methodologie', 'Experimentelle Wirtschaftsfo..."
2,3A1831652579.jsonld,Article,Chapter 10. Environmental Risk and Uncertainty,Environmental risks may comprise the most impo...,[],"Elsevier, North-Holland","['gnd:4186957-6', 'gnd:4050133-4', 'gnd:413559...","['Unsicherheit', 'Risikoverhalten', 'Risikothe..."
3,3A1831649969.jsonld,Article,Chapter 24. Market Structure in Multisector Ge...,We provide an overview of several approaches t...,[],"North-Holland, Elsevier","['gnd:4210294-7', 'gnd:4224214-9', 'gnd:406649...","['Allgemeines Gleichgewichtsmodell', 'Wirkungs..."
4,3A1831640694.jsonld,Article,Chapter 12. Heterogeneity and Networks,This chapter shows that networks can have larg...,[],North Holland,"['gnd:4148259-1', 'gnd:4132280-0', 'gnd:411273...","['Computersimulation', 'Ökonometrie', 'Wirtsch..."


In [5]:
import json
import pandas as pd
with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json", "r") as f:
    all_gnds = json.load(f)
all_gnds = pd.DataFrame(all_gnds)
all_gnds.head()



Unnamed: 0,Code,Classification Number,Classification Name,Name,Alternate Name,Related Subjects,Source,Definition,Source URL
0,gnd:4003694-7,0,Unspezifische Allgemeinwörter,Ausbreitung,[],[],Du.,,
1,gnd:4032393-6,0,Unspezifische Allgemeinwörter,Koordination,[Koordinierung],[],M 1.,Verknüpfe mit Anwendungsgebiet,
2,gnd:4038971-6,0,Unspezifische Allgemeinwörter,Methode,"[Methodik, Verfahren,Methode, Technik,Methode,...",[Methodologie],M,,
3,gnd:4043744-9,0,Unspezifische Allgemeinwörter,Ordnung,[],"[Unordnung, Ordnen]",M 1.,"Allgemeinbegriff, verknüpfe mit Anwendungsgebiet",
4,gnd:4048300-9,0,Unspezifische Allgemeinwörter,Rahmen,[],[],M,"Etwas, was einer Sache ein bestimmtes (äußere...",


In [6]:
from tqdm import tqdm
import pandas as pd

# Create a fast lookup dictionary
code_to_classification = all_gnds.set_index("Code")["Classification Number"].to_dict()

def get_classification_numbers(dc_subject):
    """Convert subject string into a list of classification numbers"""
    subjects = dc_subject.replace("[", "").replace("]", "").replace("'", "").split(",")
    return [code_to_classification.get(subject.strip(), None) for subject in subjects]

# Use apply() for vectorized processing
tqdm.pandas()
df_train["classification_numbers"] = df_train["dcterms:subject"].progress_apply(get_classification_numbers)

df_train.head()

100%|██████████| 64543/64543 [00:00<00:00, 108847.84it/s]


Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,classification_numbers
0,3A1831649586.jsonld,Article,Chapter 19. Forecasting Binary Outcomes,Binary events are involved in many economic de...,[],Elsevier North Holland,"['gnd:4358095-6', 'gnd:4132280-0', 'gnd:405978...","['Prognoseverfahren', 'Ökonometrie', 'Theorie'...","[29, 10.2ac, 4.3, 10.2a, 6.5]"
1,3A1831632608.jsonld,Article,Chapter 92 Endogenous Properties of Equilibriu...,This chapter discusses that experiments demons...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:401599...","['Methodologie', 'Experimentelle Wirtschaftsfo...","[4.3, 10.2aa, 18, 10.2aa, 10.2a]"
2,3A1831652579.jsonld,Article,Chapter 10. Environmental Risk and Uncertainty,Environmental risks may comprise the most impo...,[],"Elsevier, North-Holland","['gnd:4186957-6', 'gnd:4050133-4', 'gnd:413559...","['Unsicherheit', 'Risikoverhalten', 'Risikothe...","[1, 10.11b, 10.2ac, 10.2ac, 10.2ac]"
3,3A1831649969.jsonld,Article,Chapter 24. Market Structure in Multisector Ge...,We provide an overview of several approaches t...,[],"North-Holland, Elsevier","['gnd:4210294-7', 'gnd:4224214-9', 'gnd:406649...","['Allgemeines Gleichgewichtsmodell', 'Wirkungs...","[10.2da, 1, 10.4, 29, 4.4, 10.2aa]"
4,3A1831640694.jsonld,Article,Chapter 12. Heterogeneity and Networks,This chapter shows that networks can have larg...,[],North Holland,"['gnd:4148259-1', 'gnd:4132280-0', 'gnd:411273...","['Computersimulation', 'Ökonometrie', 'Wirtsch...","[30, 10.2ac, 10.11b]"


In [7]:
df_dev["classification_numbers"] = df_dev["dcterms:subject"].progress_apply(get_classification_numbers)

df_dev.head()

100%|██████████| 70588/70588 [00:00<00:00, 344806.24it/s]


Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,classification_numbers
0,3A1831649586.jsonld,Article,Chapter 19. Forecasting Binary Outcomes,Binary events are involved in many economic de...,[],Elsevier North Holland,"['gnd:4358095-6', 'gnd:4132280-0', 'gnd:405978...","['Prognoseverfahren', 'Ökonometrie', 'Theorie'...","[29, 10.2ac, 4.3, 10.2a, 6.5]"
1,3A1831632608.jsonld,Article,Chapter 92 Endogenous Properties of Equilibriu...,This chapter discusses that experiments demons...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:401599...","['Methodologie', 'Experimentelle Wirtschaftsfo...","[4.3, 10.2aa, 18, 10.2aa, 10.2a]"
2,3A1831652579.jsonld,Article,Chapter 10. Environmental Risk and Uncertainty,Environmental risks may comprise the most impo...,[],"Elsevier, North-Holland","['gnd:4186957-6', 'gnd:4050133-4', 'gnd:413559...","['Unsicherheit', 'Risikoverhalten', 'Risikothe...","[1, 10.11b, 10.2ac, 10.2ac, 10.2ac]"
3,3A1831649969.jsonld,Article,Chapter 24. Market Structure in Multisector Ge...,We provide an overview of several approaches t...,[],"North-Holland, Elsevier","['gnd:4210294-7', 'gnd:4224214-9', 'gnd:406649...","['Allgemeines Gleichgewichtsmodell', 'Wirkungs...","[10.2da, 1, 10.4, 29, 4.4, 10.2aa]"
4,3A1831640694.jsonld,Article,Chapter 12. Heterogeneity and Networks,This chapter shows that networks can have larg...,[],North Holland,"['gnd:4148259-1', 'gnd:4132280-0', 'gnd:411273...","['Computersimulation', 'Ökonometrie', 'Wirtsch...","[30, 10.2ac, 10.11b]"


In [8]:
gnd_descriptions = pd.read_csv("gnd_subject_names_descriptions.csv")
gnd_descriptions.head()

Unnamed: 0,Code,Name,TIB Core,German Description,English Description
0,0.0,Unspezifische Allgemeinwörter,True,'Unspezifische Allgemeinwörter' bezeichnen Wör...,'Unspezifische Allgemeinwörter' (non-specific ...
1,1.0,"Allgemeines, Interdisziplinäre Allgemeinwörter",True,"'Allgemeines, Interdisziplinäre Allgemeinwörte...","'General, Interdisciplinary General Knowledge'..."
2,2.1,"Schrift, Handschriftenkunde",False,"'Schrift, Handschriftenkunde' ist ein Fachgebi...","'Schrift, Handschriftenkunde' (Paleography and..."
3,2.2,"Buchwissenschaft, Buchhandel",False,Buchwissenschaft und Buchhandel sind interdisz...,Book studies and book trade are interdisciplin...
4,2.3,Presse,False,Das Fach 'Presse' befasst sich mit der Erforsc...,The subject 'Press' deals with the study and a...


In [10]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenizer and model
MODEL_NAME = "xlm-roberta-base"  # Can use "xlm-roberta-large" for better accuracy
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
model = XLMRobertaModel.from_pretrained(MODEL_NAME)
model.eval()  # Set model to evaluation mode


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine

In [11]:
def get_embedding(text):
    """Convert text into an embedding using XLM-RoBERTa"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=32, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation


# Embed all subjects
subjects = gnd_descriptions["English Description"].tolist()
subject_embeddings = np.array([get_embedding(subj) for subj in subjects])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

def get_top_k_subjects(book_embedding, subject_embeddings, subject_df, k=5):
    """Find the top-k closest subjects for a given book embedding."""
    similarities = cosine_similarity([book_embedding], subject_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]  # Top-k indices in descending order
    return [(subject_df.iloc[i]["Name"], subject_df.iloc[i]["Code"], similarities[i]) for i in top_k_indices]

def embed_and_find_closest_subjects(df_train, gnd_descriptions, k=5):
    # Embed all subjects once
    subject_texts = gnd_descriptions["English Description"].tolist()
    subject_embeddings = np.array([get_embedding(text) for text in tqdm(subject_texts, desc="Embedding Subjects")])

    top_subjects_per_book = []
    book_embeddings = []
    # Process each book
    for _, row in tqdm(df_train.iterrows(), total=df_train.shape[0], desc="Processing Books"):
        full_text = row["title"] + " " + row["abstract"]
        book_embedding = get_embedding(full_text)  # Embed the current book
        book_embeddings.append(book_embedding)
        top_subjects = get_top_k_subjects(book_embedding, subject_embeddings, gnd_descriptions, k)
        top_subjects_per_book.append(top_subjects)
        if len(top_subjects_per_book) % 100 == 0:
            print(f"Processed {len(top_subjects_per_book)} books")
            print(f"title: {row['title']}, abstract: {row['abstract']}")
            print(f"Top {k} subjects: {top_subjects}")
            print(book_embeddings[:5])  # Print first 5 book embeddings
            print(subject_embeddings[:5])  # Print first 5 subject embeddings

    df_train["top_subjects"] = top_subjects_per_book
    return df_train

# Run the process
df_train = embed_and_find_closest_subjects(df_train, gnd_descriptions, k=5)

# Display results
df_train[["title", "top_subjects"]].head()


In [19]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained Sentence Transformer model (optimized for multilingual similarity)
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [11]:
import pandas as pd
import random
from sentence_transformers import InputExample

# Convert DataFrames into training pairs
train_data = []
all_subjects = gnd_descriptions["English Description"].tolist()  # All subjects for negatives

for _, row in df_train.iterrows():
    book_text = row["title"] + " " + row["abstract"]  # Combine title & abstract
    dc_subjects = row["dcterms:subject_name"]  # List of multiple correct subjects
    correct_subjects = dc_subjects.replace("[", "").replace("]", "").replace("'", "").split(",")
    # Add Positive Pairs (Correct Subjects)
    for subject in correct_subjects:
        train_data.append(InputExample(texts=[book_text, subject]))

    # Add Negative Pairs (Incorrect Subjects)
    num_negatives = min(3, len(all_subjects) - len(correct_subjects))  # Ensure valid negatives
    negative_subjects = random.sample([s for s in all_subjects if s not in correct_subjects], num_negatives)

    for neg_subject in negative_subjects:
        train_data.append(InputExample(texts=[book_text, neg_subject]))


In [None]:
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

# Load Pretrained Model
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Convert data into DataLoader
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)

# Use MultipleNegativesRankingLoss for contrastive learning
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,  # Adjust as needed
    warmup_steps=100,
    show_progress_bar=True
)

# Save fine-tuned model
model.save("fine_tuned_book_subject_model")


Step,Training Loss
500,2.0206
1000,1.795
1500,1.7429
2000,1.7318
2500,1.6995
3000,1.6826
3500,1.6892
4000,1.6644
4500,1.6672
5000,1.6585


In [None]:
# Load fine-tuned model
model = SentenceTransformer("fine_tuned_book_subject_model")

# Encode Books
df_train["full_text"] = df_train["title"] + " " + df_train["abstract"]
book_embeddings = model.encode(df_train["full_text"].tolist(), show_progress_bar=True)

# Encode Subjects
subject_embeddings = model.encode(gnd_descriptions["description"].tolist(), show_progress_bar=True)

# Compute Similarity & Find Best Matches
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_k_subjects(book_embedding, subject_embeddings, subject_df, k=5):
    """Find the top-k closest subjects for a given book embedding."""
    similarities = cosine_similarity([book_embedding], subject_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    return [(subject_df.iloc[i]["name"], subject_df.iloc[i]["number"], similarities[i]) for i in top_k_indices]

# Get Top 5 Subjects for Each Book
df_train["top_subjects"] = [
    get_top_k_subjects(book_embeddings[i], subject_embeddings, gnd_descriptions)
    for i in range(len(book_embeddings))
]

# Show Results
df_train[["title", "top_subjects"]].head()