In [1]:
# Import necessary libraries
import os
import pandas as pd
import json
from git import Repo

# Clone the GitHub repository
git_url = "https://github.com/jd-coderepos/llms4subjects.git"
repo_dir = "llms4subjects"

if not os.path.exists(repo_dir):
    print("Cloning repository...")
    Repo.clone_from(git_url, repo_dir)
else:
    print("Repository already cloned.")

# Path to the target folder
data_folders = ["Article", "Book", "Conference", "Report", "Thesis"]
#data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", "tib-core-subjects", "data", "train", "Book", "en")

# Verify the folder exists
#if not os.path.exists(data_folder):
#    raise FileNotFoundError(f"The folder {data_folder} does not exist.")


Cloning repository...


In [2]:
def get_raw_dict_book(book, file_name, folder):
  raw = {}
  data = {}
  label = {}
  for item in book["@graph"]:
    if "title" in item:
      data["file_name"] = file_name
      data["folder"] = folder
      data["title"] = item["title"]
      data["abstract"] = item["abstract"]
      if ("creator" in item):
        gnd_creator = []
        if (isinstance(item["creator"], list)):
          gnd_creator = item["creator"]
        else:
          gnd_creator = [item["creator"]]
        name_creator = []
        for gnd in gnd_creator:
          for finder in book["@graph"]:
            if "@id" in finder and finder["@id"] == gnd:
              name_creator.append(finder["sameAs"])
        data["creator"] = name_creator
      if ("publisher" in item):
        data["publisher"] = item["publisher"]
      if ("@id" in item["dcterms:subject"]):
        item["dcterms:subject"] = [item["dcterms:subject"]]
      label["dcterms:subject"] = [x["@id"] for x in item["dcterms:subject"]]
      dcterms_name = []
      for dcterms in label["dcterms:subject"]:
        for finder in book["@graph"]:
          if "@id" in finder and finder["@id"] == dcterms:
            dcterms_name.append(finder["sameAs"])
      label["dcterms:subject_name"] = dcterms_name
      raw.update(data)
      raw.update(label)
  return raw


In [23]:

# Iterate through all JSON-LD files in the folder
langs = ["de","en"]
core_all = ["tib-core-subjects","all-subjects"]
for lang in langs:
  all_data = []
  for fold in core_all:
    for folder in data_folders :
      data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", fold, "data", "dev", folder, lang)
      if not os.path.exists(data_folder):
        raise FileNotFoundError(f"The folder {data_folder} does not exist.")
      for i, file_name in enumerate(os.listdir(data_folder)):
          if file_name.endswith(".jsonld"):
              if (i % 100 == 0):
                print(f"Processing file {i}: {file_name}")
              file_path = os.path.join(data_folder, file_name)
              with open(file_path, "r", encoding="utf-8") as f:
                  try:
                      # Load JSON-LD data
                      json_data = json.load(f)
                      preprocessed_data = get_raw_dict_book(json_data, file_name, folder)
                      all_data.append(preprocessed_data)
                  except json.JSONDecodeError as e:
                      print(f"Error decoding {file_name}: {e}")
    dataframe = pd.DataFrame(all_data)
    output_file = f"tibkat_{lang}_{fold}_dev.csv"
    dataframe.to_csv(output_file, index=False)

    print(f"Data saved to {output_file}")


Processing file 0: 3A175995943X.jsonld
Processing file 0: 3A169918481X.jsonld
Processing file 100: 3A1603625615.jsonld
Processing file 200: 3A1694017435.jsonld
Processing file 300: 3A392148943.jsonld
Processing file 400: 3A1645186733.jsonld
Processing file 500: 3A1030104921.jsonld
Processing file 600: 3A873723813.jsonld
Processing file 700: 3A1738993361.jsonld
Processing file 800: 3A1699136750.jsonld
Processing file 900: 3A488024714.jsonld
Processing file 1000: 3A1651691436.jsonld
Processing file 1100: 3A1697216986.jsonld
Processing file 1200: 3A1657537056.jsonld
Processing file 1300: 3A1681727978.jsonld
Processing file 1400: 3A1664587365.jsonld
Processing file 1500: 3A1724726595.jsonld
Processing file 1600: 3A1733467556.jsonld
Processing file 1700: 3A1659469716.jsonld
Processing file 1800: 3A278897452.jsonld
Processing file 1900: 3A155361368.jsonld
Processing file 2000: 3A1860643248.jsonld
Processing file 0: 3A187616025X.jsonld
Processing file 100: 3A160948360X.jsonld
Processing file 

In [18]:
dataframe = pd.DataFrame(all_data)

In [19]:
output_file = "tibkat_de_core_train.csv"
dataframe.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")


Data saved to tibkat_de_core_train.csv
