In [None]:
# Import necessary libraries
import os
import pandas as pd
import json
from git import Repo

# Remote repository and the local directory it is checked out into
git_url = "https://github.com/jd-coderepos/llms4subjects.git"
repo_dir = "llms4subjects"

# Skip the clone when the checkout directory is already present
if os.path.exists(repo_dir):
    print("Repository already cloned.")
else:
    print("Cloning repository...")
    Repo.clone_from(git_url, repo_dir)

# Folder inside the checkout that later cells read the .jsonld records from
data_folder = os.path.join(
    repo_dir,
    "shared-task-datasets",
    "TIBKAT",
    "tib-core-subjects",
    "data",
    "train",
    "Book",
    "en",
)

# Fail early if the expected folder layout is missing
if not os.path.exists(data_folder):
    raise FileNotFoundError(f"The folder {data_folder} does not exist.")


Repository already cloned.


In [None]:
def _resolve_same_as(graph, node_ids):
    """Collect the ``sameAs`` values of the graph nodes referenced by ``node_ids``.

    Results follow the order of ``node_ids``; for each id, every node whose
    ``@id`` equals it is reported in graph order. Ids with no matching node,
    and matching nodes that carry no ``sameAs`` key, contribute nothing.
    """
    names = []
    for node_id in node_ids:
        for node in graph:
            if "@id" in node and node["@id"] == node_id and "sameAs" in node:
                names.append(node["sameAs"])
    return names


def get_raw_dict_book(book, file_name):
    """Flatten the titled node(s) of a JSON-LD record into one flat dict.

    Args:
        book: Parsed JSON-LD document; must contain an ``"@graph"`` list of
            node dicts. The document is not modified.
        file_name: Name of the source file, stored under ``"file_name"``.

    Returns:
        A dict with the keys ``file_name``, ``title``, ``abstract`` (``None``
        when the record has none), ``dcterms:subject`` (list of subject ids)
        and ``dcterms:subject_name`` (their ``sameAs`` values), plus
        ``creator`` (``sameAs`` values of the referenced creator nodes) and
        ``publisher`` when the record has them. Returns an empty dict when no
        node in the graph has a ``"title"``.

    Raises:
        KeyError: If ``book`` has no ``"@graph"`` key, or a subject entry has
            no ``"@id"``.
    """
    graph = book["@graph"]
    raw = {}
    for item in graph:
        if "title" not in item:
            continue

        raw["file_name"] = file_name
        raw["title"] = item["title"]
        # Not every record is guaranteed to carry an abstract; keep the column
        # but leave it empty instead of aborting on a KeyError.
        raw["abstract"] = item.get("abstract")

        if "creator" in item:
            creators = item["creator"]
            if not isinstance(creators, list):
                creators = [creators]
            raw["creator"] = _resolve_same_as(graph, creators)

        if "publisher" in item:
            raw["publisher"] = item["publisher"]

        # A single subject is stored as a bare dict rather than a list. Wrap
        # it in a local variable so the caller's document is left untouched.
        subjects = item.get("dcterms:subject", [])
        if isinstance(subjects, dict):
            subjects = [subjects]
        subject_ids = [subject["@id"] for subject in subjects]
        raw["dcterms:subject"] = subject_ids
        raw["dcterms:subject_name"] = _resolve_same_as(graph, subject_ids)
    return raw


In [None]:
all_data = []
# Iterate through all JSON-LD files in the folder.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the resulting table reproducible across runs and machines.
for i, file_name in enumerate(sorted(os.listdir(data_folder))):
    if not file_name.endswith(".jsonld"):
        continue
    if i % 100 == 0:
        print(f"Processing file {i}: {file_name}")
    file_path = os.path.join(data_folder, file_name)
    try:
        # Keep the file open only while parsing it.
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
        preprocessed_data = get_raw_dict_book(json_data, file_name)
    except json.JSONDecodeError as e:
        print(f"Error decoding {file_name}: {e}")
    except (KeyError, TypeError) as e:
        # A record with an unexpected structure should not discard the work
        # already done on every other file; report it and move on.
        print(f"Skipping {file_name}: unexpected record structure ({e!r})")
    else:
        all_data.append(preprocessed_data)


Processing file 0: 3A1651890803.jsonld
Processing file 100: 3A1773891405.jsonld
Processing file 200: 3A1759376485.jsonld
Processing file 300: 3A881110248.jsonld
Processing file 400: 3A1655439022.jsonld
Processing file 500: 3A270026681.jsonld
Processing file 600: 3A599204176.jsonld
Processing file 700: 3A1651945357.jsonld
Processing file 800: 3A1002768756.jsonld
Processing file 900: 3A830149295.jsonld
Processing file 1000: 3A1645532321.jsonld
Processing file 1100: 3A1751229890.jsonld
Processing file 1200: 3A161862704X.jsonld
Processing file 1300: 3A1769607722.jsonld
Processing file 1400: 3A1645934705.jsonld
Processing file 1500: 3A1819955753.jsonld
Processing file 1600: 3A525106227.jsonld
Processing file 1700: 3A1756965625.jsonld
Processing file 1800: 3A898697190.jsonld
Processing file 1900: 3A252540948.jsonld
Processing file 2000: 3A525803793.jsonld
Processing file 2100: 3A1765049679.jsonld
Processing file 2200: 3A1654914541.jsonld
Processing file 2300: 3A1841950742.jsonld
Processing f

In [None]:
# Turn the list of per-file record dicts into a table, one row per dict.
dataframe = pd.DataFrame(all_data)

In [None]:
# Destination CSV; a relative path, so it is resolved against the current working directory.
output_file = "output_data.csv"
# index=False leaves the DataFrame's row index out of the written file.
dataframe.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")


Data saved to output_data.csv
