In [None]:
# Import necessary libraries
import os
import pandas as pd
import json
from git import Repo

# Remote location of the shared-task repository and the local checkout directory
git_url = "https://github.com/jd-coderepos/llms4subjects.git"
repo_dir = "llms4subjects"

# Clone only when no local checkout is present yet
if os.path.exists(repo_dir):
    print("Repository already cloned.")
else:
    print("Cloning repository...")
    Repo.clone_from(git_url, repo_dir)

# Directory inside the checkout that is scanned for JSON-LD files later on
data_folder = os.path.join(
    repo_dir,
    "shared-task-datasets",
    "TIBKAT",
    "tib-core-subjects",
    "data",
    "train",
    "Book",
    "en",
)

# Fail early with a clear message if the expected directory is missing
if not os.path.exists(data_folder):
    raise FileNotFoundError(f"The folder {data_folder} does not exist.")


Cloning repository...


In [None]:
def get_raw_dict_book(book):
    """Extract the bibliographic fields of a book from a parsed JSON-LD record.

    Scans the nodes listed under ``book["@graph"]``. For every node that has a
    ``"title"`` key, the title, abstract, creator, publisher and
    ``dcterms:subject`` values are copied into the result. If several nodes
    carry a title, values from later nodes overwrite those of earlier ones.

    Args:
        book: Parsed JSON-LD document (a dict) with an ``"@graph"`` list.

    Returns:
        A dict with the keys ``title``, ``abstract``, ``creator``,
        ``publisher`` and ``dcterms:subject`` (in that order). Fields missing
        from the node are set to ``None`` instead of raising ``KeyError``.
        An empty dict is returned when no node has a title or when
        ``"@graph"`` is absent or empty.
    """
    raw = {}
    # A missing or null "@graph" is treated like an empty graph.
    for item in book.get("@graph") or []:
        # Only the book node carries a title; skip every other node.
        if not isinstance(item, dict) or "title" not in item:
            continue
        raw["title"] = item["title"]
        # Optional fields: not every record has all of them, so fall back
        # to None rather than aborting the whole extraction.
        for field in ("abstract", "creator", "publisher", "dcterms:subject"):
            raw[field] = item.get(field)
    return raw


In [None]:
# Upper bound on how many JSON-LD files are parsed in this run.
# Set to None to process every file in the folder.
MAX_FILES = 3

all_data = []

# Filter before counting so the limit applies to JSON-LD files only, and sort
# because os.listdir returns entries in arbitrary order (keeps runs reproducible).
jsonld_files = sorted(
    name for name in os.listdir(data_folder) if name.endswith(".jsonld")
)

# Iterate through the selected JSON-LD files in the folder
for i, file_name in enumerate(jsonld_files[:MAX_FILES]):
    print(f"Processing file {i}: {file_name}")
    file_path = os.path.join(data_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            # Load JSON-LD data and pull out the fields of interest
            json_data = json.load(f)
            preprocessed_data = get_raw_dict_book(json_data)
        except json.JSONDecodeError as e:
            print(f"Error decoding {file_name}: {e}")
        except KeyError as e:
            # A record lacking an expected key should not abort the whole run
            print(f"Skipping {file_name}: missing expected field {e}")
        else:
            print(preprocessed_data)
            all_data.append(preprocessed_data)


Processing file 0: 3A1651890803.jsonld
{'@id': 'gnd:1028575963', 'sameAs': 'Wipf, Andreas'}
{'@id': 'gnd:4047984-5', 'sameAs': 'Quantenfeldtheorie'}
{'@id': 'gnd:4057000-9', 'sameAs': 'Statistische Physik'}
{'@id': 'https://www.tib.eu/de/suchen/id/TIBKAT%3A1651890803', '@type': 'bibo:Book', 'P1053': 'Online-Ressource (XVIII, 390 p. 133 illus, digital)', 'description': 'Campusweiter Zugriff (Universität Hannover). - Vervielfältigungen (z.B. Kopien, Downloads) sind nur von einzelnen Kapiteln oder Seiten und nur zum eigenen wissenschaftlichen Gebrauch erlaubt. Keine Weitergabe an Dritte. Kein systematisches Downloaden durch Robots.', 'identifier': ['(doi)10.1007/978-3-642-33105-3', '(isbn13)9783642331053', '(firstid)BSZ:375375627', '(ppn)1651890803'], 'publisher': 'Springer', 'subject': ['(classificationName=linseach:mapping)mat', 'Physics', '(classificationName=msc)81S40', '(classificationName=ddc)530.143', 'Quantum theory', '(classificationName=rvk)UO 4000', '(classificationName=bk, id=

In [None]:
# Show the list of extracted record dicts collected in all_data
print(all_data)

[{'title': 'Statistical Approach to Quantum Field Theory : An Introduction', 'abstract': 'Over the past few decades the powerful methods of statistical physics and Euclidean quantum field theory have moved closer together, with common tools based on the use of path integrals. The interpretation of Euclidean field theories as particular systems of statistical physics has opened up new avenues for understanding strongly coupled quantum systems or quantum field theories at zero or finite temperatures. Accordingly, the first chapters of this book contain a self-contained introduction to path integrals in Euclidean quantum mechanics and statistical mechanics. The resulting high-dimensional integrals can be estimated with the help of Monte Carlo simulations based on Markov processes. The most commonly used algorithms are presented in detail so as to prepare the reader for the use of high-performance computers as an “experimental” tool for this burgeoning field of theoretical physics.Several 

In [None]:
# Build a DataFrame from the list of record dicts (one row per dict)
dataframe = pd.DataFrame(all_data)

In [None]:
# Destination of the CSV export (relative to the current working directory)
output_file = "output_data.csv"

# Write the records without the DataFrame index column, then confirm
dataframe.to_csv(path_or_buf=output_file, index=False)
print(f"Data saved to {output_file}")


Data saved to output_data.csv
