In [1]:
# Import necessary libraries
import os
import pandas as pd
import json
from git import Repo

# Clone the GitHub repository
git_url = "https://github.com/jd-coderepos/llms4subjects.git"
repo_dir = "llms4subjects"

if not os.path.exists(repo_dir):
    print("Cloning repository...")
    Repo.clone_from(git_url, repo_dir)
else:
    print("Repository already cloned.")

# Path to the target folder
data_folders = ["Article", "Book", "Conference", "Report", "Thesis"]
#data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", "tib-core-subjects", "data", "train", "Book", "en")

# Verify the folder exists
#if not os.path.exists(data_folder):
#    raise FileNotFoundError(f"The folder {data_folder} does not exist.")


Repository already cloned.


In [2]:
def get_raw_dict_book(book, file_name, folder):
  raw = {}
  data = {}
  label = {}
  for item in book["@graph"]:
    if "title" in item:
      data["file_name"] = file_name
      data["folder"] = folder
      data["title"] = item["title"]
      data["abstract"] = item["abstract"]
      if ("creator" in item):
        gnd_creator = []
        if (isinstance(item["creator"], list)):
          gnd_creator = item["creator"]
        else:
          gnd_creator = [item["creator"]]
        name_creator = []
        for gnd in gnd_creator:
          for finder in book["@graph"]:
            if "@id" in finder and finder["@id"] == gnd:
              name_creator.append(finder["sameAs"])
        data["creator"] = name_creator
      if ("publisher" in item):
        data["publisher"] = item["publisher"]
      if ("@id" in item["dcterms:subject"]):
        item["dcterms:subject"] = [item["dcterms:subject"]]
      label["dcterms:subject"] = [x["@id"] for x in item["dcterms:subject"]]
      dcterms_name = []
      for dcterms in label["dcterms:subject"]:
        for finder in book["@graph"]:
          if "@id" in finder and finder["@id"] == dcterms:
            dcterms_name.append(finder["sameAs"])
      label["dcterms:subject_name"] = dcterms_name
      raw.update(data)
      raw.update(label)
  return raw


In [3]:

# Iterate through all JSON-LD files in the folder
langs = ["de","en"]
core_all = ["tib-core-subjects","all-subjects"]
for lang in langs:
  all_data = []
  for fold in core_all:
    for data_type in ["train", "dev"]:
      for folder in data_folders :
        data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", fold, "data", data_type, folder, lang)
        if not os.path.exists(data_folder):
          raise FileNotFoundError(f"The folder {data_folder} does not exist.")
        for i, file_name in enumerate(os.listdir(data_folder)):
            if file_name.endswith(".jsonld"):
                #if (i % 100 == 0):
                  #print(f"Processing file {i}: {file_name}")
                file_path = os.path.join(data_folder, file_name)
                with open(file_path, "r", encoding="utf-8") as f:
                    try:
                        # Load JSON-LD data
                        json_data = json.load(f)
                        preprocessed_data = get_raw_dict_book(json_data, file_name, folder)
                        all_data.append(preprocessed_data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding {file_name}: {e}")
      dataframe = pd.DataFrame(all_data)
      output_file = f"tibkat_{lang}_{fold}_{data_type}.csv"
      dataframe.to_csv(output_file, index=False)

      print(f"Data saved to {output_file}")


Data saved to tibkat_de_tib-core-subjects_train.csv
Data saved to tibkat_de_tib-core-subjects_dev.csv
Data saved to tibkat_de_all-subjects_train.csv
Data saved to tibkat_de_all-subjects_dev.csv
Data saved to tibkat_en_tib-core-subjects_train.csv
Data saved to tibkat_en_tib-core-subjects_dev.csv
Data saved to tibkat_en_all-subjects_train.csv
Data saved to tibkat_en_all-subjects_dev.csv


In [1]:
import os
import pandas as pd
import json
df_train_en = pd.read_csv("tibkat_en_all-subjects_train.csv")
df_train_de = pd.read_csv("tibkat_de_all-subjects_train.csv")
df_train = pd.concat([df_train_en, df_train_de], ignore_index=True)
df_train.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']"
1,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E..."
2,3A73000726X.jsonld,Article,Private pensions and the financial crisis : Ho...,The current economic and financial crisis has ...,[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']"
3,3A1831632845.jsonld,Article,Chapter 68 The Combinatorial Auction,This chapter summarizes a market mechanism for...,[],North Holland,"['gnd:4139716-2', 'gnd:4015999-1', 'gnd:412447...","['Methodologie', 'Experiment', 'Wirtschaftsfor..."
4,3A730041743.jsonld,Article,OECD Financial Outreach Activities in 2003,"Following the end of the “Cold War”, the OECD ...",[],OECD Publishing,"['gnd:4068097-6', 'gnd:4067488-5', 'gnd:407378...","['Zukunft', 'Zeitschrift', 'Kreditmarkt']"


In [2]:
df_subjects = pd.read_csv("/kaggle/input/title-descriptions/title_description.csv")
df_train  = df_train.merge(df_subjects, on="title", how="left")
df_train.head()

Unnamed: 0.1,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,Unnamed: 0,top_subjects
0,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']",67,"[('Bank', '10.9b', 0.21772431), ('Wirtschaftsp..."
1,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']",28409,"[('Bank', '10.9b', 0.21772431), ('Wirtschaftsp..."
2,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E...",127,"[('Wirtschaftspolitik', '10.4', 0.19230132), (..."
3,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E...",28690,"[('Wirtschaftspolitik', '10.4', 0.19230132), (..."
4,3A73000726X.jsonld,Article,Private pensions and the financial crisis : Ho...,The current economic and financial crisis has ...,[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']",223,"[('Wirtschaftspolitik', '10.4', 0.18782546), (..."


In [3]:
import ast
df_train["top_subjects"] = df_train["top_subjects"].apply(ast.literal_eval)
df_train.head()

Unnamed: 0.1,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,Unnamed: 0,top_subjects
0,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']",67,"[(Bank, 10.9b, 0.21772431), (Wirtschaftspoliti..."
1,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']",28409,"[(Bank, 10.9b, 0.21772431), (Wirtschaftspoliti..."
2,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E...",127,"[(Wirtschaftspolitik, 10.4, 0.19230132), (Lebe..."
3,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E...",28690,"[(Wirtschaftspolitik, 10.4, 0.19230132), (Lebe..."
4,3A73000726X.jsonld,Article,Private pensions and the financial crisis : Ho...,The current economic and financial crisis has ...,[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']",223,"[(Wirtschaftspolitik, 10.4, 0.18782546), (Bank..."


In [4]:
df_dev_en = pd.read_csv("tibkat_en_all-subjects_dev.csv")
df_dev_de = pd.read_csv("tibkat_de_all-subjects_dev.csv")
df_dev = pd.concat([df_dev_en, df_dev_de], ignore_index=True)
df_dev.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']"
1,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E..."
2,3A73000726X.jsonld,Article,Private pensions and the financial crisis : Ho...,The current economic and financial crisis has ...,[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']"
3,3A1831632845.jsonld,Article,Chapter 68 The Combinatorial Auction,This chapter summarizes a market mechanism for...,[],North Holland,"['gnd:4139716-2', 'gnd:4015999-1', 'gnd:412447...","['Methodologie', 'Experiment', 'Wirtschaftsfor..."
4,3A730041743.jsonld,Article,OECD Financial Outreach Activities in 2003,"Following the end of the “Cold War”, the OECD ...",[],OECD Publishing,"['gnd:4068097-6', 'gnd:4067488-5', 'gnd:407378...","['Zukunft', 'Zeitschrift', 'Kreditmarkt']"


In [5]:
df_train = df_train.drop_duplicates(subset=["title"])
df_train.head()

Unnamed: 0.1,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,Unnamed: 0,top_subjects
0,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']",67,"[(Bank, 10.9b, 0.21772431), (Wirtschaftspoliti..."
2,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E...",127,"[(Wirtschaftspolitik, 10.4, 0.19230132), (Lebe..."
4,3A73000726X.jsonld,Article,Private pensions and the financial crisis : Ho...,The current economic and financial crisis has ...,[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']",223,"[(Wirtschaftspolitik, 10.4, 0.18782546), (Bank..."
6,3A1831632845.jsonld,Article,Chapter 68 The Combinatorial Auction,This chapter summarizes a market mechanism for...,[],North Holland,"['gnd:4139716-2', 'gnd:4015999-1', 'gnd:412447...","['Methodologie', 'Experiment', 'Wirtschaftsfor...",82,"[(Wirtschaftspolitik, 10.4, 0.21392943), (Wirt..."
7,3A730041743.jsonld,Article,OECD Financial Outreach Activities in 2003,"Following the end of the “Cold War”, the OECD ...",[],OECD Publishing,"['gnd:4068097-6', 'gnd:4067488-5', 'gnd:407378...","['Zukunft', 'Zeitschrift', 'Kreditmarkt']",164,"[(Wirtschaftspolitik, 10.4, 0.33859536), (Bank..."


In [6]:
df_train.shape[0]

96353

In [7]:
import json
import pandas as pd
with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json", "r") as f:
    all_gnds = json.load(f)
all_gnds = pd.DataFrame(all_gnds)
all_gnds.head()



Unnamed: 0,Code,Classification Number,Classification Name,Name,Alternate Name,Related Subjects,Source,Definition,Source URL
0,gnd:4003694-7,0,Unspezifische Allgemeinwörter,Ausbreitung,[],[],Du.,,
1,gnd:4032393-6,0,Unspezifische Allgemeinwörter,Koordination,[Koordinierung],[],M 1.,Verknüpfe mit Anwendungsgebiet,
2,gnd:4038971-6,0,Unspezifische Allgemeinwörter,Methode,"[Methodik, Verfahren,Methode, Technik,Methode,...",[Methodologie],M,,
3,gnd:4043744-9,0,Unspezifische Allgemeinwörter,Ordnung,[],"[Unordnung, Ordnen]",M 1.,"Allgemeinbegriff, verknüpfe mit Anwendungsgebiet",
4,gnd:4048300-9,0,Unspezifische Allgemeinwörter,Rahmen,[],[],M,"Etwas, was einer Sache ein bestimmtes (äußere...",


In [8]:
from tqdm import tqdm
import pandas as pd

# Create a fast lookup dictionary
code_to_classification = all_gnds.set_index("Code")["Classification Number"].to_dict()

def get_classification_numbers(dc_subject):
    """Convert subject string into a list of classification numbers"""
    subjects = dc_subject.replace("[", "").replace("]", "").replace("'", "").split(",")
    return [code_to_classification.get(subject.strip(), None) for subject in subjects]

# Use apply() for vectorized processing
tqdm.pandas()
df_train["classification_numbers"] = df_train["dcterms:subject"].progress_apply(get_classification_numbers)

df_train.head()

100%|██████████| 96353/96353 [00:00<00:00, 339892.69it/s]


Unnamed: 0.1,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,Unnamed: 0,top_subjects,classification_numbers
0,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']",67,"[(Bank, 10.9b, 0.21772431), (Wirtschaftspoliti...","[2.3, 10.9c, 1]"
2,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E...",127,"[(Wirtschaftspolitik, 10.4, 0.19230132), (Lebe...","[18, 10.2a, 10.2aa, 10.2aa, 4.3]"
4,3A73000726X.jsonld,Article,Private pensions and the financial crisis : Ho...,The current economic and financial crisis has ...,[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']",223,"[(Wirtschaftspolitik, 10.4, 0.18782546), (Bank...","[10.9c, 1, 2.3]"
6,3A1831632845.jsonld,Article,Chapter 68 The Combinatorial Auction,This chapter summarizes a market mechanism for...,[],North Holland,"['gnd:4139716-2', 'gnd:4015999-1', 'gnd:412447...","['Methodologie', 'Experiment', 'Wirtschaftsfor...",82,"[(Wirtschaftspolitik, 10.4, 0.21392943), (Wirt...","[4.3, 18, 10.2aa, 10.2aa, 10.2a]"
7,3A730041743.jsonld,Article,OECD Financial Outreach Activities in 2003,"Following the end of the “Cold War”, the OECD ...",[],OECD Publishing,"['gnd:4068097-6', 'gnd:4067488-5', 'gnd:407378...","['Zukunft', 'Zeitschrift', 'Kreditmarkt']",164,"[(Wirtschaftspolitik, 10.4, 0.33859536), (Bank...","[1, 2.3, 10.9c]"


In [9]:
df_dev["classification_numbers"] = df_dev["dcterms:subject"].progress_apply(get_classification_numbers)

df_dev.head()

100%|██████████| 144485/144485 [00:00<00:00, 168380.63it/s]


Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,classification_numbers
0,3A730002071.jsonld,Article,New challenges in the use of governement debt ...,Government debt issuance procedures and polici...,[],OECD Publishing,"['gnd:4067488-5', 'gnd:4073788-3', 'gnd:406809...","['Zeitschrift', 'Kreditmarkt', 'Zukunft']","[2.3, 10.9c, 1]"
1,3A1831632497.jsonld,Article,Chapter 102 The Becker–DeGroot–Marschak Mechan...,Experimentalists have been so sure of the ince...,[],North Holland,"['gnd:4015999-1', 'gnd:4066528-8', 'gnd:425265...","['Experiment', 'Wirtschaftswissenschaften', 'E...","[18, 10.2a, 10.2aa, 10.2aa, 4.3]"
2,3A73000726X.jsonld,Article,Private pensions and the financial crisis : Ho...,The current economic and financial crisis has ...,[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']","[10.9c, 1, 2.3]"
3,3A1831632845.jsonld,Article,Chapter 68 The Combinatorial Auction,This chapter summarizes a market mechanism for...,[],North Holland,"['gnd:4139716-2', 'gnd:4015999-1', 'gnd:412447...","['Methodologie', 'Experiment', 'Wirtschaftsfor...","[4.3, 18, 10.2aa, 10.2aa, 10.2a]"
4,3A730041743.jsonld,Article,OECD Financial Outreach Activities in 2003,"Following the end of the “Cold War”, the OECD ...",[],OECD Publishing,"['gnd:4068097-6', 'gnd:4067488-5', 'gnd:407378...","['Zukunft', 'Zeitschrift', 'Kreditmarkt']","[1, 2.3, 10.9c]"


In [10]:
gnd_descriptions = pd.read_csv("/kaggle/input/gnd-subject-names-descriptions-csv/gnd_subject_names_descriptions.csv")
gnd_descriptions.head()

Unnamed: 0,Code,Name,TIB Core,German Description,English Description
0,0.0,Unspezifische Allgemeinwörter,True,'Unspezifische Allgemeinwörter' bezeichnen Wör...,'Unspezifische Allgemeinwörter' (non-specific ...
1,1.0,"Allgemeines, Interdisziplinäre Allgemeinwörter",True,"'Allgemeines, Interdisziplinäre Allgemeinwörte...","'General, Interdisciplinary General Knowledge'..."
2,2.1,"Schrift, Handschriftenkunde",False,"'Schrift, Handschriftenkunde' ist ein Fachgebi...","'Schrift, Handschriftenkunde' (Paleography and..."
3,2.2,"Buchwissenschaft, Buchhandel",False,Buchwissenschaft und Buchhandel sind interdisz...,Book studies and book trade are interdisciplin...
4,2.3,Presse,False,Das Fach 'Presse' befasst sich mit der Erforsc...,The subject 'Press' deals with the study and a...


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

def get_top_k_subjects(book_embedding, subject_embeddings, subject_df, k=5):
    """Find the top-k closest subjects for a given book embedding."""
    similarities = cosine_similarity([book_embedding], subject_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]  # Top-k indices in descending order
    return [(subject_df.iloc[i]["Name"], subject_df.iloc[i]["Code"], similarities[i]) for i in top_k_indices]

def embed_and_find_closest_subjects(df_train, gnd_descriptions, k=5):
    # Embed all subjects once
    subject_texts = gnd_descriptions["English Description"].tolist()
    subject_embeddings = np.array([get_embedding(text) for text in tqdm(subject_texts, desc="Embedding Subjects")])

    top_subjects_per_book = []
    book_embeddings = []
    # Process each book
    for _, row in tqdm(df_train.iterrows(), total=df_train.shape[0], desc="Processing Books"):
        full_text = row["title"] + " " + row["abstract"]
        book_embedding = get_embedding(full_text)  # Embed the current book
        book_embeddings.append(book_embedding)
        top_subjects = get_top_k_subjects(book_embedding, subject_embeddings, gnd_descriptions, k)
        top_subjects_per_book.append(top_subjects)
        if len(top_subjects_per_book) % 100 == 0:
            print(f"Processed {len(top_subjects_per_book)} books")
            print(f"title: {row['title']}, abstract: {row['abstract']}")
            print(f"Top {k} subjects: {top_subjects}")
            print(book_embeddings[:5])  # Print first 5 book embeddings
            print(subject_embeddings[:5])  # Print first 5 subject embeddings

    df_train["top_subjects"] = top_subjects_per_book
    return df_train

# Run the process
df_train = embed_and_find_closest_subjects(df_train, gnd_descriptions, k=5)

# Display results
df_train[["title", "top_subjects"]].head()


In [22]:
import pandas as pd
import random
import wandb
from sentence_transformers import InputExample, SentenceTransformer, losses , evaluation
from torch.utils.data import DataLoader
import torch

# Convert DataFrames into training pairs
subject_to_description = dict(zip(gnd_descriptions["Name"], gnd_descriptions["English Description"]))
all_subject_descriptions = gnd_descriptions["English Description"].tolist()

def get_hard_negatives(row, k=3):
    """
    Extracts top-k hard negatives (highest similarity incorrect subjects) for training.
    """
    true_subjects = set(row["dcterms:subject_name"].replace("[", "").replace("]", "").replace("'", "").split(","))
    top_subjects = row["top_subjects"]  # List of (subject_name, subject_code, similarity_score)

    # Exclude true subjects
    hard_negatives = [subj for subj in top_subjects if subj[0] not in true_subjects]

    # Sort by similarity score (descending) and take top-k
    hard_negatives = sorted(hard_negatives, key=lambda x: x[2], reverse=True)[:k]

    # Return only subject names (or full tuples if needed)
    return [subj[0] for subj in hard_negatives]  # Return names only

# Apply to all rows in df_train
df_train["hard_negatives"] = df_train.apply(lambda row: get_hard_negatives(row, k=3), axis=1)



def prepare_data(df):
    """Prepare book-subject pairs for contrastive learning."""
    data = []
    for _, row in df.iterrows():
        book_text = row["title"] + " " + row["abstract"]
        dc_subject = row["dcterms:subject_name"]
        correct_subjects = dc_subject.replace("[", "").replace("]", "").replace("'", "").split(",")

        # Add Positive Pairs (Correct Subject Descriptions)
        for subject in correct_subjects:
            subject_desc = subject_to_description.get(subject, subject)  # Use description if available
            data.append(InputExample(texts=[book_text, subject_desc]))

        for neg_subj in row["hard_negatives"]:
            neg_desc = subject_to_description.get(neg_subj, neg_subj)
            data.append(InputExample(texts=[book_text, neg_desc]))

    return data


train_data = prepare_data(df_train)
#dev_data = prepare_data(df_dev)
print("done")


done


In [23]:
# Load Pretrained Model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert data into DataLoader
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
#dev_dataloader = DataLoader(dev_data, batch_size=16, shuffle=False)  # No shuffle for validation

# Use MultipleNegativesRankingLoss for contrastive learning
train_loss = losses.MultipleNegativesRankingLoss(model)

# Define evaluation metric using Cosine Similarity
#evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_data, name="dev")

# Training Arguments
epochs = 4
warmup_steps = 200
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Fine-tune the model using `fit()`
print("\n🚀 Starting Fine-Tuning...")
for epoch in range(epochs):
    print(f"\n🔄 Epoch {epoch + 1}/{epochs}")

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,  
        warmup_steps=warmup_steps,
        output_path="fine_tuned_book_subject_model",
        save_best_model=True
    )

    df_train["full_text"] = df_train["title"] + " " + df_train["abstract"]
    book_embeddings = model.encode(df_train["full_text"].tolist(), show_progress_bar=True)
    
    # Encode Subjects
    subject_embeddings = model.encode(gnd_descriptions["English Description"].tolist(), show_progress_bar=True)
    
    
    predictions = [
    get_top_k_subjects(book_embeddings[i], subject_embeddings, gnd_descriptions)
    for i in tqdm(range(len(book_embeddings)))]

    evaluate_predictions(df_train,predictions)


    # ✅ Save model to wandb after each epoch
    model_path = f"fine_tuned_checkpoints/model_epoch_{epoch+1}.pth"
    model.save(model_path)

    # ✅ Log model to wandb
    wandb.save(model_path)
    wandb.log({"epoch": epoch + 1, "model_saved": model_path})
    print(f"✅ Model saved at {model_path} and logged to WandB.")

wandb.finish()



🚀 Starting Fine-Tuning...

🔄 Epoch 1/4


Step,Training Loss
500,2.4115
1000,2.0855
1500,1.9754
2000,1.9009
2500,1.8361
3000,1.7736
3500,1.7527
4000,1.7168
4500,1.7
5000,1.669


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Batches:   0%|          | 0/3012 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

100%|██████████| 96353/96353 [02:09<00:00, 742.82it/s]
96353it [32:15, 49.79it/s]


📊 Model Evaluation Results:
✅ Precision@5: 0.0000
✅ Recall@5: 0.0000
✅ F1-score@5: 0.0000
✅ Model saved at fine_tuned_checkpoints/model_epoch_1.pth and logged to WandB.

🔄 Epoch 2/4


Step,Training Loss
500,1.38
1000,1.3259
1500,1.3197
2000,1.3153
2500,1.3095
3000,1.2862
3500,1.2787
4000,1.2688
4500,1.2808
5000,1.2643


Batches:   0%|          | 0/3012 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

100%|██████████| 96353/96353 [02:07<00:00, 758.42it/s]
96353it [31:07, 51.60it/s]


📊 Model Evaluation Results:
✅ Precision@5: 0.0000
✅ Recall@5: 0.0000
✅ F1-score@5: 0.0000
✅ Model saved at fine_tuned_checkpoints/model_epoch_2.pth and logged to WandB.

🔄 Epoch 3/4


Step,Training Loss
500,1.1507
1000,1.0657
1500,1.0654
2000,1.0658
2500,1.07
3000,1.052
3500,1.0434
4000,1.0411
4500,1.0568
5000,1.0429


Batches:   0%|          | 0/3012 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

100%|██████████| 96353/96353 [02:07<00:00, 753.57it/s]
96353it [33:42, 47.63it/s]


📊 Model Evaluation Results:
✅ Precision@5: 0.0000
✅ Recall@5: 0.0000
✅ F1-score@5: 0.0000
✅ Model saved at fine_tuned_checkpoints/model_epoch_3.pth and logged to WandB.

🔄 Epoch 4/4


Step,Training Loss
500,0.9762
1000,0.8547
1500,0.8551
2000,0.8562
2500,0.8645
3000,0.8517
3500,0.8424
4000,0.8447
4500,0.8633
5000,0.8475


Batches:   0%|          | 0/3012 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

100%|██████████| 96353/96353 [02:08<00:00, 746.99it/s]
96353it [32:33, 49.34it/s]


📊 Model Evaluation Results:
✅ Precision@5: 0.0000
✅ Recall@5: 0.0000
✅ F1-score@5: 0.0000
✅ Model saved at fine_tuned_checkpoints/model_epoch_4.pth and logged to WandB.


0,1
epoch,▁▃▁▃▆█
train/epoch,▁▂▃▃▅▆▇▇▇▇▃▃▃▄▅▅▅▆▇▇▂▃▄▅▅▆▇█▂▃▄▇▇█▂▃▄▄▅█
train/global_step,▂▃▃▄▄▄▆██▁▂▂▃▄▄▆▆▇██▂▃▄▆▇▂▂▃▄▅▆▆▆▇▇▂▂▂▄█
train/grad_norm,▅▂▄▄▅▅▃▆▄▅▆▄▅▄▅▃▅▃▄▄▃▄▅▆▅▅▄▆▄▅▅▅▁▅▆█▆▇█▆
train/learning_rate,██▇▆▆▅▄▃▂█▇▇▆▆▅▄▂▂▂▁▇▇▇▄▃▃▂█▇▇▃▂▇▆▅▃▃▂▂▁
train/loss,█▅▅▅▄▄▄▄▄▆▅▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▂▂

0,1
epoch,4
model_saved,fine_tuned_checkpoin...
total_flos,0
train/epoch,1
train/global_step,17756
train/grad_norm,13.80268
train/learning_rate,1e-05
train/loss,1.0084
train_loss,0.91081
train_runtime,4301.0702


In [24]:
# Load fine-tuned model
model = SentenceTransformer("fine_tuned_book_subject_model")

# Encode Books
df_train["full_text"] = df_train["title"] + " " + df_train["abstract"]
book_embeddings = model.encode(df_train["full_text"].tolist(), show_progress_bar=True)

# Encode Subjects
subject_embeddings = model.encode(gnd_descriptions["English Description"].tolist(), show_progress_bar=True)

# Compute Similarity & Find Best Matches
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_k_subjects(book_embedding, subject_embeddings, subject_df, k=5):
    """Find the top-k closest subjects for a given book embedding."""
    similarities = cosine_similarity([book_embedding], subject_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    return [(subject_df.iloc[i]["Name"], subject_df.iloc[i]["Code"], similarities[i]) for i in top_k_indices]

df_train["top_subjects"] = [
    get_top_k_subjects(book_embeddings[i], subject_embeddings, gnd_descriptions)
    for i in range(len(book_embeddings))
]

# Show Results
df_train[["title", "top_subjects"]].head()



Batches:   0%|          | 0/3012 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Unnamed: 0,title,top_subjects
0,New challenges in the use of governement debt ...,"[(Bank, 10.9b, 0.37885815), (Außenpolitik, 8.3..."
2,Chapter 102 The Becker–DeGroot–Marschak Mechan...,"[(Wirtschaftsgeschichte, 10.1, 0.34183115), (V..."
4,Private pensions and the financial crisis : Ho...,"[(Finanzrecht, 7.9a, 0.4412047), (Bank, 10.9b,..."
6,Chapter 68 The Combinatorial Auction,"[(Wirtschaftsgeschichte, 10.1, 0.39000505), (V..."
7,OECD Financial Outreach Activities in 2003,"[(Bank, 10.9b, 0.36117074), (Finanzrecht, 7.9a..."


In [40]:
model.save("fine_tuned_book_subject_model.pth")
torch.save(model,"fine_tuned_book_subject_model2.pth")

# ✅ Use Artifacts for Better Syncing
artifact = wandb.Artifact("fine_tuned_model", type="model")
artifact.add_file("/kaggle/working/fine_tuned_book_subject_model2.pth")
wandb.init()
wandb.log_artifact(artifact)

# ✅ Force sync before session ends
wandb.finish()

In [26]:
df_train[["title", "top_subjects"]].to_csv("tib_descriptons2.csv")

In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_predictions(df):
    """Compute Precision@5, Recall@5, and F1@5 for subject classification."""
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for i, row in tqdm(df.iterrows()):
        true_subjects = set(row["classification_numbers"])
        predicted_subjects = set([subj[1] for subj in row["top_subjects"] ])  # Extract predicted names

        if not true_subjects:  # Skip if no ground truth subjects
            continue

        # Compute Precision@5, Recall@5, and F1-score@5
        correct_predictions = len(predicted_subjects & true_subjects)
        precision_at_5 = correct_predictions / 5  # k=5
        recall_at_5 = correct_predictions / len(true_subjects)  # Relative to actual labels
        f1_at_5 = 2 * (precision_at_5 * recall_at_5) / (precision_at_5 + recall_at_5) if (precision_at_5 + recall_at_5) > 0 else 0

        precision_scores.append(precision_at_5)
        recall_scores.append(recall_at_5)
        f1_scores.append(f1_at_5)

    # Compute dataset-wide averages
    avg_precision = sum(precision_scores) / len(precision_scores)
    avg_recall = sum(recall_scores) / len(recall_scores)
    avg_f1 = sum(f1_scores) / len(f1_scores)

    print("📊 Model Evaluation Results:")
    print(f"✅ Precision@5: {avg_precision:.4f}")
    print(f"✅ Recall@5: {avg_recall:.4f}")
    print(f"✅ F1-score@5: {avg_f1:.4f}")

evaluate_predictions(df_train)

96353it [00:04, 21043.72it/s]

📊 Model Evaluation Results:
✅ Precision@5: 0.0982
✅ Recall@5: 0.2473
✅ F1-score@5: 0.1318





In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_k_subjects(book_embedding, subject_embeddings, subject_df, k=5):
    """Find the top-k closest subjects for a given book embedding."""
    similarities = cosine_similarity([book_embedding], subject_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    return [(subject_df.iloc[i]["Name"], subject_df.iloc[i]["Code"], similarities[i]) for i in top_k_indices]
