In [1]:
# Import necessary libraries
import os
import pandas as pd
import json
from git import Repo

# Clone the GitHub repository
git_url = "https://github.com/jd-coderepos/llms4subjects.git"
repo_dir = "llms4subjects"

if not os.path.exists(repo_dir):
    print("Cloning repository...")
    Repo.clone_from(git_url, repo_dir)
else:
    print("Repository already cloned.")

# Path to the target folder
data_folders = ["Article", "Book", "Conference", "Report", "Thesis"]
#data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", "tib-core-subjects", "data", "train", "Book", "en")

# Verify the folder exists
#if not os.path.exists(data_folder):
#    raise FileNotFoundError(f"The folder {data_folder} does not exist.")


Cloning repository...


In [2]:
def get_raw_dict_book(book, file_name, folder):
  raw = {}
  data = {}
  label = {}
  for item in book["@graph"]:
    if "title" in item:
      data["file_name"] = file_name
      data["folder"] = folder
      data["title"] = item["title"]
      data["abstract"] = item["abstract"]
      if ("creator" in item):
        gnd_creator = []
        if (isinstance(item["creator"], list)):
          gnd_creator = item["creator"]
        else:
          gnd_creator = [item["creator"]]
        name_creator = []
        for gnd in gnd_creator:
          for finder in book["@graph"]:
            if "@id" in finder and finder["@id"] == gnd:
              name_creator.append(finder["sameAs"])
        data["creator"] = name_creator
      if ("publisher" in item):
        data["publisher"] = item["publisher"]
      if ("@id" in item["dcterms:subject"]):
        item["dcterms:subject"] = [item["dcterms:subject"]]
      label["dcterms:subject"] = [x["@id"] for x in item["dcterms:subject"]]
      dcterms_name = []
      for dcterms in label["dcterms:subject"]:
        for finder in book["@graph"]:
          if "@id" in finder and finder["@id"] == dcterms:
            dcterms_name.append(finder["sameAs"])
      label["dcterms:subject_name"] = dcterms_name
      raw.update(data)
      raw.update(label)
  return raw


In [3]:

# Iterate through all JSON-LD files in the folder
langs = ["de","en"]
core_all = ["tib-core-subjects","all-subjects"]
for lang in langs:
  all_data = []
  for fold in core_all:
    for data_type in ["train", "dev"]:
      for folder in data_folders :
        data_folder = os.path.join(repo_dir, "shared-task-datasets", "TIBKAT", fold, "data", data_type, folder, lang)
        if not os.path.exists(data_folder):
          raise FileNotFoundError(f"The folder {data_folder} does not exist.")
        for i, file_name in enumerate(os.listdir(data_folder)):
            if file_name.endswith(".jsonld"):
                if (i % 100 == 0):
                  print(f"Processing file {i}: {file_name}")
                file_path = os.path.join(data_folder, file_name)
                with open(file_path, "r", encoding="utf-8") as f:
                    try:
                        # Load JSON-LD data
                        json_data = json.load(f)
                        preprocessed_data = get_raw_dict_book(json_data, file_name, folder)
                        all_data.append(preprocessed_data)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding {file_name}: {e}")
      dataframe = pd.DataFrame(all_data)
      output_file = f"tibkat_{lang}_{fold}_{data_type}.csv"
      dataframe.to_csv(output_file, index=False)

      print(f"Data saved to {output_file}")


Processing file 0: 3A1762285517.jsonld
Processing file 0: 3A1659040906.jsonld
Processing file 100: 3A277682614.jsonld
Processing file 200: 3A748992456.jsonld
Processing file 300: 3A1801171289.jsonld
Processing file 400: 3A895372649.jsonld
Processing file 500: 3A1657820750.jsonld
Processing file 600: 3A1678263028.jsonld
Processing file 700: 3A310621712.jsonld
Processing file 800: 3A1659077400.jsonld
Processing file 900: 3A617876797.jsonld
Processing file 1000: 3A599818875.jsonld
Processing file 1100: 3A1650551665.jsonld
Processing file 1200: 3A1629341835.jsonld
Processing file 1300: 3A1653211563.jsonld
Processing file 1400: 3A165361255X.jsonld
Processing file 1500: 3A1646398149.jsonld
Processing file 1600: 3A1832229963.jsonld
Processing file 1700: 3A1658874773.jsonld
Processing file 1800: 3A1655501259.jsonld
Processing file 1900: 3A302011080.jsonld
Processing file 2000: 3A502981717.jsonld
Processing file 2100: 3A1885725701.jsonld
Processing file 2200: 3A772521425.jsonld
Processing file 

In [1]:
import os
import pandas as pd
import json
df_train_en = pd.read_csv("tibkat_en_all-subjects_train.csv")
df_train_de = pd.read_csv("tibkat_de_all-subjects_train.csv")
df_train = pd.concat([df_train_en, df_train_de], ignore_index=True)
df_train.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A1770703179.jsonld,Article,Methods of usability testing for users with co...,One challenge of the user-centered development...,"['Krömker, Heidi']",Springer,"['gnd:4005541-3', 'gnd:4125909-9']","['Benutzerfreundlichkeit', 'Mensch-Maschine-Ko..."
1,3A1831640554.jsonld,Article,Chapter 1 Computable general equilibrium model...,This chapter describes computable general equi...,[],Elsevier,"['gnd:4066528-8', 'gnd:4132280-0', 'gnd:414825...","['Wirtschaftswissenschaften', 'Ökonometrie', '..."
2,3A1831633507.jsonld,Article,Chapter 2 From Market Jaws to the Newton Metho...,This chapter focuses on the geometry of how a ...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:406652...","['Methodologie', 'Experimentelle Wirtschaftsfo..."
3,3A730043908.jsonld,Article,Guidelines for Insurers' Governance,"On 28 April 2005, the OECD Council approved th...",[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']"
4,3A730025837.jsonld,Article,Challenges for financial intermediaries offeri...,The present article focuses on issues related ...,[],OECD Publishing,"['gnd:4068097-6', 'gnd:4073788-3', 'gnd:406748...","['Zukunft', 'Kreditmarkt', 'Zeitschrift']"


In [2]:
df_dev_en = pd.read_csv("tibkat_en_all-subjects_dev.csv")
df_dev_de = pd.read_csv("tibkat_de_all-subjects_dev.csv")
df_dev = pd.concat([df_dev_en, df_dev_de], ignore_index=True)
df_dev.head()

Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name
0,3A1770703179.jsonld,Article,Methods of usability testing for users with co...,One challenge of the user-centered development...,"['Krömker, Heidi']",Springer,"['gnd:4005541-3', 'gnd:4125909-9']","['Benutzerfreundlichkeit', 'Mensch-Maschine-Ko..."
1,3A1831640554.jsonld,Article,Chapter 1 Computable general equilibrium model...,This chapter describes computable general equi...,[],Elsevier,"['gnd:4066528-8', 'gnd:4132280-0', 'gnd:414825...","['Wirtschaftswissenschaften', 'Ökonometrie', '..."
2,3A1831633507.jsonld,Article,Chapter 2 From Market Jaws to the Newton Metho...,This chapter focuses on the geometry of how a ...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:406652...","['Methodologie', 'Experimentelle Wirtschaftsfo..."
3,3A730043908.jsonld,Article,Guidelines for Insurers' Governance,"On 28 April 2005, the OECD Council approved th...",[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']"
4,3A730025837.jsonld,Article,Challenges for financial intermediaries offeri...,The present article focuses on issues related ...,[],OECD Publishing,"['gnd:4068097-6', 'gnd:4073788-3', 'gnd:406748...","['Zukunft', 'Kreditmarkt', 'Zeitschrift']"


In [3]:
import json
import pandas as pd
with open("llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json", "r") as f:
    all_gnds = json.load(f)
all_gnds = pd.DataFrame(all_gnds)
all_gnds.head()



Unnamed: 0,Code,Classification Number,Classification Name,Name,Alternate Name,Related Subjects,Source,Definition,Source URL
0,gnd:4003694-7,0,Unspezifische Allgemeinwörter,Ausbreitung,[],[],Du.,,
1,gnd:4032393-6,0,Unspezifische Allgemeinwörter,Koordination,[Koordinierung],[],M 1.,Verknüpfe mit Anwendungsgebiet,
2,gnd:4038971-6,0,Unspezifische Allgemeinwörter,Methode,"[Methodik, Verfahren,Methode, Technik,Methode,...",[Methodologie],M,,
3,gnd:4043744-9,0,Unspezifische Allgemeinwörter,Ordnung,[],"[Unordnung, Ordnen]",M 1.,"Allgemeinbegriff, verknüpfe mit Anwendungsgebiet",
4,gnd:4048300-9,0,Unspezifische Allgemeinwörter,Rahmen,[],[],M,"Etwas, was einer Sache ein bestimmtes (äußere...",


In [4]:
from tqdm import tqdm
import pandas as pd

# Create a fast lookup dictionary
code_to_classification = all_gnds.set_index("Code")["Classification Number"].to_dict()

def get_classification_numbers(dc_subject):
    """Convert subject string into a list of classification numbers"""
    subjects = dc_subject.replace("[", "").replace("]", "").replace("'", "").split(",")
    return [code_to_classification.get(subject.strip(), None) for subject in subjects]

# Use apply() for vectorized processing
tqdm.pandas()
df_train["classification_numbers"] = df_train["dcterms:subject"].progress_apply(get_classification_numbers)

df_train.head()

100%|██████████| 130819/130819 [00:00<00:00, 134614.71it/s]


Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,classification_numbers
0,3A1770703179.jsonld,Article,Methods of usability testing for users with co...,One challenge of the user-centered development...,"['Krömker, Heidi']",Springer,"['gnd:4005541-3', 'gnd:4125909-9']","['Benutzerfreundlichkeit', 'Mensch-Maschine-Ko...","[30, 30]"
1,3A1831640554.jsonld,Article,Chapter 1 Computable general equilibrium model...,This chapter describes computable general equi...,[],Elsevier,"['gnd:4066528-8', 'gnd:4132280-0', 'gnd:414825...","['Wirtschaftswissenschaften', 'Ökonometrie', '...","[10.2a, 10.2ac, 30, 30, 10.11b]"
2,3A1831633507.jsonld,Article,Chapter 2 From Market Jaws to the Newton Metho...,This chapter focuses on the geometry of how a ...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:406652...","['Methodologie', 'Experimentelle Wirtschaftsfo...","[4.3, 10.2aa, 10.2a, 10.2aa, 18]"
3,3A730043908.jsonld,Article,Guidelines for Insurers' Governance,"On 28 April 2005, the OECD Council approved th...",[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']","[10.9c, 1, 2.3]"
4,3A730025837.jsonld,Article,Challenges for financial intermediaries offeri...,The present article focuses on issues related ...,[],OECD Publishing,"['gnd:4068097-6', 'gnd:4073788-3', 'gnd:406748...","['Zukunft', 'Kreditmarkt', 'Zeitschrift']","[1, 10.9c, 2.3]"


In [5]:
df_dev["classification_numbers"] = df_dev["dcterms:subject"].progress_apply(get_classification_numbers)

df_dev.head()

100%|██████████| 144485/144485 [00:00<00:00, 210504.42it/s]


Unnamed: 0,file_name,folder,title,abstract,creator,publisher,dcterms:subject,dcterms:subject_name,classification_numbers
0,3A1770703179.jsonld,Article,Methods of usability testing for users with co...,One challenge of the user-centered development...,"['Krömker, Heidi']",Springer,"['gnd:4005541-3', 'gnd:4125909-9']","['Benutzerfreundlichkeit', 'Mensch-Maschine-Ko...","[30, 30]"
1,3A1831640554.jsonld,Article,Chapter 1 Computable general equilibrium model...,This chapter describes computable general equi...,[],Elsevier,"['gnd:4066528-8', 'gnd:4132280-0', 'gnd:414825...","['Wirtschaftswissenschaften', 'Ökonometrie', '...","[10.2a, 10.2ac, 30, 30, 10.11b]"
2,3A1831633507.jsonld,Article,Chapter 2 From Market Jaws to the Newton Metho...,This chapter focuses on the geometry of how a ...,[],North Holland,"['gnd:4139716-2', 'gnd:4252654-1', 'gnd:406652...","['Methodologie', 'Experimentelle Wirtschaftsfo...","[4.3, 10.2aa, 10.2a, 10.2aa, 18]"
3,3A730043908.jsonld,Article,Guidelines for Insurers' Governance,"On 28 April 2005, the OECD Council approved th...",[],OECD Publishing,"['gnd:4073788-3', 'gnd:4068097-6', 'gnd:406748...","['Kreditmarkt', 'Zukunft', 'Zeitschrift']","[10.9c, 1, 2.3]"
4,3A730025837.jsonld,Article,Challenges for financial intermediaries offeri...,The present article focuses on issues related ...,[],OECD Publishing,"['gnd:4068097-6', 'gnd:4073788-3', 'gnd:406748...","['Zukunft', 'Kreditmarkt', 'Zeitschrift']","[1, 10.9c, 2.3]"


In [6]:
gnd_descriptions = pd.read_csv("gnd_subject_names_descriptions.csv")
gnd_descriptions.head()

Unnamed: 0,Code,Name,TIB Core,German Description,English Description
0,0.0,Unspezifische Allgemeinwörter,True,'Unspezifische Allgemeinwörter' bezeichnen Wör...,'Unspezifische Allgemeinwörter' (non-specific ...
1,1.0,"Allgemeines, Interdisziplinäre Allgemeinwörter",True,"'Allgemeines, Interdisziplinäre Allgemeinwörte...","'General, Interdisciplinary General Knowledge'..."
2,2.1,"Schrift, Handschriftenkunde",False,"'Schrift, Handschriftenkunde' ist ein Fachgebi...","'Schrift, Handschriftenkunde' (Paleography and..."
3,2.2,"Buchwissenschaft, Buchhandel",False,Buchwissenschaft und Buchhandel sind interdisz...,Book studies and book trade are interdisciplin...
4,2.3,Presse,False,Das Fach 'Presse' befasst sich mit der Erforsc...,The subject 'Press' deals with the study and a...


In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenizer and model
MODEL_NAME = "xlm-roberta-base"  # Can use "xlm-roberta-large" for better accuracy
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
model = XLMRobertaModel.from_pretrained(MODEL_NAME)
model.eval()  # Set model to evaluation mode


In [None]:
def get_embedding(text):
    """Convert text into an embedding using XLM-RoBERTa"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=32, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation


# Embed all subjects
subjects = gnd_descriptions["English Description"].tolist()
subject_embeddings = np.array([get_embedding(subj) for subj in subjects])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

def get_top_k_subjects(book_embedding, subject_embeddings, subject_df, k=5):
    """Find the top-k closest subjects for a given book embedding."""
    similarities = cosine_similarity([book_embedding], subject_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]  # Top-k indices in descending order
    return [(subject_df.iloc[i]["Name"], subject_df.iloc[i]["Code"], similarities[i]) for i in top_k_indices]

def embed_and_find_closest_subjects(df_train, gnd_descriptions, k=5):
    # Embed all subjects once
    subject_texts = gnd_descriptions["English Description"].tolist()
    subject_embeddings = np.array([get_embedding(text) for text in tqdm(subject_texts, desc="Embedding Subjects")])

    top_subjects_per_book = []
    book_embeddings = []
    # Process each book
    for _, row in tqdm(df_train.iterrows(), total=df_train.shape[0], desc="Processing Books"):
        full_text = row["title"] + " " + row["abstract"]
        book_embedding = get_embedding(full_text)  # Embed the current book
        book_embeddings.append(book_embedding)
        top_subjects = get_top_k_subjects(book_embedding, subject_embeddings, gnd_descriptions, k)
        top_subjects_per_book.append(top_subjects)
        if len(top_subjects_per_book) % 100 == 0:
            print(f"Processed {len(top_subjects_per_book)} books")
            print(f"title: {row['title']}, abstract: {row['abstract']}")
            print(f"Top {k} subjects: {top_subjects}")
            print(book_embeddings[:5])  # Print first 5 book embeddings
            print(subject_embeddings[:5])  # Print first 5 subject embeddings

    df_train["top_subjects"] = top_subjects_per_book
    return df_train

# Run the process
df_train = embed_and_find_closest_subjects(df_train, gnd_descriptions, k=5)

# Display results
df_train[["title", "top_subjects"]].head()


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained Sentence Transformer model (optimized for multilingual similarity)
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [7]:
import pandas as pd
import random
import wandb
from sentence_transformers import InputExample, SentenceTransformer, losses , evaluation
from torch.utils.data import DataLoader
import torch

# Convert DataFrames into training pairs
subject_to_description = dict(zip(gnd_descriptions["Name"], gnd_descriptions["English Description"]))
all_subject_descriptions = gnd_descriptions["English Description"].tolist()

def prepare_data(df):
    """Prepare book-subject pairs for contrastive learning."""
    data = []
    for _, row in df.iterrows():
        book_text = row["title"] + " " + row["abstract"]
        dc_subject = row["dcterms:subject_name"]
        correct_subjects = dc_subject.replace("[", "").replace("]", "").replace("'", "").split(",")

        # Add Positive Pairs (Correct Subject Descriptions)
        for subject in correct_subjects:
            subject_desc = subject_to_description.get(subject, subject)  # Use description if available
            data.append(InputExample(texts=[book_text, subject_desc]))

        # Add Negative Pairs (Incorrect Subject Descriptions)
        num_negatives = min(3, len(all_subject_descriptions) - len(correct_subjects))
        negative_descriptions = random.sample(
            [desc for desc in all_subject_descriptions if desc not in correct_subjects], num_negatives
        )

        for neg_desc in negative_descriptions:
            data.append(InputExample(texts=[book_text, neg_desc]))

    return data


train_data = prepare_data(df_train)
dev_data = prepare_data(df_dev)



In [None]:


# Load Pretrained Model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert data into DataLoader
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
dev_dataloader = DataLoader(dev_data, batch_size=16, shuffle=False)  # No shuffle for validation

# Use MultipleNegativesRankingLoss for contrastive learning
train_loss = losses.MultipleNegativesRankingLoss(model)

# Define evaluation metric using Cosine Similarity
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_data, name="dev")

# Training Arguments
epochs = 3
warmup_steps = 100
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Fine-tune the model using `fit()`
print("\n🚀 Starting Fine-Tuning...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path="fine_tuned_book_subject_model",
    save_best_model=True  # Saves the best model automatically
)

# Save fine-tuned model
model.save("fine_tuned_book_subject_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



🚀 Starting Fine-Tuning...


[34m[1mwandb[0m: Currently logged in as: [33mtaha-mtjn[0m ([33mtaha-mtjn-sharif-university-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
500,2.4196
1000,2.1088
1500,2.0726
2000,1.9967
2500,1.9982
3000,1.978
3500,1.9353
4000,1.9217
4500,1.9377
5000,1.9017


Step,Training Loss
500,2.4196
1000,2.1088
1500,2.0726
2000,1.9967
2500,1.9982
3000,1.978
3500,1.9353
4000,1.9217
4500,1.9377
5000,1.9017


In [None]:
# Load fine-tuned model
model = SentenceTransformer("fine_tuned_book_subject_model")

# Encode Books
df_train["full_text"] = df_train["title"] + " " + df_train["abstract"]
book_embeddings = model.encode(df_train["full_text"].tolist(), show_progress_bar=True)

# Encode Subjects
subject_embeddings = model.encode(gnd_descriptions["description"].tolist(), show_progress_bar=True)

# Compute Similarity & Find Best Matches
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_k_subjects(book_embedding, subject_embeddings, subject_df, k=5):
    """Find the top-k closest subjects for a given book embedding."""
    similarities = cosine_similarity([book_embedding], subject_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    return [(subject_df.iloc[i]["name"], subject_df.iloc[i]["number"], similarities[i]) for i in top_k_indices]

# Get Top 5 Subjects for Each Book
df_train["top_subjects"] = [
    get_top_k_subjects(book_embeddings[i], subject_embeddings, gnd_descriptions)
    for i in range(len(book_embeddings))
]

# Show Results
df_train[["title", "top_subjects"]].head()

In [20]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 