In [None]:
!pip install -U datasets fsspec aiohttp huggingface_hub --quiet

In [None]:
# Delete the local Hugging Face datasets cache directory.
# NOTE(review): this forces every dataset below to be downloaded and regenerated on each
# run (the arXiv archive alone is several GB) — presumably a workaround for a stale cache
# after the package upgrade above; confirm it is still needed.
!rm -rf ~/.cache/huggingface/datasets

In [None]:
from google.colab import drive
from datasets import load_dataset, Dataset
import os
import zipfile
import gdown
import pandas as pd


# Mount Google Drive under /content/drive/ so the backup step at the end of the
# notebook can copy files into /content/drive/MyDrive.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
def backup_colab_content_to_drive(folder_name='Colab Notebooks', src='/content', drive_root='/content/drive/MyDrive'):
  """Copy the top-level entries of `src` into a folder on the mounted Google Drive.

  The backup can be repeated: directories that already exist at the destination
  are merged into, and files are overwritten with the current version.

  Args:
    folder_name: Destination folder, relative to `drive_root`. Created if missing.
    src: Directory whose entries are backed up.
    drive_root: Directory under which `folder_name` is created.
  """
  import shutil
  import os

  dest = f'{drive_root}/{folder_name}'
  os.makedirs(dest, exist_ok=True)
  dest_abs = os.path.abspath(dest)

  for item in os.listdir(src):
    s = os.path.join(src, item)
    d = os.path.join(dest, item)
    s_abs = os.path.abspath(s)
    # Never copy the Drive mount point, nor any entry that contains the destination:
    # doing so would copy the backup into itself.
    holds_dest = dest_abs == s_abs or dest_abs.startswith(s_abs + os.sep)
    if item == 'drive' or holds_dest:
      continue
    if os.path.isdir(s):
      # dirs_exist_ok: without it a second backup fails with FileExistsError.
      shutil.copytree(s, d, dirs_exist_ok=True)
    else:
      shutil.copy2(s, d)

  print(f'📁 Backup complete. Files saved to: {dest}')


def download_and_extract_from_gdrive(gdrive_url, output_dir="open4b_data"):
    """Download a zip archive from a Google Drive share link and unpack it.

    Args:
        gdrive_url: Share link containing the file id as ``.../d/<file_id>/...``.
        output_dir: Directory the archive is extracted into (created if missing).

    Returns:
        List with the path of every file found under ``output_dir`` after extraction.

    Raises:
        RuntimeError: If gdown reports that nothing was downloaded.
    """
    # Convert Google Drive shareable link to direct download link
    file_id = gdrive_url.split("/d/")[1].split("/")[0]
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # The archive is always stored under this name in the current working directory
    zip_path = "open4b_data.zip"

    print("📥 Downloading zip file...")
    downloaded = gdown.download(download_url, zip_path, quiet=False)
    if downloaded is None:
        # Stop here: otherwise a zip left over from an earlier run would be
        # extracted silently, or zipfile would fail with an unrelated error.
        raise RuntimeError(f"Download failed for {download_url}")

    os.makedirs(output_dir, exist_ok=True)

    print("📦 Extracting files...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)

    # Walk the tree once; the same list is both reported and returned.
    file_paths = [
        os.path.join(root, name)
        for root, _, files in os.walk(output_dir)
        for name in files
    ]

    print("📁 Files extracted:")
    for path in file_paths:
        print(" -", path)

    return file_paths


def preprocess_dataset(dataset, text_key, summary_key, splits=('train', 'validation', 'test')):
  """Merge several splits of a dataset into one two-column pandas DataFrame.

  Args:
    dataset: Mapping from split name to a table that can be indexed by column name
      (e.g. the DatasetDict returned by `load_dataset`).
    text_key: Column holding the source text.
    summary_key: Column holding the summary.
    splits: Split names to concatenate, in this order.

  Returns:
    DataFrame with columns 'text' and 'summary', rows ordered split by split.

  Raises:
    KeyError: If a requested split or column is missing (raised by the lookup itself
      for dict-like inputs).
  """
  texts = []
  summaries = []

  for split in splits:
    texts.extend(dataset[split][text_key])
    summaries.extend(dataset[split][summary_key])

  return pd.DataFrame({'text': texts, 'summary': summaries})

In [None]:
# Substrings looked for in the lower-cased arXiv articles to keep business-related papers.
business_keywords = [
    'business',
    'price optimization',
    'supply chain',
    'supply network',
    'logistics',
    'marketing',
    'customer',
    'product management',
    'brand management',
    'recommender',
    'finance',
    'investment',
]

# Substrings used to keep data-science-related texts (XSum documents, open4b sources).
# Abbreviations carry a space on both sides, so they only match when surrounded by spaces.
ds_words = [
    'machine learning',
    ' ml ',
    'data science',
    ' ds ',
    'deep learning',
    ' dl ',
    'artificial intelligence',
    ' ai ',
    'reinforcement learning',
    'automatization',
    'big data',
    'data modeling',
    'natural language processing',
    ' nlp ',
    ' llm ',
    'computer vision',
    ' cv ',
    'neural network',
    ' nn ',
    'digital twin',
]

In [None]:
# Scientific papers dataset (arXiv configuration), restricted to business-related articles.
# trust_remote_code=True: the dataset ships a loading script; without the flag
# load_dataset stops at an interactive [y/N] prompt, which blocks "Run all".
dataset_arxiv = load_dataset("scientific_papers", "arxiv", trust_remote_code=True)
# Keep an article when any business keyword occurs in its lower-cased text.
dataset_arxiv = dataset_arxiv.filter(lambda x: any(k in x['article'].lower() for k in business_keywords))
print(dataset_arxiv)
df_arxiv = preprocess_dataset(dataset_arxiv, 'article', 'abstract')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

scientific_papers.py:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

The repository for scientific_papers contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/scientific_papers.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

Filter:   0%|          | 0/203037 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6436 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6440 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 5507
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 232
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 188
    })
})


In [None]:
# scitldr dataset loading (FullText configuration).
# trust_remote_code=True: the dataset ships a loading script; without the flag
# load_dataset stops at an interactive [y/N] prompt, which blocks "Run all".
dataset_ml = load_dataset("scitldr", "FullText", trust_remote_code=True)
print(dataset_ml)
df_ml = preprocess_dataset(dataset_ml, 'source', 'target')
# Each cell holds a sequence of strings (presumably sentences); glue them into one string.
for column in df_ml.columns:
  df_ml[column] = df_ml[column].apply(" ".join)

README.md:   0%|          | 0.00/8.81k [00:00<?, ?B/s]

scitldr.py:   0%|          | 0.00/7.21k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

The repository for scitldr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/scitldr.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/71.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1992 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/618 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/619 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 1992
    })
    test: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 618
    })
    validation: Dataset({
        features: ['source', 'source_labels', 'rouge_scores', 'paper_id', 'target'],
        num_rows: 619
    })
})


In [None]:
# xsum BBC news dataset loading, restricted to data-science-related documents.
# trust_remote_code=True: the dataset ships a loading script; without the flag
# load_dataset stops at an interactive [y/N] prompt, which blocks "Run all".
dataset_bbc = load_dataset("xsum", trust_remote_code=True)
# Keep a document when any data-science keyword occurs in its lower-cased text.
dataset_bbc = dataset_bbc.filter(lambda x: any(k in x['document'].lower() for k in ds_words))
print(dataset_bbc)
df_bbc = preprocess_dataset(dataset_bbc, 'document', 'summary')

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

The repository for xsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/xsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


(…)SUM-EMNLP18-Summary-Data-Original.tar.gz:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Filter:   0%|          | 0/204045 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11332 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11334 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 781
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 28
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 56
    })
})


In [None]:
file_list = download_and_extract_from_gdrive("https://drive.google.com/file/d/1w5mc6vxXrHIPRbRpoOxbUo8yTdVkW6l5/view")

# Sources and targets are listed in the same split order so that row i of the
# concatenated sources lines up with row i of the concatenated targets.
o4b_sources = ['train.source', 'test.source', 'val.source']
o4b_targets = ['train.target', 'test.target', 'val.target']


def _read_o4b_lines(paths, file_names):
  """Read the listed files (one example per line) and return all lines in order.

  For each name, the first path in `paths` that contains it is read. Line
  terminators are dropped so the examples do not end with a newline.
  """
  collected = []
  for file_name in file_names:
    matches = [p for p in paths if file_name in p]
    if not matches:
      raise FileNotFoundError(f'{file_name} not found among the extracted files')
    # Explicit encoding: do not depend on the platform default.
    with open(matches[0], "r", encoding="utf-8") as f:
      lines = [line.rstrip('\n') for line in f]
    print(f'{file_name} lenght is {len(lines)}')
    collected.extend(lines)
  return collected


o4b_source_full = _read_o4b_lines(file_list, o4b_sources)
print(f'Sources length is {len(o4b_source_full)}')

o4b_target_full = _read_o4b_lines(file_list, o4b_targets)
print(f'Targets length is {len(o4b_target_full)}')

# Fail with an explicit message when texts and summaries cannot be paired up.
if len(o4b_source_full) != len(o4b_target_full):
  raise ValueError(
      f'Sources ({len(o4b_source_full)}) and targets ({len(o4b_target_full)}) differ in length')

df_o4b = pd.DataFrame({'text': o4b_source_full, 'summary': o4b_target_full})

# Filter by technical keywords (joined into one case-insensitive regex alternation;
# the keywords contain no regex metacharacters)
pattern_ds = '|'.join(ds_words)
df_o4b = df_o4b[df_o4b['text'].str.contains(pattern_ds, case=False, na=False)]

In [None]:
# Collect the per-source frames and report the (rows, columns) of each one.
df_list = [df_arxiv, df_bbc, df_ml, df_o4b]
for frame in df_list:
  print(f'Shape {frame.shape}')

# Stack the frames row-wise into a single frame with a fresh 0..n-1 index.
df_full = pd.concat(df_list, ignore_index=True)
print(df_full.shape)

Shape (5927, 2)
Shape (865, 2)
Shape (3229, 2)
Shape (2388, 2)
(12409, 2)


In [None]:
# Persist the combined text/summary frame as a pickle in the current working directory.
# NOTE: only load pickle files back from locations you trust.
df_full.to_pickle('summ_data.pickle')

In [None]:
# Copy the contents of /content (everything except the Drive mount) to
# MyDrive/robot_dreams/backup/ so the results outlive the Colab session.
backup_colab_content_to_drive('robot_dreams/backup/')

📁 Backup complete. Files saved to: /content/drive/MyDrive/robot_dreams/backup/
