In [2]:
%pip install datasets pandas spacy tqdm

Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting spacy
  Using cached spacy-3.8.7-cp311-cp311-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-2.3.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.

In [3]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [4]:
from datasets import load_dataset

ds = load_dataset("trl-lib/tldr", split="train")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 116722/116722 [00:00<00:00, 803326.99 examples/s]
Generating validation split: 100%|██████████| 6447/6447 [00:00<00:00, 724058.21 examples/s]
Generating test split: 100%|██████████| 6553/6553 [00:00<00:00, 678146.41 examples/s]


In [5]:
import re

def split_prompt(example):
    text = example["prompt"]
    # Regex with DOTALL so that POST: can span multiple lines
    m = re.match(
        r"SUBREDDIT:\s*(?P<subreddit>.+?)\s+TITLE:\s*(?P<title>.+?)\s+POST:\s*(?P<post>.+?)\s+TL;DR:",
        text,
        flags=re.DOTALL,
    )
    if not m:
        return {"subreddit": None, "title": None, "post": text}
    return m.groupdict()

ds = ds.map(split_prompt, remove_columns=["prompt"])

Map: 100%|██████████| 116722/116722 [00:04<00:00, 24770.74 examples/s]


In [6]:
# ‘completion’ → ‘tldr’, drop any rows missing required fields
ds = ds.rename_column("completion", "tldr")
ds = ds.filter(lambda x: x["subreddit"] and x["title"] and x["post"] and x["tldr"])

Filter: 100%|██████████| 116722/116722 [00:00<00:00, 318727.21 examples/s]


In [7]:
import unicodedata

def clean_text(example):
    for col in ["title", "post", "tldr"]:
        text = example[col].strip()
        text = unicodedata.normalize("NFKC", text)
        example[col] = text
    return example

ds = ds.map(clean_text)

Map: 100%|██████████| 116722/116722 [00:04<00:00, 26842.21 examples/s]


In [8]:
import spacy
from tqdm.auto import tqdm

nlp = spacy.load("en_core_web_md")

def compute_similar(example):
    doc_post = nlp(example["post"])
    doc_tldr = nlp(example["tldr"])
    tldr_text = example["tldr"].lower()
    similar = {}
    for chunk in doc_post.noun_chunks:
        phrase = chunk.text.strip().lower()
        if len(phrase) < 3:
            continue
        # Binary importance if phrase appears in the TL;DR summary
        important = 1 if phrase in tldr_text else 0
        # Similarity score via spaCy vectors
        sim_score = float(doc_tldr.similarity(nlp(phrase)))
        if sim_score >= 0.75: # Threshold for similarity
            important = 1
        similar[phrase] = (important, sim_score)
    return {"similar": similar}

# Apply with a progress bar
records = []
for row in tqdm(ds, total=len(ds)):
    rec = dict(row)
    rec.update(compute_similar(row))
    records.append(rec)

# Convert back into a Dataset
from datasets import Dataset
ds = Dataset.from_pandas(pd.DataFrame(records))


  sim_score = float(doc_tldr.similarity(nlp(phrase)))
  7%|▋         | 8028/116722 [15:46<3:33:36,  8.48it/s]


KeyboardInterrupt: 

In [None]:
def keep_top_k(example, k: int = 30):
    items = sorted(
        example["similar"].items(),
        key=lambda kv: kv[1][1],
        reverse=True
    )[:k]
    example["similar"] = dict(items)
    return example

ds = ds.map(keep_top_k)

ds = ds.filter(lambda x: len(x["similar"]) > 0)

In [None]:
import json

# Convert the Dataset to a pandas DataFrame
df = ds.to_pandas()

# Serialize the nested `similar` dict into JSON strings
df['similar'] = df['similar'].apply(json.dumps)

# Write out to CSV (no index column)
df.to_csv("tldr_preprocessed.csv", index=False)

# loading the preprocessed data later:
# import pandas as pd
# df = pd.read_csv("tldr_preprocessed.csv")
# df['similar'] = df['similar'].apply(json.loads)