In [7]:
!pip install pdfplumber nltk scikit-learn

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import pdfplumber
import re
import json
import os
import glob
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split

nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
def extract_text_from_pdf(pdf_path):
    pages_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                pages_text.append(text)
    return "\n".join(pages_text)

def basic_clean(text: str) -> str:
    text = text.replace('\r', '\n')
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # collapse 3+ blank lines
    return text

def split_pdf_sections(clean_text, min_body_len=300):
    """
    Split PDF text into (title, body) sections using heuristics for headings.
    """
    lines = clean_text.split("\n")
    sections = []
    current_title = None
    current_lines = []

    def flush_section(title, buf):
        text = "\n".join(buf).strip()
        if title and text and len(text) >= min_body_len:
            sections.append((title.strip(), text))

    for line in lines:
        stripped = line.strip()
        # Heading heuristic: short-ish, all caps or numbered like "2.3 Something"
        if (len(stripped) > 0
            and len(stripped.split()) < 12
            and (stripped.isupper() or re.match(r'^\d+(\.\d+)*\s+', stripped))):
            flush_section(current_title, current_lines)
            current_title = stripped
            current_lines = []
        else:
            if stripped != "":
                current_lines.append(stripped)

    flush_section(current_title, current_lines)
    return sections

def split_markdown_sections(markdown_text, min_body_len=300):
    """
    Split markdown into (title, body) using headings starting with '#', '##', etc.
    """
    lines = markdown_text.split("\n")
    sections = []
    current_title = None
    current_lines = []

    def flush_section(title, buf):
        text = "\n".join(buf).strip()
        if title and text and len(text) >= min_body_len:
            sections.append((title.strip(), text))

    for line in lines:
        if line.strip().startswith("#"):
            # new heading
            flush_section(current_title, current_lines)
            current_title = re.sub(r'^#+', '', line).strip()
            current_lines = []
        else:
            if line.strip() != "":
                current_lines.append(line.strip())

    flush_section(current_title, current_lines)
    return sections

def first_n_sentences(text, n=3):
    sents = sent_tokenize(text)
    return " ".join(sents[:n])

def normalize_title_for_question(title: str) -> str:
    t = re.sub(r'^\d+(\.\d+)*\s*', '', title).strip()
    return t.lower()

def qa_from_sections(sections, source_tag):
    qa_pairs = []
    for title, body in sections:
        if len(body) < 300:
            continue
        clean_title = normalize_title_for_question(title)
        intro = first_n_sentences(body, n=3)
        if len(intro) < 50:
            continue

        # Q1: What is X?
        qa_pairs.append({
            "instruction": f"What is {clean_title} in data science?",
            "input": "",
            "output": intro,
            "source": source_tag,
            "section_title": title
        })

        # Q2: Explain X simply
        qa_pairs.append({
            "instruction": f"Explain {clean_title} in simple terms for a beginner in data science.",
            "input": "",
            "output": intro,
            "source": source_tag,
            "section_title": title
        })

        # Q3: When / why is X used?
        more = first_n_sentences(body, n=5)
        qa_pairs.append({
            "instruction": f"When or why would a data scientist use {clean_title}?",
            "input": "",
            "output": more,
            "source": source_tag,
            "section_title": title
        })

        # Q4: steps / workflow if algorithmic
        if any(k in body.lower() for k in ["algorithm", "procedure", "steps", "workflow", "process"]):
            algo_ans = first_n_sentences(body, n=6)
            qa_pairs.append({
                "instruction": f"Describe the typical steps involved in {clean_title} in a data science workflow.",
                "input": "",
                "output": algo_ans,
                "source": source_tag,
                "section_title": title
            })

    # Comparison questions between neighboring sections
    for i in range(1, len(sections)):
        t1, b1 = sections[i-1]
        t2, b2 = sections[i]
        c1 = normalize_title_for_question(t1)
        c2 = normalize_title_for_question(t2)
        ans1 = first_n_sentences(b1, 2)
        ans2 = first_n_sentences(b2, 2)
        if len(ans1) < 40 or len(ans2) < 40:
            continue
        qa_pairs.append({
            "instruction": f"How is {c1} different from {c2} in data science?",
            "input": "",
            "output": f"{c1}: {ans1} {c2}: {ans2}",
            "source": source_tag,
            "section_title": f"{t1} vs {t2}"
        })

    return qa_pairs

In [15]:
all_qa = []

# ISLR
print("Processing ISLR (islr.pdf)...")
islr_raw = extract_text_from_pdf("ISLP_website.pdf")
islr_clean = basic_clean(islr_raw)
islr_sections = split_pdf_sections(islr_clean, min_body_len=300)
print(f"  -> {len(islr_sections)} sections")
islr_qa = qa_from_sections(islr_sections, source_tag="book_islr")
print(f"  -> {len(islr_qa)} QA pairs")
all_qa.extend(islr_qa)

# FODS
print("\nProcessing Foundations of Data Science (fods.pdf)...")
fods_raw = extract_text_from_pdf("Foundations of Data Science.pdf")
fods_clean = basic_clean(fods_raw)
fods_sections = split_pdf_sections(fods_clean, min_body_len=300)
print(f"  -> {len(fods_sections)} sections")
fods_qa = qa_from_sections(fods_sections, source_tag="book_fods")
print(f"  -> {len(fods_qa)} QA pairs")
all_qa.extend(fods_qa)

print("\nQA count so far:", len(all_qa))


Processing ISLR (islr.pdf)...
  -> 1092 sections
  -> 4447 QA pairs

Processing Foundations of Data Science (fods.pdf)...
  -> 721 sections
  -> 3044 QA pairs

QA count so far: 7491


In [16]:
!pip install wikipedia
import wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=baadb640deab3bf4f5365208b55d84e03c20a165cae104e32c6ee9a737f78532
  Stored in directory: /root/.cache/pip/wheels/63/47/7c/a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [17]:
wiki_titles = [
    "Data science",
    "Machine learning",
    "Overfitting",
    "Cross-validation (statistics)",
    "Logistic regression",
    "Bias–variance tradeoff",
    "Random forest"
]

In [18]:
wiki_sections = []

for title in wiki_titles:
    try:
        page = wikipedia.page(title, auto_suggest=False)
        text = basic_clean(page.content)
        # split on headings like "== Something =="
        chunks = [c.strip() for c in re.split(r'\n==[^=]+==\n', text) if len(c.strip()) > 300]
        # first chunk is usually intro/definition
        for i, ch in enumerate(chunks):
            sec_title = f"{title} part {i+1}"
            wiki_sections.append((sec_title, ch))
    except Exception as e:
        print(f"Error fetching {title}: {e}")

print("Wikipedia sections:", len(wiki_sections))
wiki_qa = qa_from_sections(wiki_sections, source_tag="web_wikipedia")
print("Wikipedia QA pairs:", len(wiki_qa))

all_qa.extend(wiki_qa)
print("Total QA pairs after adding Wikipedia:", len(all_qa))

Wikipedia sections: 67
Wikipedia QA pairs: 310
Total QA pairs after adding Wikipedia: 7801


In [19]:
!pip install arxiv
import arxiv

Collecting arxiv
  Downloading arxiv-2.3.1-py3-none-any.whl.metadata (5.2 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.3.1-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=947b23d92095becc2943415ae62f132881f04bf79faa1524a513f35a6a059087
  Stored in directory: /root/.cache/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246
Successfully built sgmllib3k
Installing collected packag

In [20]:
search = arxiv.Search(
    query="machine learning survey OR data science introduction",
    max_results=20,
    sort_by=arxiv.SortCriterion.Relevance
)

arxiv_sections = []

for result in search.results():
    title = result.title.strip()
    abstract = result.summary.strip()
    if len(abstract) < 300:
        continue
    text = basic_clean(abstract)
    arxiv_sections.append((title, text))

print("arXiv sections:", len(arxiv_sections))
arxiv_qa = qa_from_sections(arxiv_sections, source_tag="web_arxiv")
print("arXiv QA pairs:", len(arxiv_qa))

all_qa.extend(arxiv_qa)
print("Total QA pairs after adding arXiv:", len(all_qa))

  for result in search.results():


arXiv sections: 20
arXiv QA pairs: 89
Total QA pairs after adding arXiv: 7890


In [21]:
import random

print("Sample QA pairs:\n")
for ex in random.sample(all_qa, min(5, len(all_qa))):
    print("Source:", ex["source"])
    print("Section:", ex["section_title"])
    print("Q:", ex["instruction"])
    print("A:", ex["output"][:300], "...\n---\n")

Sample QA pairs:

Source: book_islr
Section: 524 12. Unsupervised Learning
Q: When or why would a data scientist use 12. unsupervised learning?
A: Data Step 1 Iteration 1, Step 2a
Iteration 1, Step 2b Iteration 2, Step 2a Final Results
FIGURE 12.8. The progress of the K-means algorithm on the example of
Figure 12.7 with K=3. Top left: the observations are shown. Top center: in
Step 1 of the algorithm, each observation is randomly assigned to a ...
---

Source: book_fods
Section: 1 1
Q: Explain 1 in simple terms for a beginner in data science.
A: A = L + R and UΣVT being the singular value decomposition of A. This can be done
using Lagrange multipliers (??). Write R = R+ +R− where R+ ≥ 0 and R− ≥ 0. ...
---

Source: book_fods
Section: 2 i
Q: When or why would a data scientist use i?
A: p ji 1 2 0.85π i π i = 0.85π j p ji + 0. 2 85π i
j i
π = 1.48π p
i j ji
0.15π 0.15π
j i
Figure 4.13: Impact on pagerank of adding a self loop
list of webpages in response to each search query. To do this,

In [22]:
print("Total QA pairs:", len(all_qa))

train, test = train_test_split(all_qa, test_size=0.1, random_state=42)
train, val  = train_test_split(train,   test_size=0.1, random_state=42)

print("Train:", len(train), "Val:", len(val), "Test:", len(test))

with open("ds_train.json", "w", encoding="utf-8") as f:
    json.dump(train, f, indent=2, ensure_ascii=False)

with open("ds_val.json", "w", encoding="utf-8") as f:
    json.dump(val, f, indent=2, ensure_ascii=False)

with open("ds_test.json", "w", encoding="utf-8") as f:
    json.dump(test, f, indent=2, ensure_ascii=False)

print("Saved ds_train.json, ds_val.json, ds_test.json")

Total QA pairs: 7890
Train: 6390 Val: 711 Test: 789
Saved ds_train.json, ds_val.json, ds_test.json
