<a href="https://colab.research.google.com/github/Agniva2006/llm_fine_tuning-google-flan-t5-base-220M-params-/blob/main/datasetMaker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wikipedia
!pip install datasets

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=c5a769d40712b11b5e6c51bd6251084abd3abd4ccb16fd3b29b602324639fd9b
  Stored in directory: /root/.cache/pip/wheels/63/47/7c/a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [2]:
import requests
from bs4 import BeautifulSoup
import re
import wikipedia
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

In [3]:
def clean_text(text):
    """Clean text by removing references, extra spaces, URLs"""
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'\([^\)]*\)', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [4]:

dataset = load_dataset("pubmed_qa", "pqa_labeled")

cancer_keywords = ["cancer", "tumor", "carcinoma", "leukemia", "melanoma"]

def is_cancer_related(example):
    return any(k.lower() in example["question"].lower() for k in cancer_keywords)

cancer_pubmed = dataset["train"].filter(is_cancer_related)

pubmed_qas = []
for ex in cancer_pubmed:
    answer = ex['long_answer'] if ex['long_answer'] else ex['final_decision']
    pubmed_qas.append({
        "question": ex['question'],
        "context": answer,
        "answer": answer
    })
print(f" PubMed cancer Q&A examples: {len(pubmed_qas)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

 PubMed cancer Q&A examples: 114


In [5]:
def scrape_nci_faq(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    questions, answers = [], []
    for q_tag in soup.select("h3"):
        a_tag = q_tag.find_next_sibling("p")
        if a_tag:
            questions.append(clean_text(q_tag.get_text(strip=True)))
            answers.append(clean_text(a_tag.get_text(strip=True)))
    return [{"question": q, "context": a, "answer": a} for q, a in zip(questions, answers)]

nci_url = "https://www.cancer.gov/about-cancer/understanding/what-is-cancer"
nci_qas = scrape_nci_faq(nci_url)
print(f" NCI FAQ examples: {len(nci_qas)}")

 NCI FAQ examples: 8


In [16]:
def scrape_wikipedia(topic="Cancer", max_paras=10000):
    wikipedia.set_lang("en")
    search_results = wikipedia.search(topic)
    wiki_texts = []
    for result in search_results[:5000]:
        try:
            page = wikipedia.page(result)
            paras = page.content.split("\n")
            for p in paras:
                p_clean = clean_text(p)
                if len(p_clean.split()) > 10:
                    wiki_texts.append({
                        "question": f"What does the following text explain about {result}?",
                        "context": p_clean,
                        "answer": p_clean
                    })
                    if len(wiki_texts) >= max_paras:
                        break
        except:
            pass
    return wiki_texts

wiki_qas = scrape_wikipedia(max_paras=50000)
print(f"Wikipedia examples: {len(wiki_qas)}")

Wikipedia examples: 454


In [15]:
def scrape_cleveland():
    url = "https://my.clevelandclinic.org/health/diseases/12194-cancer"
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "html.parser")
    paras = soup.find_all("p")
    texts = []
    for p in paras:
        t = clean_text(p.get_text())
        if len(t.split()) > 10:
            texts.append({
                "question": "What does this text explain about cancer?",
                "context": t,
                "answer": t
            })
    return texts

cleveland_qas = scrape_cleveland()
print(f"Cleveland Clinic examples: {len(cleveland_qas)}")

Cleveland Clinic examples: 39


In [8]:
def scrape_mayo():
    url = "https://www.mayoclinic.org/diseases-conditions/cancer/symptoms-causes/syc-20370588"
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "html.parser")
    paras = soup.find_all("p")
    texts = []
    for p in paras:
        t = clean_text(p.get_text())
        if len(t.split()) > 10:
            texts.append({
                "question": "What does this text explain about cancer?",
                "context": t,
                "answer": t
            })
    return texts

mayo_qas = scrape_mayo()
print(f"✅ Mayo Clinic examples: {len(mayo_qas)}")

✅ Mayo Clinic examples: 25


In [17]:
def scrape_webmd():
    url = "https://www.webmd.com/cancer/default.htm"
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "html.parser")
    paras = soup.find_all("p")
    texts = []
    for p in paras:
        t = clean_text(p.get_text())
        if len(t.split()) > 10:
            texts.append({
                "question": "What does this text explain about cancer?",
                "context": t,
                "answer": t
            })
    return texts

webmd_qas = scrape_webmd()
print(f"WebMD examples: {len(webmd_qas)}")

WebMD examples: 46


In [18]:
all_qas = pubmed_qas + nci_qas + wiki_qas + cleveland_qas + mayo_qas + webmd_qas
print(f" Total combined Q&A examples: {len(all_qas)}")

 Total combined Q&A examples: 686


In [22]:
!pip install transformers tqdm sentencepiece --quiet

from transformers import pipeline
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [20]:
# Initialize T5 paraphraser / question generator
qgen = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws")

augmented_qas = []

# Limit number per source if dataset is huge (optional)
sample_limit = None  # or set e.g., 500 for testing

qas_to_use = all_qas if sample_limit is None else all_qas[:sample_limit]

print("🔹 Generating questions using T5...")
for item in tqdm(qas_to_use):
    context = item['context']
    # Generate a new question based on the context
    try:
        new_q = qgen(f"generate question: {context}", max_length=64)[0]['generated_text']
        augmented_qas.append({
            "question": new_q,
            "context": context,
            "answer": item['answer']
        })
    except:
        # fallback if generation fails
        augmented_qas.append(item)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0


🔹 Generating questions using T5...


  0%|          | 0/686 [00:00<?, ?it/s]Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  0%|          | 1/686 [00:03<40:31,  3.55s/it]Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  0%|          | 2/686 [00:04<25:17,  2.22s/it]Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  0%|          | 3/686 [00:06<21:39,  1.90s/it]Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set.

In [21]:
final_qas = all_qas + augmented_qas
print(f" Total Q&A examples after T5 generation: {len(final_qas)}")

 Total Q&A examples after T5 generation: 1372


In [23]:
import random
random.shuffle(final_qas)

# Split 90% train, 10% validation
train_qas, val_qas = train_test_split(final_qas, test_size=0.1, random_state=42)

print(f"✅ Train examples: {len(train_qas)}, Validation examples: {len(val_qas)}")


✅ Train examples: 1234, Validation examples: 138


In [24]:
def save_jsonl(dataset, filename):
    import json
    with open(filename, "w", encoding="utf-8") as f:
        for item in dataset:
            json_line = {
                "instruction": item["question"],
                "input": "",
                "output": item["answer"]
            }
            f.write(json.dumps(json_line, ensure_ascii=False) + "\n")

# Save train and validation separately
save_jsonl(train_qas, "cancer_train.jsonl")
save_jsonl(val_qas, "cancer_val.jsonl")