In [17]:
!pip install datasets arxiv stackapi bs4 nltk transformers wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=ebb8ea12136b7a490c82885e504d32b153cbe6132ccf32054d6be2b24e2693c2
  Stored in directory: /root/.cache/pip/wheels/63/47/7c/a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [18]:
import json
import random
import re
from typing import List, Dict
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import arxiv
from stackapi import StackAPI
import requests
from bs4 import BeautifulSoup
import wikipedia

In [19]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [20]:
def clean_text(text: str) -> str:
    """Clean and normalize text"""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    return text.strip()

def chunk_text(text: str, max_length: int = 1000) -> List[str]:
    """Split text into chunks of approximately max_length characters"""
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sent in sentences:
        sent_len = len(sent)
        if current_length + sent_len > max_length and current_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sent]
            current_length = sent_len
        else:
            current_chunk.append(sent)
            current_length += sent_len

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def create_qa_pairs(text: str, context: str = "", source: str = "") -> List[Dict]:
    """Generate QA pairs from text using various templates"""
    chunks = chunk_text(text, max_length=800)
    qa_pairs = []

    templates = [
        "Explain the following concept: ",
        "What does this mean: ",
        "Can you summarize this: ",
        "Provide more details about: ",
        "What is the main idea of: "
    ]

    for chunk in chunks:
        if len(chunk) < 100:  # Skip very short chunks
            continue

        # Extract first sentence as potential topic
        first_sent = sent_tokenize(chunk)[0] if sent_tokenize(chunk) else chunk[:100]

        qa_pairs.append({
            "instruction": random.choice(templates) + first_sent[:100],
            "input": "",
            "output": chunk,
            "source": source,
            "context": context
        })

    return qa_pairs

In [28]:
def fetch_common_crawl(num_samples: int = 1000) -> List[Dict]:
    """Fetch data from Common Crawl via HuggingFace datasets"""
    print(f"Fetching {num_samples} samples from Common Crawl...")
    qa_pairs = []

    try:
        # Using alternative cleaned web datasets
        # Option 1: Try RedPajama
        try:
            dataset = load_dataset("togethercomputer/RedPajama-Data-1T-Sample", split="train", streaming=True)
            count = 0
            for item in dataset:
                if count >= num_samples:
                    break

                text = clean_text(item['text'])
                if len(text) > 200:
                    pairs = create_qa_pairs(
                        text[:2000],
                        context="web_content",
                        source="common_crawl"
                    )
                    qa_pairs.extend(pairs)
                    count += 1

                if count % 100 == 0:
                    print(f"  Processed {count}/{num_samples} Common Crawl samples")

            print(f"  Generated {len(qa_pairs)} QA pairs from Common Crawl (RedPajama)")
            return qa_pairs

        except:
            # Option 2: Try Wikipedia dataset as fallback
            print("  RedPajama unavailable, using Wikipedia dataset as web content alternative...")
            dataset = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)
            count = 0
            for item in dataset:
                if count >= num_samples:
                    break

                text = clean_text(item['text'])
                if len(text) > 200:
                    pairs = create_qa_pairs(
                        text[:2000],
                        context="encyclopedic_content",
                        source="common_crawl_alt"
                    )
                    qa_pairs.extend(pairs)
                    count += 1

                if count % 100 == 0:
                    print(f"  Processed {count}/{num_samples} samples")

    except Exception as e:
        print(f"  Error fetching Common Crawl alternatives: {e}")
        print("  Skipping Common Crawl - other sources will compensate")

    print(f"  Generated {len(qa_pairs)} QA pairs from web sources")
    return qa_pairs

In [29]:
def fetch_stack_exchange(num_questions: int = 500) -> List[Dict]:
    """Fetch Q&A from Stack Exchange using HuggingFace dataset (no rate limits)"""
    print(f"Fetching {num_questions} questions from Stack Exchange...")
    qa_pairs = []

    try:
        # Use a working Stack Overflow dataset
        dataset = load_dataset("koutch/stackoverflow_python", split="train", streaming=True)

        count = 0
        items_checked = 0
        max_items_to_check = num_questions * 5  # Check at most 5x the target to avoid infinite loops

        for item in dataset:
            items_checked += 1

            if count >= num_questions or items_checked >= max_items_to_check:
                break

            try:
                question = item.get('question', '')
                answer = item.get('answer', '')

                if len(question) > 50 and len(answer) > 100:
                    qa_pairs.append({
                        "instruction": clean_text(question)[:500],
                        "input": "",
                        "output": clean_text(answer)[:1000],
                        "source": "stack_overflow",
                        "context": "programming_qa"
                    })
                    count += 1

                if count % 100 == 0 and count > 0:
                    print(f"  Processed {count}/{num_questions} Stack Overflow QA pairs")

            except Exception as e:
                continue

        print(f"  Checked {items_checked} items, collected {count} valid QA pairs")

    except Exception as e:
        print(f"  Error with Stack Overflow dataset: {e}")
        print(f"  Skipping Stack Exchange - will rely on other sources")

    print(f"  Total Stack Exchange QA pairs: {len(qa_pairs)}")
    return qa_pairs

In [30]:
def fetch_arxiv(num_papers: int = 200) -> List[Dict]:
    """Fetch papers from arXiv using the new Client API"""
    print(f"Fetching {num_papers} papers from arXiv...")
    qa_pairs = []

    # Diverse search queries for general knowledge
    queries = [
        "machine learning",
        "computer science theory",
        "mathematics",
        "physics",
        "artificial intelligence",
        "machinen learnn=ing algorithms",
        "genAI"
    ]

    papers_per_query = num_papers // len(queries)

    # Use the new Client API
    client = arxiv.Client()

    for query in queries:
        try:
            search = arxiv.Search(
                query=query,
                max_results=papers_per_query,
                sort_by=arxiv.SortCriterion.Relevance
            )

            for result in client.results(search):
                title = clean_text(result.title)
                abstract = clean_text(result.summary)

                if len(abstract) > 200:
                    # Create QA pairs from abstract
                    qa_pairs.append({
                        "instruction": f"Explain the research paper: {title}",
                        "input": "",
                        "output": abstract,
                        "source": "arxiv",
                        "context": f"category_{result.primary_category}"
                    })

                    # Create summary QA
                    qa_pairs.append({
                        "instruction": f"What is the main contribution of this paper: {title}",
                        "input": abstract[:300],
                        "output": abstract[300:600] if len(abstract) > 300 else abstract,
                        "source": "arxiv",
                        "context": f"category_{result.primary_category}"
                    })

        except Exception as e:
            print(f"  Error with query '{query}': {e}")

    print(f"  Generated {len(qa_pairs)} QA pairs from arXiv")
    return qa_pairs

In [31]:
def fetch_wikipedia(num_articles: int = 300) -> List[Dict]:
    """Fetch articles from Wikipedia"""
    print(f"Fetching {num_articles} articles from Wikipedia...")
    qa_pairs = []

    # Diverse categories for general knowledge
    categories = [
        # Science & Technology
        "Artificial intelligence", "Computer science", "Physics", "Biology",
        "Chemistry", "Mathematics", "Engineering", "Medicine",
        # History & Geography
        "World history", "Ancient history", "Geography", "Countries",
        # Arts & Culture
        "Literature", "Music", "Art", "Philosophy", "Religion",
        # Social Sciences
        "Psychology", "Economics", "Sociology", "Political science",
        # General
        "Science", "Technology", "History", "Culture"
    ]

    articles_per_category = num_articles // len(categories)
    processed_titles = set()  # Avoid duplicates

    for category in categories:
        try:
            # Search for articles related to the category
            search_results = wikipedia.search(category, results=articles_per_category + 10)

            count = 0
            for title in search_results:
                if count >= articles_per_category:
                    break

                if title in processed_titles:
                    continue

                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    content = clean_text(page.content)
                    summary = clean_text(page.summary)

                    processed_titles.add(title)

                    # Create QA from summary
                    if len(summary) > 200:
                        qa_pairs.append({
                            "instruction": f"What is {title}?",
                            "input": "",
                            "output": summary,
                            "source": "wikipedia",
                            "context": f"category_{category.replace(' ', '_')}"
                        })

                        qa_pairs.append({
                            "instruction": f"Explain {title} in detail.",
                            "input": "",
                            "output": summary,
                            "source": "wikipedia",
                            "context": f"category_{category.replace(' ', '_')}"
                        })

                    # Create QA from content sections
                    # Split content into paragraphs
                    paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 300]

                    # Take a few paragraphs from the article
                    for i, para in enumerate(paragraphs[:3]):
                        if len(para) > 200:
                            # Extract first sentence as potential question topic
                            first_sent = sent_tokenize(para)[0] if sent_tokenize(para) else para[:100]

                            qa_pairs.append({
                                "instruction": f"Tell me about {first_sent[:80]}",
                                "input": "",
                                "output": para[:1000],  # Limit length
                                "source": "wikipedia",
                                "context": f"{title}_section_{i}"
                            })

                    count += 1

                except wikipedia.exceptions.DisambiguationError as e:
                    # If disambiguation page, try first option
                    try:
                        if e.options:
                            page = wikipedia.page(e.options[0], auto_suggest=False)
                            summary = clean_text(page.summary)
                            if len(summary) > 200:
                                qa_pairs.append({
                                    "instruction": f"What is {e.options[0]}?",
                                    "input": "",
                                    "output": summary,
                                    "source": "wikipedia",
                                    "context": f"category_{category.replace(' ', '_')}"
                                })
                                count += 1
                    except:
                        continue

                except wikipedia.exceptions.PageError:
                    continue

                except Exception as e:
                    continue

            print(f"  Processed {count} articles from category '{category}'")

        except Exception as e:
            print(f"  Error with category '{category}': {e}")

    print(f"  Generated {len(qa_pairs)} QA pairs from Wikipedia")
    return qa_pairs

In [32]:
def main():
    print("=" * 60)
    print("GENERALIZED DATASET CREATION FOR LLM TRAINING")
    print("=" * 60)

    all_qa = []


    # 1. Common Crawl / Web Content (using alternative datasets)
    cc_data = fetch_common_crawl(num_samples=300)
    all_qa.extend(cc_data)

    # 2. Stack Exchange (using HuggingFace datasets to avoid rate limits)
    se_data = fetch_stack_exchange(num_questions=800)
    all_qa.extend(se_data)

    # 3. arXiv (academic papers)
    arxiv_data = fetch_arxiv(num_papers=900)
    all_qa.extend(arxiv_data)

    # 4. Wikipedia (encyclopedic knowledge)
    wiki_data = fetch_wikipedia(num_articles=300)
    all_qa.extend(wiki_data)


    print("\n" + "=" * 60)
    print(f"TOTAL QA PAIRS COLLECTED: {len(all_qa)}")
    print("=" * 60)

    # Distribution stats
    sources = {}
    for qa in all_qa:
        src = qa['source']
        sources[src] = sources.get(src, 0) + 1

    print("\nData distribution by source:")
    for src, count in sorted(sources.items()):
        print(f"  {src}: {count} pairs ({count/len(all_qa)*100:.1f}%)")

    # Split into train/val/test
    train, test = train_test_split(all_qa, test_size=0.1, random_state=42)
    train, val = train_test_split(train, test_size=0.1, random_state=42)

    print(f"\nSplit sizes:")
    print(f"  Train: {len(train)} ({len(train)/len(all_qa)*100:.1f}%)")
    print(f"  Validation: {len(val)} ({len(val)/len(all_qa)*100:.1f}%)")
    print(f"  Test: {len(test)} ({len(test)/len(all_qa)*100:.1f}%)")

    # Save datasets
    with open("general_train.json", "w", encoding="utf-8") as f:
        json.dump(train, f, indent=2, ensure_ascii=False)

    with open("general_val.json", "w", encoding="utf-8") as f:
        json.dump(val, f, indent=2, ensure_ascii=False)

    with open("general_test.json", "w", encoding="utf-8") as f:
        json.dump(test, f, indent=2, ensure_ascii=False)

    print("\n✓ Saved: general_train.json, general_val.json, general_test.json")

    # Show sample
    print("\n" + "=" * 60)
    print("SAMPLE QA PAIRS:")
    print("=" * 60)
    for qa in random.sample(all_qa, min(5, len(all_qa))):
        print(f"\nSource: {qa['source']}")
        print(f"Context: {qa['context']}")
        print(f"Q: {qa['instruction'][:100]}...")
        print(f"A: {qa['output'][:200]}...")
        print("-" * 60)

if __name__ == "__main__":
    main()

GENERALIZED DATASET CREATION FOR LLM TRAINING
Fetching 300 samples from Common Crawl...
  RedPajama unavailable, using Wikipedia dataset as web content alternative...
  Error fetching Common Crawl alternatives: Dataset scripts are no longer supported, but found wikipedia.py
  Skipping Common Crawl - other sources will compensate
  Generated 0 QA pairs from web sources
Fetching 800 questions from Stack Exchange...


README.md: 0.00B [00:00, ?B/s]

  Checked 4000 items, collected 0 valid QA pairs
  Total Stack Exchange QA pairs: 0
Fetching 900 papers from arXiv...
  Generated 1748 QA pairs from arXiv
Fetching 300 articles from Wikipedia...
  Processed 12 articles from category 'Artificial intelligence'




  lis = BeautifulSoup(html).find_all('li')


  Processed 12 articles from category 'Computer science'
  Processed 12 articles from category 'Physics'
  Processed 12 articles from category 'Biology'
  Processed 12 articles from category 'Chemistry'
  Processed 12 articles from category 'Mathematics'
  Processed 12 articles from category 'Engineering'
  Processed 12 articles from category 'Medicine'
  Processed 12 articles from category 'World history'
  Processed 12 articles from category 'Ancient history'
  Processed 12 articles from category 'Geography'
  Processed 12 articles from category 'Countries'
  Processed 12 articles from category 'Literature'
  Processed 12 articles from category 'Music'
  Processed 12 articles from category 'Art'
  Processed 12 articles from category 'Philosophy'
  Processed 12 articles from category 'Religion'
  Processed 12 articles from category 'Psychology'
  Processed 12 articles from category 'Economics'
  Processed 12 articles from category 'Sociology'
  Processed 12 articles from category 'Pol