# Creating your own dataset

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

This notebook demonstrates how to create your own dataset by collecting data from APIs, processing it, and sharing it on the Hugging Face Hub.

In [None]:
# Install required packages for dataset creation and Git LFS
# Git LFS is needed for uploading large files to the Hub
!uv pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

You will need to set up Git: adapt the email and name in the following cell to your own.

**Important:** Configure Git with your actual email and name before proceeding.

In [None]:
# Set the Git identity in the user-level (--global) Git configuration.
# Replace the placeholder email and name below with your own before running.
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

**Required:** You must authenticate with Hugging Face Hub to upload datasets.

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# notebook_login() is interactive: enter your Hub credentials when prompted.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Install the requests library.
# It is used in the following cells to call the GitHub REST API.
!uv pip install requests

In [None]:
# Make a first call to GitHub's REST API for the huggingface/datasets repository.
# page=1&per_page=1 requests the first page with a single entry, which keeps
# the response small enough to inspect by eye.
import requests

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [None]:
# Check whether the API call was successful.
# Status code 200 means the request succeeded; any other value should be
# looked into before continuing.
response.status_code

In [None]:
# Display the decoded JSON body of the response.
# This shows which fields GitHub returns for an issue.
response.json()

In [None]:
# Set up GitHub authentication for higher rate limits.
# The token is read from the GITHUB_TOKEN environment variable so the secret
# never has to be typed into (or saved with) the notebook.
import os

GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
if not GITHUB_TOKEN:
    print("GITHUB_TOKEN is not set: GitHub API requests will be unauthenticated.")

# Only send an Authorization header when there is a token to put in it;
# without a token the requests simply go out unauthenticated.
headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

In [None]:
# Function to fetch all issues from a GitHub repository
# This handles pagination and rate limiting automatically
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    """Download the issues of a GitHub repository into a JSON Lines file.

    Pages of 100 entries are requested with ``state=all`` (open and closed)
    until enough pages for ``num_issues`` have been fetched or GitHub returns
    an empty page. The result is written to ``{issues_path}/{repo}-issues.jsonl``.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name; also used as the output file prefix.
        num_issues: Upper bound on the entries to fetch, rounded up to whole
            pages of 100.
        rate_limit: Number of requests to send before pausing for an hour.
        issues_path: Directory for the output file; created if missing.

    Raises:
        requests.HTTPError: If GitHub answers a request with an error status.

    Note:
        Requests are sent with the module-level ``headers`` dictionary.
    """
    issues_path = Path(issues_path)
    issues_path.mkdir(parents=True, exist_ok=True)

    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"
    requests_sent = 0

    # Page numbering starts at 1 (the single-issue request earlier in this
    # notebook uses page=1), so iterate 1..num_pages rather than 0..num_pages-1.
    for page in tqdm(range(1, num_pages + 1)):
        # The limit is expressed in requests, so count requests (not issues)
        # and pause only when another request is actually about to be sent.
        if rate_limit and requests_sent and requests_sent % rate_limit == 0:
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        response = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        requests_sent += 1
        # Fail loudly instead of mixing an error payload into the issue list.
        response.raise_for_status()

        page_issues = response.json()
        if not page_issues:
            # An empty page means the repository has no further issues.
            break
        all_issues.extend(page_issues)

    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Run the download with the function's default arguments
# (owner "huggingface", repo "datasets").
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

In [None]:
# Load the collected issues into a Hugging Face Dataset.
# load_dataset comes from the datasets library installed at the top of the
# notebook; the "json" loader reads the JSON Lines file written by
# fetch_issues() with its default arguments.
from datasets import load_dataset

issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

In [None]:
# Look at a few random rows to see how pull requests differ from issues:
# the GitHub issues endpoint returns both kinds of entry.
shuffled = issues_dataset.shuffle(seed=666)
sample = shuffled.select(range(3))

# Show each sampled entry's URL next to its pull request field
urls, pull_requests = sample["html_url"], sample["pull_request"]
for url, pr in zip(urls, pull_requests):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

In [None]:
# Flag every row as either a pull request or a plain issue.
# A row is a pull request exactly when its "pull_request" field is not None;
# the resulting boolean column makes filtering easier.
issues_dataset = issues_dataset.map(
    lambda issue: {"is_pull_request": issue["pull_request"] is not None}
)

In [None]:
# Fetch the comments of one specific issue.
# Comments are not part of the issue records downloaded above; they come from
# a separate per-issue endpoint, queried here with the same auth headers.
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

In [None]:
# Create a function to extract the comment text of an issue.
# It handles the API calls and keeps only each comment's body.
def get_comments(issue_number):
    """Return the body text of every comment on a huggingface/datasets issue.

    Args:
        issue_number: Number of the issue (or pull request) to look up.

    Returns:
        A list of comment bodies in the order GitHub returns them; empty when
        the issue has no comments.

    Raises:
        requests.HTTPError: If GitHub answers a request with an error status.
    """
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    # Ask for large pages so that most issues need a single request.
    params = {"per_page": 100}
    bodies = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        # Surface API errors directly rather than failing later while
        # indexing into an error payload.
        response.raise_for_status()
        bodies.extend(comment["body"] for comment in response.json())
        # Follow the "next" link, if any, so long threads are not truncated.
        url = response.links.get("next", {}).get("url")
        # The "next" URL already carries its own query string.
        params = None
    return bodies

# Test our function works as expected
get_comments(2792)

In [None]:
# Attach a "comments" column holding each issue's comments.
# get_comments() is called once per row with that row's issue number, so
# depending on your internet connection this can take a few minutes...
issues_with_comments_dataset = issues_dataset.map(
    lambda issue: {"comments": get_comments(issue["number"])}
)

In [None]:
# Log in to the Hugging Face Hub again before uploading (if needed).
# This repeats the login step from the start of the notebook, in case the
# session is no longer authenticated after the long-running cells above.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Upload the dataset (issues plus their comments) to the Hugging Face Hub
# under the repository name "github-issues", so that it can be shared.
# Requires the Hub login from the previous cell.
issues_with_comments_dataset.push_to_hub("github-issues")

In [None]:
# Load a dataset back from the Hub, the way other users would access it.
# NOTE(review): the repo id is hard-coded to the "lewtun" namespace, whereas the
# previous cell pushes "github-issues" without a namespace. To verify your own
# upload, presumably replace "lewtun" with your Hub username - confirm.
remote_dataset = load_dataset("lewtun/github-issues", split="train")
remote_dataset