In [25]:
import time
import math
from pathlib import Path
import requests
import pandas as pd
import datasets
from tqdm.notebook import tqdm
from huggingface_hub import notebook_login


# Define the URL for fetching issues from the Hugging Face transformers repository
url = "https://api.github.com/repos/huggingface/transformers/issues?page=1&per_page=1"

# Send a GET request to the specified URL and store the response
response = requests.get(url)

In [26]:
# Get the JSON response from the API to see what the response looks like
response.json()

[{'url': 'https://api.github.com/repos/huggingface/transformers/issues/29225',
  'repository_url': 'https://api.github.com/repos/huggingface/transformers',
  'labels_url': 'https://api.github.com/repos/huggingface/transformers/issues/29225/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/transformers/issues/29225/comments',
  'events_url': 'https://api.github.com/repos/huggingface/transformers/issues/29225/events',
  'html_url': 'https://github.com/huggingface/transformers/pull/29225',
  'id': 2150031292,
  'node_id': 'PR_kwDOCUB6oc5nsvBf',
  'number': 29225,
  'title': 'Fix `kwargs` handling in `generate_with_fallback`',
  'user': {'login': 'cifkao',
   'id': 8046580,
   'node_id': 'MDQ6VXNlcjgwNDY1ODA=',
   'avatar_url': 'https://avatars.githubusercontent.com/u/8046580?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/cifkao',
   'html_url': 'https://github.com/cifkao',
   'followers_url': 'https://api.github.com/users/cifkao/followers',
 

In [27]:
# GitHub token for authorization
GITHUB_TOKEN = ''  # Copy your GitHub token here

# Headers for API request
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [28]:
# create a function that can download all the issues from a GitHub repository.
# 5,000 requests per hour is the GitHub rate limit for users

def fetch_issues(
    owner="huggingface",
    repo="transformers",
    num_issues=30000,
    rate_limit=5000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = (
        pd.DataFrame.from_records(all_issues)
        .dropna(axis=1, how="all")
    )
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Depending on your internet connection and the size of the dataset, this can take several minutes to hours to run...
fetch_issues()

In [30]:
# Once the issues are downloaded we can load them locally

issues_dataset = datasets.load_dataset(
    "json", data_files="transformers-issues.jsonl", split="train"
)
print(issues_dataset)

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'state_reason', 'draft', 'pull_request'],
    num_rows: 28908
})


In [31]:
sample = issues_dataset.shuffle(seed=777).select(range(3))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/transformers/issues/28316
>> Pull request: None

>> URL: https://github.com/huggingface/transformers/pull/5941
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/transformers/pulls/5941', 'html_url': 'https://github.com/huggingface/transformers/pull/5941', 'diff_url': 'https://github.com/huggingface/transformers/pull/5941.diff', 'patch_url': 'https://github.com/huggingface/transformers/pull/5941.patch', 'merged_at': None}

>> URL: https://github.com/huggingface/transformers/pull/17329
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/transformers/pulls/17329', 'html_url': 'https://github.com/huggingface/transformers/pull/17329', 'diff_url': 'https://github.com/huggingface/transformers/pull/17329.diff', 'patch_url': 'https://github.com/huggingface/transformers/pull/17329.patch', 'merged_at': datetime.datetime(2022, 5, 18, 17, 7, 49)}



In [32]:
# Map the dataset to add a new key-value pair indicating whether the issue is a pull request
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

In [33]:
# Filter the dataset to include only closed issues that are not pull requests
filtered_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and x["state"] == "closed")
)
# Print the filtered dataset
filtered_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 13320
})

In [34]:
# Shuffle the dataset with a specific seed and select a range of columns
sample = filtered_dataset.shuffle(seed=777).select(range(3))

# Print out a sample of the the created_at and closed_at entries
for created, closed in zip(
    sample["created_at"], sample["closed_at"]
):
    print(f">> created_at: {created}")
    print(f">> closed_at: {closed}\n")

>> created_at: 2023-07-26 14:42:05
>> closed_at: 2023-08-01 09:53:30

>> created_at: 2023-01-13 14:12:37
>> closed_at: 2023-01-16 10:00:32

>> created_at: 2020-04-17 21:35:44
>> closed_at: 2020-08-27 05:17:28



In [35]:
# Map the dataset to calculate the time taken to close
filtered_dataset = filtered_dataset.map(
    lambda x: {"time_to_close": x["closed_at"] -x["created_at"]}
)

# Shuffle the dataset and select a sample of 3
sample = filtered_dataset.shuffle(seed=777).select(range(3))

# Iterate through the sample and print the created_at, closed_at, and time_to_close
for created, closed, time_to_close in zip(
    sample["created_at"], sample["closed_at"], sample["time_to_close"]
):
    print(f">> created_at: {created}")
    print(f">> closed_at: {closed}")
    print(f">> time_to_close: {time_to_close}\n")

>> created_at: 2023-07-26 14:42:05
>> closed_at: 2023-08-01 09:53:30
>> time_to_close: 5 days, 19:11:25

>> created_at: 2023-01-13 14:12:37
>> closed_at: 2023-01-16 10:00:32
>> time_to_close: 2 days, 19:47:55

>> created_at: 2020-04-17 21:35:44
>> closed_at: 2020-08-27 05:17:28
>> time_to_close: 131 days, 7:41:44



In [36]:
# Calculate the mean of the "time_to_close" column in the filtered dataset
mean = filtered_dataset.with_format("pandas")["time_to_close"].mean()

# Print the calculated mean
print(f'Average time to close issues in Transformers library that are not pull requests: {mean}')

Average time to close issues in Transformers library that are not pull requests: 38 days 20:03:59.734009


In [37]:
# Filter for closed pull requests
pr_filtered_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == True and x["state"] == "closed")
)
# Calculate time taken to close pull requests
pr_filtered_dataset = pr_filtered_dataset.map(
    lambda x: {"time_to_close_pr": x["closed_at"] - x["created_at"]}
)
# Select a sample of 3 records and shuffle
sample = pr_filtered_dataset.shuffle(seed=777).select(range(3))

# Print information for each record in the sample
for created, closed, time_to_close_pr in zip(
    sample["created_at"], sample["closed_at"], sample["time_to_close_pr"]
):
    print(f">> created_at: {created}")
    print(f">> closed_at: {closed}")
    print(f">> time_to_close_pr: {time_to_close_pr}\n")

# Calculate mean time to close pull requests
mean = pr_filtered_dataset.with_format("pandas")["time_to_close_pr"].mean()

# Print the mean time to close pull requests
print(f"Average time to close Pull Requests in Transformers library: {mean}")

>> created_at: 2021-07-15 13:18:53
>> closed_at: 2021-07-15 15:40:17
>> time_to_close_pr: 2:21:24

>> created_at: 2021-06-09 17:19:10
>> closed_at: 2021-06-09 17:49:27
>> time_to_close_pr: 0:30:17

>> created_at: 2019-12-10 13:53:28
>> closed_at: 2019-12-10 14:13:34
>> time_to_close_pr: 0:20:06

Average time to close Pull Requests in Transformers library: 10 days 17:56:28.009275


In [38]:
# Augmenting the dataset with issue comments  
# 5,000 requests per hour is the GitHub rate limit for users

def get_comments(issue_number):
    url = (f"https://api.github.com/repos/huggingface/transformers/issues/{issue_number}/comments")
    try:
        response = requests.get(url, headers=headers)
        return [r["body"] for r in response.json()]
    except:
        print(f"Error generating comments for issue {issue_number}.\nResponse:\n {response.text}")
        print(f"Reached GitHub rate limit. Sleeping for one hour ...")
        time.sleep(60 * 60 + 1)
        return []


In [None]:
# Mapping the issues dataset to include comments for each issue
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments((x["number"]))}
)

In [None]:
# Convert the 'issues_with_comments_dataset' DataFrame to JSON format and save it to a file
# with each record as a separate line
issues_with_comments_dataset.to_json(
    "transformers-issues-with-comments.jsonl", orient="records", lines=True
)

In [39]:
# Test loading the dataset from the json file
transformers_issues_w_comments = datasets.load_dataset(
    "json", data_files="transformerst-issues-with-comments.jsonl", split="train"
)

print(issues_dataset)

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 28908
})


In [None]:
# Log in to the Hugging Face to push dataset to hub
from huggingface_hub import notebook_login

notebook_login()

In [24]:
# Push the dataset to the Hugging Face hub
transformers_issues_w_comments.push_to_hub(
    "DanielPFlorian/Transformers-Github-Issues"
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DanielPFlorian/Transformers-Github-Issues/commit/883eacd7d4253bf6bd815e0471097ae474bc1900', commit_message='Upload dataset', commit_description='', oid='883eacd7d4253bf6bd815e0471097ae474bc1900', pr_url=None, pr_revision=None, pr_num=None)

In [40]:
# Test loading the dataset from the Hugging Face hub
remote_dataset = datasets.load_dataset("DanielPFlorian/transformers-github-issues", split="train")
remote_dataset

Downloading readme:   0%|          | 0.00/4.63k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/49.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28908 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'state_reason', 'draft', 'pull_request'],
    num_rows: 28908
})