In [19]:
import os 
import numpy as np 
import pandas as pd 
import requests 
import sys
from dotenv import load_dotenv
import json

In [20]:
# Directory that contains issues_train.csv; the cells below also write their JSON output here.
data_path = "nlbse"

# Read the training CSV into a DataFrame.
train_csv = os.path.join(data_path, "issues_train.csv")
df = pd.read_csv(train_csv)

# Show the distinct values of the "repo" column.
repo_names = df["repo"].unique()
print(repo_names)



['facebook/react' 'tensorflow/tensorflow' 'microsoft/vscode'
 'bitcoin/bitcoin' 'opencv/opencv']


In [21]:
# Split the training frame into one sub-frame per repository,
# selecting rows by exact match on the "repo" column.
react = df.loc[df["repo"].eq("facebook/react")]
tf = df.loc[df["repo"].eq("tensorflow/tensorflow")]
vscode = df.loc[df["repo"].eq("microsoft/vscode")]
btc = df.loc[df["repo"].eq("bitcoin/bitcoin")]
cv = df.loc[df["repo"].eq("opencv/opencv")]

In [22]:
def get_github_token():
    """Return the GitHub API token stored in the ``GITHUB_TOKEN`` environment variable.

    ``load_dotenv()`` runs first, so the variable may also come from a ``.env``
    file picked up by python-dotenv.

    Returns:
        str: the token with surrounding whitespace removed.

    Raises:
        SystemExit: with exit code 1 when the variable is unset, empty or
            contains only whitespace.
    """
    load_dotenv()
    # Strip so a stray newline/space from a .env file cannot end up in the
    # Authorization header.
    token = (os.getenv("GITHUB_TOKEN") or "").strip()
    # A blank value is as unusable as a missing one: it would produce an
    # empty "Bearer " credential and only fail later at request time.
    if not token:
        print("GITHUB_TOKEN not found in environment variables.")
        sys.exit(1)
    return token

In [23]:
def fetch_issues_from_nlbse(repo_full_name, token, per_page=100, max_issues=500, created_at=None, title=None):
    """Fetch GitHub issues of one repository, optionally filtered by creation time and title.

    The request goes to GitHub's issue *search* endpoint. The plain
    ``/repos/{repo}/issues`` listing does not document a ``created`` query
    parameter, so filtering by creation date has to be expressed as a
    ``created:`` search qualifier instead.

    Args:
        repo_full_name: ``"owner/name"`` of the repository.
        token: GitHub token, sent as a Bearer credential.
        per_page: requested page size; clamped to the range 1..100.
        max_issues: upper bound on the number of issues returned.
        created_at: value for the ``created:`` qualifier (a date, an ISO 8601
            timestamp, or a range such as ``2021-01-01..2021-01-31``).
            Objects exposing ``isoformat()`` are converted with it; in strings
            a space between date and time is replaced by ``T``. ``None``
            disables the filter.
        title: text that must appear in the issue title. ``None`` or an empty
            string disables the filter.

    Returns:
        list: issue dicts as delivered by GitHub, sorted by creation time
        (newest first), at most ``max_issues`` long. When a request returns a
        non-200 status, the status is printed and the issues collected so far
        are returned (``[]`` if the first request fails).

    Raises:
        requests.RequestException: on connection errors or when a request
            exceeds the 30 second timeout.
    """
    url = "https://api.github.com/search/issues"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
        "User-Agent": "simple-github-issues-script",
    }

    # Build the search query from qualifiers; "is:issue" keeps pull requests out.
    qualifiers = [f"repo:{repo_full_name}", "is:issue"]
    if created_at is not None:
        if hasattr(created_at, "isoformat"):
            created = created_at.isoformat()
        else:
            # A space would split the qualifier into two search terms.
            created = str(created_at).strip().replace(" ", "T")
        # NOTE(review): timestamps without a UTC offset are passed through
        # unchanged - confirm GitHub interprets them as intended for this data.
        qualifiers.append(f"created:{created}")
    if title:
        # Embedded double quotes would terminate the quoted phrase early.
        phrase = " ".join(str(title).replace('"', " ").split())
        if phrase:
            qualifiers.append(f'in:title "{phrase}"')
    query = " ".join(qualifiers)

    page_size = max(1, min(int(per_page), 100))
    all_issues = []
    page = 1
    while len(all_issues) < max_issues:
        params = {
            "q": query,
            "sort": "created",
            "order": "desc",
            "per_page": page_size,
            "page": page,
        }
        # The timeout keeps one stalled connection from blocking a long run forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch issues: {response.status_code}")
            break
        items = response.json().get("items", [])
        all_issues.extend(items)
        # A short (or empty) page means there is nothing left to fetch.
        if len(items) < page_size:
            break
        page += 1
    return all_issues[:max_issues]

In [24]:
def _fetch_and_save_repo_issues(frame, repo_full_name, output_name, token):
    """Fetch GitHub issues for every row of ``frame`` and write them to a JSON-lines file.

    Args:
        frame: DataFrame with a ``created_at`` column; one fetch is made per row.
        repo_full_name: ``"owner/name"`` of the repository to query.
        output_name: file name of the output, created inside ``data_path``.
        token: GitHub token passed on to ``fetch_issues_from_nlbse``.

    Returns:
        The result of ``frame.apply``: one list of fetched issues per row.
    """
    fetched = frame.apply(
        lambda row: fetch_issues_from_nlbse(repo_full_name, token, created_at=row["created_at"]),
        axis=1,
    )
    fetched.to_json(os.path.join(data_path, output_name), orient="records", lines=True)
    return fetched


# Resolve the token once up front instead of once per row.
github_token = get_github_token()

react_issues = _fetch_and_save_repo_issues(react, "facebook/react", "react_issues.json", github_token)
tf_issues = _fetch_and_save_repo_issues(tf, "tensorflow/tensorflow", "tf_issues.json", github_token)
vscode_issues = _fetch_and_save_repo_issues(vscode, "microsoft/vscode", "vscode_issues.json", github_token)
btc_issues = _fetch_and_save_repo_issues(btc, "bitcoin/bitcoin", "btc_issues.json", github_token)
cv_issues = _fetch_and_save_repo_issues(cv, "opencv/opencv", "cv_issues.json", github_token)