## Repository selection
We want to list and select repositories high number of CVEs references to them.   

In [1]:
import pandas as pd
from  datasets import load_dataset

ds = load_dataset('Eathus/cve-references-list', split='train')
references_df = ds.to_pandas()

ds = load_dataset('Eathus/filtered-vulnerabilities', split='train')
vulnerabilities_df = ds.to_pandas()

In [2]:
ref_df = references_df[references_df['url'].str.contains('github.*issues')]
#ref_df = ref_df[pd.to_datetime(ref_df.published) > pd.to_datetime('2023-05-01')]

In [3]:
import re 
from tqdm import tqdm

repos = set()

for id, url in tqdm(ref_df[['id','url']].itertuples(index=False)):
    
    pattern = r'https://github.com/([^/]+)/([^/]+)/issues/'
    match = re.match(pattern, url)
    if match:
        owner = match.group(1)
        repo = match.group(2)
        repos.add((owner, repo, id))
        
print(len(repos))

7509it [00:00, 365312.23it/s]

7206





In [4]:
df = pd.DataFrame(repos, columns=['owner', 'repo_name', 'cve'])
df_group = df.groupby(['owner', 'repo_name']).count()
df_group_sorted = df_group.sort_values(by='cve', ascending=False)
print(sum(df_group_sorted.cve.tolist()))
top_50_groups = df_group_sorted.head(50)
top_50_groups.head(50)
print(sum(top_50_groups.cve.tolist()))
display(df_group_sorted.head(50))

7206
1861


Unnamed: 0_level_0,Unnamed: 1_level_0,cve
owner,repo_name,Unnamed: 2_level_1
gpac,gpac,261
axiomatic-systems,Bento4,93
jerryscript-project,jerryscript,86
ImageMagick,ImageMagick,74
cesanta,mjs,73
LibreDWG,libredwg,67
matthiaskramm,swftools,65
strukturag,libde265,52
FasterXML,jackson-databind,49
odoo,odoo,47


In [5]:
repos_set = set(df_group_sorted.index)
display(repos_set)

{('buger', 'jsonparser'),
 ('miroslavpejic85', 'mirotalk'),
 ('PrivateBin', 'PrivateBin'),
 ('IBAX-io', 'go-ibax'),
 ('docsifyjs', 'docsify'),
 ('SublimeTextIssues', 'Core'),
 ('eclipse-ee4j', 'mojarra'),
 ('excellentoldtv', 'portfolioCMS-issues'),
 ('george518', 'PPGo_Job'),
 ('pagehelper', 'Mybatis-PageHelper'),
 ('firecracker-microvm', 'firecracker'),
 ('theupdateframework', 'tuf'),
 ('nicolas-van', 'modern-async'),
 ('galkahana', 'HummusJS'),
 ('grayfullbuster0804', 'netbox'),
 ('ossec', 'ossec-hids'),
 ('democritus-project', 'd8s-math'),
 ('sparklemotion', 'nokogiri'),
 ('zhaopengme', 'gitnote'),
 ('TDuckCloud', 'tduck-platform'),
 ('ff4j', 'ff4j'),
 ('vitejs', 'vite'),
 ('kubernetes-sigs', 'secrets-store-csi-driver'),
 ('ZoneMinder', 'zoneminder'),
 ('axios', 'axios'),
 ('antirez', 'kilo'),
 ('dromara', 'lamp-cloud'),
 ('mhart', 'StringStream'),
 ('poropro', 'kuaifan'),
 ('GodEpic', 'chaojicms'),
 ('kata-containers', 'kata-containers'),
 ('redis', 'redis-py'),
 ('maybe-why-not', 

## Select all the issues/pull form  CVEs references that are linked to one of those repos 

In [6]:
filtered_references_df_tmp = references_df[
    references_df['url'].str.contains('github.com/[^/]+/[^/]+/issues/[0-9]+', regex=True)].copy()


def funct(url):
    pattern = r'https://github.com/([^/]+)/([^/]+)/'
    match = re.match(pattern, url)
    if match:
        owner = match.group(1)
        repo = match.group(2)
        return (owner, repo)
    return None


filtered_references_df_tmp['owner_repo'] = filtered_references_df_tmp['url'].map(funct)
filtered_references_df = filtered_references_df_tmp[filtered_references_df_tmp.owner_repo.isin(repos_set)]
filtered_references_df.sample(5)

Unnamed: 0,id,published,url,tags,domain,__index_level_0__,owner_repo
110212,CVE-2022-45970,2022-12-12T14:15:10.557,https://github.com/alist-org/alist/issues/2457,"[Exploit, Third Party Advisory]",github.com,365023,"(alist-org, alist)"
103229,CVE-2022-40426,2022-09-19T16:15:11.893,https://github.com/democritus-project/d8s-asns...,"[Exploit, Issue Tracking, Third Party Advisory]",github.com,343692,"(democritus-project, d8s-asns)"
90156,CVE-2022-28448,2022-04-26T20:15:35.780,https://github.com/nopSolutions/nopCommerce/is...,"[Exploit, Issue Tracking, Third Party Advisory]",github.com,305187,"(nopSolutions, nopCommerce)"
119696,CVE-2023-26957,2023-03-09T21:15:10.963,https://github.com/keheying/onekeyadmin/issues/3,"[Exploit, Issue Tracking]",github.com,391993,"(keheying, onekeyadmin)"
122670,CVE-2020-19693,2023-04-04T15:15:07.697,https://github.com/espruino/Espruino/issues/1684,"[Exploit, Issue Tracking, Patch]",github.com,400046,"(espruino, Espruino)"


In [7]:
print('filtered_references_df len:\t', len(filtered_references_df))

filtered_references_df len:	 7454


## Scrape issues from GitHub referenced Issues

In [None]:
import requests
import os
USER_MAIN = os.getenv("GITHUB_USER_MAIN")
USER_PRIV = os.getenv("GITHUB_USER_PRIV")
USER_SCHOOL = os.getenv("GITHUB_USER_SCHOOL")

API_KEY_MAIN = os.getenv("GITHUB_API_KEY_MAIN")
API_KEY_PRIV = os.getenv("GITHUB_API_KEY_PRIV")
API_KEY_SCHOOL = os.getenv("GITHUB_API_KEY_SCHOOL")
counter = 0
dict_auth = [
    {
        'user': USER_MAIN,
        'secret': API_KEY_MAIN

    },
    {
        'user': USER_PRIV,
        'secret': API_KEY_PRIV
    },
    {
        'user': USER_SCHOOL,
        'secret': API_KEY_SCHOOL
    }
]


def get_issue(url_html):
    
    url = url_html.replace("github.com", "api.github.com/repos")
    return _getter(url, url_html)


def _getter(url, html_url):
    global counter

    counter += 1
    auth = (dict_auth[counter % 3]['user'], dict_auth[counter % 3]['secret'])
    response = requests.get(url, auth=auth)

    if response.status_code != 200:
        print(f"Error for URL: {url}")
        print(f"Status Code: {response.status_code}")
        print(f"Response: {response.json()}")
        return  # Skip this URL
    
    res = response.json()
    try:
        yield {
            "url": html_url,
            "body": res['body'],
            "title": res['title'],
            "comments_url": res['comments_url'],
            "comments_count": res['comments'],
            "created_at": res['created_at'],
            "updated_at": res["updated_at"],
            "html_url": res["html_url"],
            "github_id": res["id"],
            "number": res["number"]
    
        }
    except Exception as e:
        print(url)
        print(f"Exception: {e}")


In [None]:
data = []

for index, row in tqdm(filtered_references_df.iterrows()):
    for item in get_issue(row['url']):
        data.append(item)

In [None]:
display(pd.DataFrame(data).head())
print(len(pd.DataFrame(data)))

In [None]:
issues_reference_df = pd.merge(
    filtered_references_df,
    pd.DataFrame(data),
    on='url'
)
print(len(issues_reference_df))

In [None]:
issues_reference_df.head(1)

In [None]:
non_filtered_count = len(issues_reference_df)
total_df = pd.merge(vulnerabilities_df, issues_reference_df, on='id')
total_df = total_df.drop('published_y', axis=1)
display(total_df.head(5))
non_na_count = len(total_df.dropna(subset='primary_cwe'))
print('non filtered issue count\t', non_filtered_count)
print('filtered issue count:\t', len(total_df))
print('non na issue count:\t', non_na_count)
print('percentage of filtered:\t', non_na_count / len(total_df))
print('percentage of non filtered:\t', non_na_count / non_filtered_count)

In [None]:
total_df.columns

In [None]:
total_df = total_df.drop(columns=['__index_level_0__'])
total_df.columns

In [None]:
total_df = total_df.drop(columns=['weaknesses', ])

In [None]:
d = {
    'id': 'cve_id',
    'published_x': 'cve_published',
    'descriptions': 'cve_descriptions',
    'metrics': 'cve_metrics',
    'references': 'cve_references',
    'configurations': 'cve_configurations',
    'cwe_list': 'cve_cwe_list',
    'primary_cwe': 'cve_primary_cwe',
    'tags': 'cve_tags',
    'owner_repo': 'issue_owner_repo',
    'body': 'issue_body',
    'title': 'issue_title',
    'comments_url': 'issue_comments_url',
    'comments_count': 'issue_comments_count',
    'created_at': 'issue_created_at',
    'updated_at': 'issue_updated_at',
    'html_url': 'issue_html_url',
    'github_id': 'issue_github_id',
    'number': 'issue_number'
}
total_df =total_df.rename(columns=d)
display(total_df.sample(1))
len(total_df)

In [None]:
total_df = total_df.dropna(subset='cve_primary_cwe')
display(total_df.columns)
len(total_df)

In [None]:
import datasets

dataset = datasets.Dataset.from_pandas(total_df)
#dataset = ds.remove_columns(['__index_level_0__'])
dataset.push_to_hub("Eathus/github-issues-references-max")

## All issues of top 50 repos (negative + positive dataset)

In [8]:
import requests
import os

counter = 0
USER_MAIN = os.getenv("GITHUB_USER_MAIN")
USER_PRIV = os.getenv("GITHUB_USER_PRIV")
USER_SCHOOL = os.getenv("GITHUB_USER_SCHOOL")

API_KEY_MAIN = os.getenv("GITHUB_API_KEY_MAIN")
API_KEY_PRIV = os.getenv("GITHUB_API_KEY_PRIV")
API_KEY_SCHOOL = os.getenv("GITHUB_API_KEY_SCHOOL")
counter = 0
dict_auth = [
    {
        'user': USER_MAIN,
        'secret': API_KEY_MAIN

    },
    {
        'user': USER_PRIV,
        'secret': API_KEY_PRIV
    },
    {
        'user': USER_SCHOOL,
        'secret': API_KEY_SCHOOL
    }
]
    
def get_issues(username, repo, ipp=100):
    
    
    tmpl = f"https://api.github.com/repos/{username}/{repo}/issues?state=all&page=1&per_page={ipp}"
    url = tmpl.format(username=username, repo=repo)
    return _getter(url)


def _getter(url):
    global  counter

    counter += 1
    link = dict(next=url)
    while 'next' in link:
        auth = (dict_auth[counter%3]['user'], dict_auth[counter%3]['secret'])
        response = requests.get(link['next'], auth=auth, timeout=100)
    
        if response.status_code != 200:
            print(f"Error for URL: {url}")
            print(f"Status Code: {response.status_code}")
            print(f"Response: {response.json()}")
            return  # Skip this URL
    
        '''
        if response.status_code != 200:
            raise IOError(
                "Non-200 status code %r; %r; %r" % (
                    response.status_code, url, response.json()))
        '''
        for result in response.json():
            yield result
    
        link = _link_field_to_dict(response.headers.get('link', None))

def _link_field_to_dict(field):

    if not field:
        return dict()

    return dict([
        (
            part.split('; ')[1][5:-1],
            part.split('; ')[0][1:-1],
        ) for part in field.split(', ')
    ])


In [9]:
import pickle
repos_lists = []
from tqdm.notebook import tqdm

file_path = "tmp/neg_issues.pkl"
save_freq = 10

if os.path.exists(file_path):
    file_size = os.path.getsize(file_path)
    print(f"File size: {file_size} bytes")
    
    if file_size == 0:
        print("Error: File is empty.")
    else:
        try:
            with open(file_path, "rb") as file:
                issues = pickle.load(file)
            print("Data loaded successfully.")
        except (pickle.PickleError, EOFError) as e:
            print(f"Error loading the pickle file (corrupted or incomplete): {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
else:
    print(f"The file '{file_path}' does not exist. issues = empty dict")
    issues = {}

iter_set = repos_set.difference(issues.keys())
for i, (owner, repo) in enumerate(tqdm(iter_set, desc="Processing repos", position=0)):   
    if (owner, repo) in issues : 
        continue
    repo_issues = []
    for issue in tqdm(
        get_issues(owner, repo, 100),
        desc=f"Fetching {owner}/{repo} issues",
        position=1,
        leave=False  # Clears the inner bar when done
    ):
        if "pull_request" not in issue.keys():
            item = {
                "issue_owner_repo": (owner, repo),
                "issue_body": issue['body'],
                "issue_title":issue['title'],
                "issue_comments_url": issue['comments_url'],
                "issue_comments_count": issue['comments'],
                "issue_created_at": issue['created_at'],
                "issue_updated_at": issue["updated_at"],
                "issue_html_url": issue["html_url"],
                "issue_github_id": issue["id"],
                "issue_number": issue["number"],
            }
            repo_issues.append(item)
    issues[(owner, repo)] = repo_issues
    if (i + 1) % save_freq == 0  or  i == len(iter_set) - 1:
        with open(file_path, "wb") as file:  # 'wb' mode writes in binary format
            pickle.dump(issues, file,  protocol=pickle.HIGHEST_PROTOCOL)
            file.flush()  # Force write to disk
            os.fsync(file.fileno()) 
        
        tqdm.write(f"🔄 Auto-saved at {i + 1} repos", end="\r")
                
print("\n✅ Final save completed")
                
df = pd.DataFrame(sum(issues.values(), []))

File size: 6934251740 bytes
Data loaded successfully.


Processing repos: 0it [00:00, ?it/s]


✅ Final save completed


In [10]:
list(issues.items())[0]

(('ChurchCRM', 'CRM'),
 [{'issue_owner_repo': ('ChurchCRM', 'CRM'),
   'issue_body': "https://openj9-jenkins.osuosl.org/job/Test_openjdk22_j9_extended.openjdk_aarch64_linux_Personal/1\r\nserviceability_jvmti_j9_0\r\nserviceability_jvmti_j9_1\r\nserviceability/jvmti/thread/GetStackTrace/GetStackTraceAndRetransformTest/GetStackTraceAndRetransformTest.java\r\n```\r\n16:30:07  STDERR:\r\n16:30:07  java.lang.UnsatisfiedLinkError: jdk/test/whitebox/WhiteBox.registerNatives()V\r\n16:30:07  \tat jdk.test.whitebox.WhiteBox.<clinit>(WhiteBox.java:67)\r\n16:30:07  \tat GetStackTraceAndRetransformTest.main(GetStackTraceAndRetransformTest.java:76)\r\n16:30:07  \tat java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)\r\n16:30:07  \tat java.base/java.lang.reflect.Method.invoke(Method.java:586)\r\n16:30:07  \tat com.sun.javatest.regtest.agent.MainWrapper$MainTask.run(MainWrapper.java:138)\r\n16:30:07  \tat java.base/java.lang.Thread.run(Thread.java:158

In [None]:
df = pd.DataFrame(sum(issues.values(), []))
display(len(df))
display(df.head())

KeyboardInterrupt: 

In [None]:
import datasets

dataset = datasets.Dataset.from_pandas(df)

dataset.push_to_hub("Eathus/github-issues-negatives-max")

## Repair

In [None]:
import datasets
import pandas as pd
from ast import literal_eval  # Needed if issue_owner_repo is stored as string

dataset = datasets.load_dataset("Eathus/github-issues-negatives")

# Assuming it's in the default "train" split
df = dataset["train"].to_pandas()


In [None]:
df['issue_owner_repo'].head(3)

In [None]:


# 1. Load the dataset from Hugging Face Hub
dataset = datasets.load_dataset("Eathus/github-issues-negatives")

# Assuming it's in the default "train" split
df = dataset["train"].to_pandas()

# 2. Convert back to the original issues dictionary structure
issues = {}

# If issue_owner_repo was stored as string (like "(owner, repo)"), convert it back to tuple
# If it was properly stored as tuple, you can skip the literal_eval part
df['issue_owner_repo'] = df['issue_owner_repo'].apply(lambda arr: (arr[0], arr[1]))


In [None]:
df['issue_owner_repo'].head(5)

In [None]:

# Group by owner/repo tuple and convert each group back to the original item format
for (owner, repo), group in df.groupby('issue_owner_repo'):
    repo_issues = []
    for _, row in group.iterrows():
        item = {
            "issue_owner_repo": (owner, repo),
            "issue_body": issue['body'],
            "issue_title":issue['title'],
            "issue_comments_url": issue['comments_url'],
            "issue_comments_count": issue['comments'],
            "issue_created_at": issue['created_at'],
            "issue_updated_at": issue["updated_at"],
            "issue_html_url": issue["html_url"],
            "issue_github_id": issue["id"],
            "issue_number": issue["number"],
        }
        repo_issues.append(item)
    issues[(owner, repo)] = repo_issues


In [None]:
list(issues.items())[0]

In [None]:

# Now you can save it back to pickle if needed
file_path = "tmp/neg_issues.pkl"
with open(file_path, "wb") as file:
    pickle.dump(issues, file, protocol=pickle.HIGHEST_PROTOCOL)
    file.flush()
    os.fsync(file.fileno())