## Repository selection
We want to list and select the repositories that have a high number of CVE references pointing to them.

In [None]:
import pandas as pd
from  datasets import load_dataset

# CVE reference table; its `id` and `url` columns are used by the cells below.
references_df = load_dataset('Eathus/cve-references-list', split='train').to_pandas()

# Vulnerability records, later joined to the scraped issues on `id`.
ds = load_dataset('Eathus/filtered-vulnerabilities', split='train')
vulnerabilities_df = ds.to_pandas()

In [None]:
# Keep only the references whose URL looks like a GitHub issue link.
# `na=False` makes rows with a missing URL count as "no match"; without it the
# mask contains NaN and pandas refuses to use it for boolean indexing.
ref_df = references_df[
    references_df['url'].str.contains('github.*issues', regex=True, na=False)
]
# Optional publication-date filter, intentionally left disabled:
#ref_df = ref_df[pd.to_datetime(ref_df.published) > pd.to_datetime('2023-05-01')]

In [None]:
import re 
from tqdm import tqdm

# Built once, outside the loop: captures (owner, repo) from an issue URL.
issue_url_re = re.compile(r'https://github.com/([^/]+)/([^/]+)/issues/')

# Distinct (owner, repo, CVE id) triples.  Using a set means a CVE that
# references several issues of one repository is stored once for that repo.
repos = set()

# The loop variable is `cve_id`, not `id`, so the builtin `id()` is not
# shadowed for every cell that runs afterwards.
for cve_id, url in tqdm(zip(ref_df['id'], ref_df['url']), total=len(ref_df)):
    match = issue_url_re.match(url)
    if match:
        owner, repo = match.groups()
        repos.add((owner, repo, cve_id))

print(len(repos))

In [None]:
# One row per (owner, repo, CVE) triple collected in `repos`.
df = pd.DataFrame(repos, columns=['owner', 'repo_name', 'cve'])

# Number of CVE rows per repository, most-referenced repositories first.
df_group = df.groupby(['owner', 'repo_name']).count()
df_group_sorted = df_group.sort_values(by='cve', ascending=False)

# Total over every repository, then over the 50 largest ones only.
print(int(df_group_sorted['cve'].sum()))
top_50_groups = df_group_sorted.head(50)
print(int(top_50_groups['cve'].sum()))
display(df_group_sorted.head(50))

In [None]:
# Every (owner, repo_name) pair present in the index of the grouped frame.
repos_set = {(owner_name, repo_name) for owner_name, repo_name in df_group_sorted.index}
display(repos_set)

## Select all the issues/pulls from CVE references that are linked to one of those repos

In [None]:
# References that point at a numbered GitHub issue.  `na=False` treats a
# missing URL as "no match" instead of letting NaN break the boolean mask.
# `.copy()` so that adding a column later does not write into a view.
filtered_references_df_tmp = references_df[
    references_df['url'].str.contains(
        'github.com/[^/]+/[^/]+/issues/[0-9]+', regex=True, na=False)
].copy()


def funct(url):
    """Extract the ``(owner, repo)`` pair from a GitHub URL.

    The URL has to start with ``https://github.com/<owner>/<repo>/``; the
    slash after the repository name is required for a match.

    Args:
        url: The URL string to inspect.

    Returns:
        A 2-tuple ``(owner, repo)``, or ``None`` when the URL does not match.
    """
    found = re.match(r'https://github.com/([^/]+)/([^/]+)/', url)
    return found.groups() if found else None


# Tag every reference with the (owner, repo) pair parsed from its URL, then
# keep only the references that belong to one of the selected repositories.
filtered_references_df_tmp['owner_repo'] = filtered_references_df_tmp['url'].map(funct)
in_selected_repo = filtered_references_df_tmp['owner_repo'].isin(repos_set)
filtered_references_df = filtered_references_df_tmp[in_selected_repo]
filtered_references_df.sample(5)

In [None]:
print('filtered_references_df len:\t', len(filtered_references_df))

## Scrape the GitHub issues referenced by the CVEs

In [None]:
import requests
import os
# Credentials for three GitHub accounts, read from environment variables.
USER_MAIN, API_KEY_MAIN = os.getenv("GITHUB_USER_MAIN"), os.getenv("GITHUB_API_KEY_MAIN")
USER_PRIV, API_KEY_PRIV = os.getenv("GITHUB_USER_PRIV"), os.getenv("GITHUB_API_KEY_PRIV")
USER_SCHOOL, API_KEY_SCHOOL = os.getenv("GITHUB_USER_SCHOOL"), os.getenv("GITHUB_API_KEY_SCHOOL")

# Request counter; `_getter` uses it to choose which entry of `dict_auth` to use.
counter = 0

# One entry per account, in the order MAIN, PRIV, SCHOOL.
dict_auth = [
    dict(user=USER_MAIN, secret=API_KEY_MAIN),
    dict(user=USER_PRIV, secret=API_KEY_PRIV),
    dict(user=USER_SCHOOL, secret=API_KEY_SCHOOL),
]


def get_issue(url_html):
    """Fetch one GitHub issue given the URL of its web page.

    The web URL is turned into the matching REST API URL by rewriting the
    first ``github.com`` to ``api.github.com/repos``.  Only the first
    occurrence is rewritten, so a repository whose own name contains
    ``github.com`` (e.g. ``user/user.github.com``) keeps its name intact.

    Args:
        url_html: Web URL of the issue.

    Returns:
        The generator created by ``_getter``; it yields at most one record.
    """
    url = url_html.replace("github.com", "api.github.com/repos", 1)
    return _getter(url, url_html)


def _getter(url, html_url):
    """Request one issue from the GitHub API and yield it as a flat record.

    This is a generator that yields at most one dict.  Nothing is yielded when
    the API answers with a non-200 status or when the payload lacks one of the
    expected fields; both cases are reported on stdout.

    Args:
        url: API URL of the issue.
        html_url: Web URL the request originated from; stored under the
            ``"url"`` key of the record so callers can join on it.

    Yields:
        A dict with the keys ``url``, ``body``, ``title``, ``comments_url``,
        ``comments_count``, ``created_at``, ``updated_at``, ``html_url``,
        ``github_id`` and ``number``.

    Raises:
        requests.RequestException: On network failure or when the request
            exceeds the timeout.
    """
    global counter

    counter += 1
    # Rotate through however many accounts are configured.
    account = dict_auth[counter % len(dict_auth)]
    # Same timeout as the paginated getter, so a stalled connection cannot
    # hang the scraping loop forever.
    response = requests.get(
        url, auth=(account['user'], account['secret']), timeout=100)

    if response.status_code != 200:
        print(f"Error for URL: {url}")
        print(f"Status Code: {response.status_code}")
        # Error bodies are not guaranteed to be JSON; fall back to raw text.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        print(f"Response: {details}")
        return  # Skip this URL

    res = response.json()
    # Build the record before yielding so that only a malformed payload is
    # caught here, not an exception raised while the generator is suspended.
    try:
        record = {
            "url": html_url,
            "body": res['body'],
            "title": res['title'],
            "comments_url": res['comments_url'],
            "comments_count": res['comments'],
            "created_at": res['created_at'],
            "updated_at": res["updated_at"],
            "html_url": res["html_url"],
            "github_id": res["id"],
            "number": res["number"],
        }
    except (KeyError, TypeError) as e:
        print(url)
        print(f"Exception: {e}")
        return
    yield record


In [None]:
data = []

# Fetch each issue once.  The same issue URL can be referenced by several
# CVEs; fetching it per row would waste API calls and, because the result is
# later merged back on `url`, would multiply the matching rows.
unique_urls = filtered_references_df['url'].drop_duplicates()
for issue_url in tqdm(unique_urls, total=len(unique_urls)):
    data.extend(get_issue(issue_url))

In [None]:
# Materialise the scraped records once for the preview and the row count.
scraped_df = pd.DataFrame(data)
display(scraped_df.head())
print(scraped_df.shape[0])

In [None]:
# Attach the scraped issue fields to their reference rows (pandas' default
# inner join on the shared `url` column).
issues_reference_df = filtered_references_df.merge(pd.DataFrame(data), on='url')
print(issues_reference_df.shape[0])

In [None]:
issues_reference_df.head(1)

In [None]:
# Size of the issue table before it is joined with the vulnerability records.
non_filtered_count = issues_reference_df.shape[0]

# Join on the CVE id, then discard the `_y`-suffixed copy of `published`
# that comes from the right-hand frame.
total_df = vulnerabilities_df.merge(issues_reference_df, on='id')
total_df = total_df.drop(columns=['published_y'])
display(total_df.head(5))

# Rows that carry a primary CWE.
non_na_count = total_df.dropna(subset='primary_cwe').shape[0]
print('non filtered issue count\t', non_filtered_count)
print('filtered issue count:\t', total_df.shape[0])
print('non na issue count:\t', non_na_count)
print('percentage of filtered:\t', non_na_count / total_df.shape[0])
print('percentage of non filtered:\t', non_na_count / non_filtered_count)

In [None]:
total_df.columns

In [None]:
# Remove the leftover index column from the frame.
total_df = total_df.drop('__index_level_0__', axis=1)
total_df.columns

In [None]:
# The `weaknesses` column is not kept in the final table.
total_df = total_df.drop('weaknesses', axis=1)

In [None]:
# Old column name -> new column name.  Vulnerability fields get a `cve_`
# prefix and fields scraped from GitHub get an `issue_` prefix.
d = dict(
    id='cve_id',
    published_x='cve_published',
    descriptions='cve_descriptions',
    metrics='cve_metrics',
    references='cve_references',
    configurations='cve_configurations',
    cwe_list='cve_cwe_list',
    primary_cwe='cve_primary_cwe',
    tags='cve_tags',
    owner_repo='issue_owner_repo',
    body='issue_body',
    title='issue_title',
    comments_url='issue_comments_url',
    comments_count='issue_comments_count',
    created_at='issue_created_at',
    updated_at='issue_updated_at',
    html_url='issue_html_url',
    github_id='issue_github_id',
    number='issue_number',
)
total_df = total_df.rename(columns=d)
display(total_df.sample(1))
len(total_df)

In [None]:
# Keep only the rows that have a primary CWE.
has_primary_cwe = total_df['cve_primary_cwe'].notna()
total_df = total_df[has_primary_cwe]
display(total_df.columns)
len(total_df)

In [None]:
import datasets

# `preserve_index=False`: the frame was row-filtered, so it no longer has a
# default index, and `from_pandas` would otherwise store that index as an
# extra `__index_level_0__` column in the published dataset.
dataset = datasets.Dataset.from_pandas(total_df, preserve_index=False)
dataset.push_to_hub("Eathus/github-issues-references-max")

## All issues of top 50 repos (negative + positive dataset)

In [None]:
import requests
import os

# Credentials for three GitHub accounts, read from environment variables.
USER_MAIN, API_KEY_MAIN = os.getenv("GITHUB_USER_MAIN"), os.getenv("GITHUB_API_KEY_MAIN")
USER_PRIV, API_KEY_PRIV = os.getenv("GITHUB_USER_PRIV"), os.getenv("GITHUB_API_KEY_PRIV")
USER_SCHOOL, API_KEY_SCHOOL = os.getenv("GITHUB_USER_SCHOOL"), os.getenv("GITHUB_API_KEY_SCHOOL")

# Request counter; `_getter` uses it to choose which entry of `dict_auth` to use.
counter = 0

# One entry per account, in the order MAIN, PRIV, SCHOOL.
dict_auth = [
    dict(user=USER_MAIN, secret=API_KEY_MAIN),
    dict(user=USER_PRIV, secret=API_KEY_PRIV),
    dict(user=USER_SCHOOL, secret=API_KEY_SCHOOL),
]
    
def get_issues(username, repo, ipp=100):
    """Iterate over the issues of a repository, in every state.

    Args:
        username: Owner of the repository.
        repo: Name of the repository.
        ipp: Items per page requested from the API.

    Returns:
        The generator created by ``_getter``, which yields the items of each
        result page in turn.
    """
    # The f-string already interpolates every field; running `.format()` over
    # the result was redundant and would fail on a literal brace in a value.
    url = (
        f"https://api.github.com/repos/{username}/{repo}/issues"
        f"?state=all&page=1&per_page={ipp}"
    )
    return _getter(url)


def _getter(url):
    """Yield every item of a paginated GitHub API listing.

    Starting at ``url``, each page is requested and its items are yielded one
    by one; the next page is taken from the ``next`` relation of the response's
    ``link`` header, and iteration ends when there is none.

    NOTE: when a page answers with a non-200 status the error is printed and
    the generator simply stops, so the caller receives the items of the pages
    fetched so far without any signal that the listing is incomplete.

    Args:
        url: URL of the first page.

    Yields:
        The decoded JSON items of each page.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    global counter

    counter += 1
    # One account is picked per listing and reused for all of its pages.
    account = dict_auth[counter % len(dict_auth)]
    auth = (account['user'], account['secret'])

    next_url = url
    while next_url:
        response = requests.get(next_url, auth=auth, timeout=100)

        if response.status_code != 200:
            # Report the page that actually failed, not the first page.
            print(f"Error for URL: {next_url}")
            print(f"Status Code: {response.status_code}")
            # Error bodies are not guaranteed to be JSON; fall back to text.
            try:
                details = response.json()
            except ValueError:
                details = response.text
            print(f"Response: {details}")
            return  # Skip the rest of this listing

        yield from response.json()

        next_url = _link_field_to_dict(response.headers.get('link')).get('next')

def _link_field_to_dict(field):

    if not field:
        return dict()

    return dict([
        (
            part.split('; ')[1][5:-1],
            part.split('; ')[0][1:-1],
        ) for part in field.split(', ')
    ])


In [None]:
import pickle
from itertools import chain
repos_lists = []
from tqdm.notebook import tqdm

file_path = "tmp/neg_issues.pkl"
save_freq = 10  # write a checkpoint every `save_freq` repositories

# Resume state: {(owner, repo): [issue records]}.  Starts empty and is
# replaced by the checkpoint when one can be loaded.
issues = {}

if os.path.exists(file_path):
    file_size = os.path.getsize(file_path)
    print(f"File size: {file_size} bytes")

    if file_size == 0:
        # Nothing to lose in an empty file: start from scratch.
        print("Error: File is empty.")
    else:
        # NOTE: pickle executes code on load; this must only ever read the
        # checkpoint written by this notebook, never an untrusted file.
        try:
            with open(file_path, "rb") as file:
                issues = pickle.load(file)
        except (pickle.PickleError, EOFError) as e:
            # Stop here: continuing with an empty dict would re-scrape
            # everything and overwrite whatever is left of the checkpoint.
            print(f"Error loading the pickle file (corrupted or incomplete): {e}")
            raise
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            raise
        print("Data loaded successfully.")
else:
    print(f"The file '{file_path}' does not exist. issues = empty dict")

# Make sure the checkpoint directory exists before the first save.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Only repositories that are not in the checkpoint yet.
iter_set = repos_set.difference(issues.keys())
for i, (owner, repo) in enumerate(tqdm(iter_set, desc="Processing repos", position=0)):
    fetched = tqdm(
        get_issues(owner, repo, 100),
        desc=f"Fetching {owner}/{repo} issues",
        position=1,
        leave=False  # Clears the inner bar when done
    )
    # Items carrying a "pull_request" key are skipped.
    issues[(owner, repo)] = [
        {
            "issue_owner_repo": (owner, repo),
            "issue_body": issue['body'],
            "issue_title": issue['title'],
            "issue_comments_url": issue['comments_url'],
            "issue_comments_count": issue['comments'],
            "issue_created_at": issue['created_at'],
            "issue_updated_at": issue["updated_at"],
            "issue_html_url": issue["html_url"],
            "issue_github_id": issue["id"],
            "issue_number": issue["number"],
        }
        for issue in fetched
        if "pull_request" not in issue
    ]
    if (i + 1) % save_freq == 0 or i == len(iter_set) - 1:
        # Write to a temporary file and swap it in, so an interruption during
        # the dump cannot corrupt the previous checkpoint.
        tmp_path = file_path + ".tmp"
        with open(tmp_path, "wb") as file:  # 'wb' mode writes in binary format
            pickle.dump(issues, file, protocol=pickle.HIGHEST_PROTOCOL)
            file.flush()  # Force write to disk
            os.fsync(file.fileno())
        os.replace(tmp_path, file_path)

        tqdm.write(f"🔄 Auto-saved at {i + 1} repos", end="\r")

print("\n✅ Final save completed")

# Flatten the per-repository lists in linear time; `sum(lists, [])` copies
# the accumulated list again for every repository.
df = pd.DataFrame(list(chain.from_iterable(issues.values())))

In [None]:
list(issues.items())[0]

In [None]:
from itertools import chain

# Flatten the per-repository lists in linear time; `sum(lists, [])` copies
# the accumulated list again for every repository, which is quadratic.
df = pd.DataFrame(list(chain.from_iterable(issues.values())))
display(len(df))
display(df.head())

In [1]:
import os
import pickle
from itertools import chain

import pandas as pd

file_path = "tmp/neg_issues.pkl"

# Defined up front so the flattening step below always has a dict to read,
# even when there is no (or only an empty) checkpoint file.
issues = {}

if os.path.exists(file_path):
    file_size = os.path.getsize(file_path)
    print(f"File size: {file_size} bytes")

    if file_size == 0:
        print("Error: File is empty.")
    else:
        # NOTE: pickle executes code on load; only read the checkpoint
        # written by this notebook, never an untrusted file.
        try:
            with open(file_path, "rb") as file:
                issues = pickle.load(file)
        except (pickle.PickleError, EOFError) as e:
            print(f"Error loading the pickle file (corrupted or incomplete): {e}")
            raise
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            raise
        print("Data loaded successfully.")
else:
    print(f"The file '{file_path}' does not exist. issues = empty dict")

# Flatten the per-repository lists in linear time; `sum(lists, [])` copies
# the accumulated list again for every repository.
df = pd.DataFrame(list(chain.from_iterable(issues.values())))

File size: 6934251740 bytes
Data loaded successfully.


In [2]:
import gc
issues.clear()
gc.collect()

0

In [3]:
chunk_size = 100_000  # Adjust based on your memory
# Consecutive row slices of at most `chunk_size` rows each.
chunks = [
    df.iloc[start:start + chunk_size]
    for start in range(0, len(df), chunk_size)
]

In [4]:
from datasets import Dataset, DatasetDict, concatenate_datasets

# Convert each slice separately; the pandas index is not stored.
partial_datasets = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")
    partial_datasets.append(Dataset.from_pandas(chunk, preserve_index=False))

Processing chunk 1/36
Processing chunk 2/36
Processing chunk 3/36
Processing chunk 4/36
Processing chunk 5/36
Processing chunk 6/36
Processing chunk 7/36
Processing chunk 8/36
Processing chunk 9/36
Processing chunk 10/36
Processing chunk 11/36
Processing chunk 12/36
Processing chunk 13/36
Processing chunk 14/36
Processing chunk 15/36
Processing chunk 16/36
Processing chunk 17/36
Processing chunk 18/36
Processing chunk 19/36
Processing chunk 20/36
Processing chunk 21/36
Processing chunk 22/36
Processing chunk 23/36
Processing chunk 24/36
Processing chunk 25/36
Processing chunk 26/36
Processing chunk 27/36
Processing chunk 28/36
Processing chunk 29/36
Processing chunk 30/36
Processing chunk 31/36
Processing chunk 32/36
Processing chunk 33/36
Processing chunk 34/36
Processing chunk 35/36
Processing chunk 36/36


In [5]:
full_dataset = concatenate_datasets(partial_datasets)

In [6]:
full_dataset.push_to_hub("Eathus/github-issues-negatives-max")

Uploading the dataset shards:   0%|          | 0/15 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/240 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Eathus/github-issues-negatives-max/commit/c61218277e21ca4b8167e47ba39b4dbb816ff349', commit_message='Upload dataset', commit_description='', oid='c61218277e21ca4b8167e47ba39b4dbb816ff349', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Eathus/github-issues-negatives-max', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Eathus/github-issues-negatives-max'), pr_revision=None, pr_num=None)

In [None]:
import datasets
import gc

dataset = datasets.Dataset.from_pandas(df)
del df
gc.collect()

In [None]:
import datasets

# Converts the whole frame in one call (no chunking) and uploads it.
# NOTE(review): the preceding cell runs `del df` right after building
# `dataset`; if that cell ran first, `df` no longer exists here. Confirm the
# intended execution order, or whether this cell is a leftover alternative.
dataset = datasets.Dataset.from_pandas(df)

# Targets the same Hub repository as the chunked upload earlier in the notebook.
dataset.push_to_hub("Eathus/github-issues-negatives-max")

## Repair

In [None]:
import datasets
import pandas as pd
from ast import literal_eval  # Needed if issue_owner_repo is stored as string

dataset = datasets.load_dataset("Eathus/github-issues-negatives")

# Assuming it's in the default "train" split
df = dataset["train"].to_pandas()


In [None]:
df['issue_owner_repo'].head(3)

In [None]:


# 1. Load the published dataset from the Hugging Face Hub.
# NOTE(review): this reads "Eathus/github-issues-negatives" while the upload
# cells push to "Eathus/github-issues-negatives-max"; confirm which is meant.
dataset = datasets.load_dataset("Eathus/github-issues-negatives")

# The rows are taken from the "train" split.
df = dataset["train"].to_pandas()

# 2. Target structure to rebuild: {(owner, repo): [issue records]}.
issues = {}

# Turn each stored owner/repo value (indexable as [0], [1]) back into a
# plain 2-tuple.
df['issue_owner_repo'] = df['issue_owner_repo'].map(lambda pair: (pair[0], pair[1]))


In [None]:
df['issue_owner_repo'].head(5)

In [None]:

# Per-issue fields, in the order used when the records were first scraped.
# The owner/repo pair is added separately from the group key.
issue_value_columns = [
    "issue_body",
    "issue_title",
    "issue_comments_url",
    "issue_comments_count",
    "issue_created_at",
    "issue_updated_at",
    "issue_html_url",
    "issue_github_id",
    "issue_number",
]

# Group by owner/repo tuple and convert each group back to the original item
# format.  The values are read from the rows of the group under the dataset's
# own column names; the previous code read from a stale `issue` variable with
# the raw API key names, so every record was wrong (or the cell failed).
for (owner, repo), group in df.groupby('issue_owner_repo'):
    issues[(owner, repo)] = [
        {"issue_owner_repo": (owner, repo), **record}
        for record in group[issue_value_columns].to_dict('records')
    ]


In [None]:
list(issues.items())[0]

In [None]:

# Imported here because the cells of this "Repair" section do not import them.
import os
import pickle

# Now you can save it back to pickle if needed
file_path = "tmp/neg_issues.pkl"
# Create the target directory if it is missing, so `open` cannot fail on it.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "wb") as file:
    pickle.dump(issues, file, protocol=pickle.HIGHEST_PROTOCOL)
    file.flush()  # push Python's buffer to the OS ...
    os.fsync(file.fileno())  # ... and ask the OS to write it to disk