In [19]:
import re

import pandas as pd

from issue import Issue
from tqdm import tqdm

In [20]:
# Path of the gzip-compressed CSV holding the raw issues.
DATA_FILE_PATH = "vscode_issues.csv.gzip" # "../teste.csv.gzip" # "../vscode_issues_SA.csv.gzip"

sample_dataset = pd.read_csv(DATA_FILE_PATH, compression='gzip', lineterminator='\n')

# One dict per CSV row, keyed by column name.
issues_dict_list = sample_dataset.to_dict('records')

issues = []
test_dataset = []

# Records the number of times each author was assigned as an Assignee.
# Later this dictionary is used to filter out authors who were assigned too few times.
times_as_assignee = {}

# (row position, error repr) for every row that raised while being processed.
# Failures are tolerated (best effort), but they are reported instead of being
# silently discarded so that data loss is visible.
failed_rows = []

for row_position, issue_dict in enumerate(tqdm(issues_dict_list)):
    try:
        new_issue = Issue.from_dict(issue_dict)
        issues.append(new_issue)

        author = new_issue.assignee
        if author:  # Skip issues without an assignee (None / empty)
            times_as_assignee[author] = times_as_assignee.get(author, 0) + 1
    except Exception as error:
        failed_rows.append((row_position, repr(error)))

if failed_rows:
    first_position, first_error = failed_rows[0]
    print(
        "{} of {} rows raised an error while loading; first failure at row {}: {}".format(
            len(failed_rows), len(issues_dict_list), first_position, first_error
        )
    )

100%|██████████| 184819/184819 [00:18<00:00, 9808.60it/s] 


In [21]:
# Report, one per line: number of loaded issues, number of distinct assignees,
# and the full assignee -> assignment-count mapping.
print(len(issues), len(times_as_assignee), times_as_assignee, sep="\n")

184812
116
{'mjbvz': 15630, 'rzhao271': 1423, 'lszomoru': 3903, 'roblourens': 9410, 'alexr00': 3423, 'sandy081': 8016, 'aiday-mar': 832, 'meganrogge': 5018, 'deepak1556': 4121, 'Tyriar': 12408, 'jrieken': 9073, 'bpasero': 11117, 'joshspicer': 42, 'justschen': 389, 'benibenj': 601, 'andreamah': 1163, 'alexdima': 7050, 'ulugbekna': 431, 'hediet': 2520, 'chrmarti': 1819, 'joaomoreno': 9579, 'connor4312': 2725, 'lramos15': 1832, 'TylerLeonhardt': 2089, 'karthiknadig': 25, 'rebornix': 5550, 'bhavyaus': 406, 'aeschli': 7067, 'Yoyokrazy': 302, 'eleanorjboyd': 32, 'amunger': 323, 'anthonykim1': 10, 'sbatten': 2510, 'joyceerhl': 479, 'isidorn': 7503, 'daviddossett': 338, 'DonJayamanne': 280, 'dbaeumer': 1879, 'bamurtaugh': 10, 'ntrogh': 3, 'chrisdias': 293, 'kieferrm': 314, 'hbons': 86, 'karrtikr': 15, 'cwebster-99': 1, 'digitarald': 79, 'brettcannon': 10, 'MeghanKulkarni': 17, 'paulacamargo25': 7, 'gregvanl': 102, 'danyeh': 122, 'esonnino': 7, 'csigs': 5, 'Chuxel': 6, 'weinand': 2712, 'egamma'

In [22]:
def apply_steps_to_dataset(processing_funcs, dataset):
    """
    Run every element of `dataset` through `processing_funcs`, in order.

    Each function receives the value returned by the previous one and must
    return the (possibly altered) issue. Returning None filters the issue
    out: the remaining functions are skipped for that element and it is
    not included in the result.

    Prints the size of the resulting dataset and returns it as a new list.
    """

    def _run_steps(item):
        # Thread the item through the steps, stopping at the first None.
        for step in processing_funcs:
            item = step(item)
            if item is None:
                return None
        return item

    processed = (_run_steps(entry) for entry in dataset)
    surviving = [entry for entry in processed if entry is not None]

    print("New dataset has " + str(len(surviving)) + " issues")

    return surviving

### Pre-Processing Steps

In [23]:
def filter_test_dataset(issue):
    """
    Keep only issues belonging to the test set.

    The issue identifier is converted to an int; the issue is returned when
    210000 < id <= 220000, otherwise None is returned.
    """
    numeric_id = int(issue.identifier)
    in_test_range = numeric_id > 210000 and numeric_id <= 220000
    return issue if in_test_range else None

In [24]:
def filter_main_training_dataset(issue):
    """
    Keep only issues belonging to the larger training set.

    The issue identifier is converted to an int; the issue is returned when
    id <= 210000, otherwise None is returned.
    """
    numeric_id = int(issue.identifier)
    if numeric_id > 210000:
        return None
    return issue

In [25]:
def filter_recent_issues_training_dataset(issue):
    """
    Keep only issues belonging to the recent-issues training set.

    The issue identifier is converted to an int; the issue is returned when
    190000 <= id <= 210000, otherwise None is returned.
    """
    numeric_id = int(issue.identifier)
    is_recent = numeric_id >= 190000 and numeric_id <= 210000
    return issue if is_recent else None

In [26]:
def filter_basic_trainingset_requirements(issue):
    """
    Keep only issues that satisfy the basic training-set requirements:
        (i) the issue is closed (its completion_time is not None);
        (ii) the issue has exactly one assignee (assignee is not None and,
             when it is a list, that list has exactly one element).

    Returns the issue when both hold, otherwise None.
    """
    # (i) An issue without a completion time is still open.
    is_closed = issue.completion_time is not None
    if not is_closed:
        return None

    # (ii) Reject a missing assignee, or a list that does not hold exactly one.
    assignee = issue.assignee
    if assignee is None:
        return None
    if isinstance(assignee, list) and len(assignee) != 1:
        return None

    return issue

In [27]:
def filter_unfrequent_assignees(issue):
    """
    Drop issues whose assignee was assigned too few times.

    The assignment count is looked up in the module-level `times_as_assignee`
    dictionary (missing assignees count as 0). The issue is returned only when
    it has an assignee whose count is strictly greater than the threshold (1);
    otherwise None is returned.
    """
    minimum_exclusive = 1

    assignee = issue.assignee

    # Keep the issue only for a known assignee seen more than `minimum_exclusive` times.
    if assignee is not None and times_as_assignee.get(assignee, 0) > minimum_exclusive:
        return issue

    return None

In [28]:
def clean_issue_title(issue):
	"""
	Cleans the issue field of the given issue.
	"""
	new_title = issue.summary

	# Remove mention to other issues
	new_title = re.sub(r"\[?\s*[Ff]ollow up to #?[\d]+\s*\]?", "", new_title)

	# Remove monospacing markdown formatting
	new_title = re.sub(r"`([\s\S]*?)`", r"\1", new_title)

	# Update the issue summary
	issue.summary = new_title
	
	# Return the updated issue object
	return issue


In [29]:
# Be cautious when changing these constants.
# They must match those used in the training notebook (.ipynb).
CODE_BEGIN_SENTINEL = "<BoC>"
CODE_END_SENTINEL = "<EoC>"


def _clean_prose_fragment(text):
    """Strips markdown/HTML formatting from a fragment of an issue body that is not a code block."""
    # Remove header markers (one or more # followed by a space) at the start of a line.
    # Anchoring to the line start keeps mid-sentence text such as "C# " intact.
    text = re.sub(r"(?m)^[ \t]*#+ ", "", text)

    # Remove emphasis (italics/bold) and inline-code markers, keeping the inner text.
    text = re.sub(r"_([\s\S]*?)_", r"\1", text)
    text = re.sub(r"\*([\s\S]*?)\*", r"\1", text)
    text = re.sub(r"`([\s\S]*?)`", r"\1", text)

    # Remove HTML tags, keeping only the text content.
    text = re.sub(r"<[\s\S]*?>", r"", text)

    # Replace markdown links and images ([text](target) / ![alt](target))
    # with their text description only.
    text = re.sub(r"!?\[([^\[\]]*)\]\([^)]*\)", r"\1", text)

    # Remove any remaining bare URLs, such as external links or attachments.
    text = re.sub(r"https?://[\S]+", "", text)

    # Condense trailing whitespace and runs of blank lines into a single newline.
    text = re.sub(r"[\s]*\n+", "\n", text)

    return text


def clean_issue_body(issue):
    """
    Cleans and preprocesses the body field of the given issue.

    This function performs the following operations:
    1. Wraps fenced code fragments (```...```) within sentinel tokens to help the
       model recognize code blocks. The code itself is left untouched and stays
       in its original position within the body.
    2. Removes formatting elements from the remaining text: header markers,
       emphasis, inline-code backticks, HTML tags, markdown links/images
       (their text description is kept) and bare URLs.
    3. Condenses excessive blank lines in the remaining text.

    The issue is updated in place and returned, so it can be used as a step of
    `apply_steps_to_dataset`. An issue whose body is missing (None or not a
    string) is kept, with its body set to an empty string.
    """
    issue_body = issue.body

    # A missing body cleans to the empty string. The issue object itself must
    # still be returned: returning a bare string would put that string into the
    # dataset in place of the issue.
    if not isinstance(issue_body, str):
        issue.body = ""
        return issue

    # With one capturing group, re.split alternates between the text outside the
    # fences (even positions) and the code captured inside them (odd positions).
    fragments = re.split(r"```([\s\S]*?)```", issue_body)

    cleaned_fragments = []
    for position, fragment in enumerate(fragments):
        if position % 2 == 1:
            # Code: wrap in sentinels and leave the content as-is. Keeping code
            # out of the formatting clean-up also keeps the sentinels themselves
            # from being removed by the HTML-tag substitution.
            cleaned_fragments.append(CODE_BEGIN_SENTINEL + fragment + CODE_END_SENTINEL)
        else:
            cleaned_fragments.append(_clean_prose_fragment(fragment))

    # Assign the cleaned and processed content back to the issue body
    issue.body = "".join(cleaned_fragments)

    return issue


## Pre-Processing the Issue Datasets

In [30]:
# Shared pipeline: drop issues that do not meet the basic requirements or whose
# assignee is too infrequent, then clean the title and the body of the rest.
clean_dataset = apply_steps_to_dataset(
    processing_funcs=[
        filter_basic_trainingset_requirements,
        filter_unfrequent_assignees,
        clean_issue_title,
        clean_issue_body,
    ],
    dataset=issues,
)

# Split the cleaned issues by identifier range into the two training sets and the test set.
main_training_dataset = apply_steps_to_dataset(
    processing_funcs=[filter_main_training_dataset], dataset=clean_dataset
)
recent_issues_training_dataset = apply_steps_to_dataset(
    processing_funcs=[filter_recent_issues_training_dataset], dataset=clean_dataset
)
test_dataset = apply_steps_to_dataset(
    processing_funcs=[filter_test_dataset], dataset=clean_dataset
)

New dataset has 142391 issues
New dataset has 135926 issues
New dataset has 10299 issues
New dataset has 3135 issues


### Saving the results to a new file

In [31]:
def save_issue_repo(new_path, issue_repo):
    """
    Serialises the given issues and writes them to `new_path` as a gzip-compressed CSV.

    Every issue is converted with its `to_dict()` method (with a progress bar),
    the resulting records are gathered into a DataFrame, and the DataFrame is
    written without its index column.
    """
    print("Parsing collected issues.\nThis might take a few minutes")

    serialised_issues = [entry.to_dict() for entry in tqdm(issue_repo)]

    issues_frame = pd.DataFrame.from_dict(serialised_issues)
    issues_frame.to_csv(new_path, compression='gzip', index=False)

In [32]:
# Destination files for the two training sets and the test set.
MAIN_TRAINING_DESTINATION_PATH = "train_A.csv.gzip"
RECENT_TRAINING_DESTINATION_PATH = "train_B.csv.gzip"
TEST_DESTINATION_PATH = "test.csv.gzip"

# Write each dataset split to its own gzip-compressed CSV file.
save_issue_repo(new_path=MAIN_TRAINING_DESTINATION_PATH, issue_repo=main_training_dataset)
save_issue_repo(new_path=RECENT_TRAINING_DESTINATION_PATH, issue_repo=recent_issues_training_dataset)
save_issue_repo(new_path=TEST_DESTINATION_PATH, issue_repo=test_dataset)


Parsing collected issues.
This might take a few minutes


100%|██████████| 135926/135926 [00:00<00:00, 3065810.02it/s]


Parsing collected issues.
This might take a few minutes


100%|██████████| 10299/10299 [00:00<00:00, 3116226.87it/s]


Parsing collected issues.
This might take a few minutes


100%|██████████| 3135/3135 [00:00<00:00, 2475831.87it/s]
