In [53]:
import re

import pandas as pd

from issue import Issue
from tqdm import tqdm

In [54]:
DATA_FILE_PATH = "vscode_no_prs.csv.gzip" # "../teste.csv.gzip" # "../vscode_issues_SA.csv.gzip"

# Read the gzip-compressed CSV export; each row is turned into a plain dict below.
sample_dataset = pd.read_csv(DATA_FILE_PATH, compression='gzip', lineterminator='\n')

issues_dict_list = sample_dataset.to_dict('records')

issues = []
test_dataset = []

# Maps assignee -> number of loaded issues assigned to them.
# NOTE: despite the name, the value is an issue count taken from
# `Issue.assignee`, not a number of commits.
commit_no_by_author = {}

# Rows that raised while being parsed or counted. The load stays best-effort,
# but the failures are reported after the loop instead of being silently dropped.
load_error_count = 0
first_load_error = None

for issue_dict in tqdm(issues_dict_list):
    try:
        new_issue = Issue.from_dict(issue_dict)
        issues.append(new_issue)

        author = new_issue.assignee
        if author:  # Skip issues without an assignee
            commit_no_by_author[author] = commit_no_by_author.get(author, 0) + 1
    except Exception as e:
        # Keep going, but remember that something went wrong.
        load_error_count += 1
        if first_load_error is None:
            first_load_error = e

if load_error_count:
    print(
        str(load_error_count)
        + " record(s) raised an error while being loaded or counted; first error: "
        + repr(first_load_error)
    )

100%|██████████| 184819/184819 [00:18<00:00, 9744.06it/s] 


In [55]:
# Quick sanity check of the load: number of parsed issues, number of distinct
# assignees, and the full assignee -> issue-count mapping (one item per line).
print(len(issues), len(commit_no_by_author), commit_no_by_author, sep="\n")

184812
116
{'mjbvz': 15630, 'rzhao271': 1423, 'lszomoru': 3903, 'roblourens': 9410, 'alexr00': 3423, 'sandy081': 8016, 'aiday-mar': 832, 'meganrogge': 5018, 'deepak1556': 4121, 'Tyriar': 12408, 'jrieken': 9073, 'bpasero': 11117, 'joshspicer': 42, 'justschen': 389, 'benibenj': 601, 'andreamah': 1163, 'alexdima': 7050, 'ulugbekna': 431, 'hediet': 2520, 'chrmarti': 1819, 'joaomoreno': 9579, 'connor4312': 2725, 'lramos15': 1832, 'TylerLeonhardt': 2089, 'karthiknadig': 25, 'rebornix': 5550, 'bhavyaus': 406, 'aeschli': 7067, 'Yoyokrazy': 302, 'eleanorjboyd': 32, 'amunger': 323, 'anthonykim1': 10, 'sbatten': 2510, 'joyceerhl': 479, 'isidorn': 7503, 'daviddossett': 338, 'DonJayamanne': 280, 'dbaeumer': 1879, 'bamurtaugh': 10, 'ntrogh': 3, 'chrisdias': 293, 'kieferrm': 314, 'hbons': 86, 'karrtikr': 15, 'cwebster-99': 1, 'digitarald': 79, 'brettcannon': 10, 'MeghanKulkarni': 17, 'paulacamargo25': 7, 'gregvanl': 102, 'danyeh': 122, 'esonnino': 7, 'csigs': 5, 'Chuxel': 6, 'weinand': 2712, 'egamma'

In [56]:
def apply_steps_to_dataset(processing_funcs, dataset):
    """
    Run every issue of `dataset` through a pipeline of processing functions.

    The functions in `processing_funcs` are applied in the given order, each
    one receiving the result of the previous one. A function signals that an
    issue must be filtered out by returning None; the remaining functions are
    then skipped for that issue and it is left out of the result.

    Prints the size of the resulting dataset and returns it as a new list.
    """

    def _run_pipeline(item):
        # Thread the item through each step, stopping at the first None.
        for step in processing_funcs:
            item = step(item)
            if item is None:
                return None
        return item

    # Lazy generator: each issue goes through the whole pipeline before the
    # next one is started, exactly like a nested loop would do.
    processed = (_run_pipeline(entry) for entry in dataset)
    kept = [entry for entry in processed if entry is not None]

    print("New dataset has " + str(len(kept)) + " issues")

    return kept

### Pre-Processing Steps

In [57]:
def filter_test_dataset(issue):
    """
    Keep only issues that belong to the test set.

    The test set is defined by the issue id: 210000 < id <= 220000.
    Returns the issue when its id is in that range, None otherwise.
    """
    in_test_range = 210000 < int(issue.identifier) <= 220000
    return issue if in_test_range else None

In [58]:
def filter_main_training_dataset(issue):
    """
    Keep only issues that belong to the larger training set.

    The larger training set is defined by the issue id: id <= 210000.
    Returns the issue when its id is in that range, None otherwise.
    """
    if int(issue.identifier) > 210000:
        return None
    return issue

In [59]:
def filter_recent_issues_training_dataset(issue):
    """
    Keep only issues that belong to the recent-issues training set.

    That set is defined by the issue id: 190000 <= id <= 210000.
    Returns the issue when its id is in that range, None otherwise.
    """
    numeric_id = int(issue.identifier)
    too_old = numeric_id < 190000
    too_new = numeric_id > 210000
    if too_old or too_new:
        return None
    return issue

In [60]:
def filter_basic_trainingset_requirements(issue):
    """
    Keep only issues that meet the basic training-set requirements:
        (i) the issue is closed (it has a completion time);
        (ii) the issue has exactly one assignee.

    Returns the issue when both requirements hold, None otherwise.
    """
    # (i) An issue without a completion time is still open.
    if issue.completion_time is None:
        return None

    # (ii) Reject a missing assignee, and any assignee list that does not
    # hold exactly one entry. A non-list assignee is accepted as is.
    assignee = issue.assignee
    if assignee is None:
        return None
    if isinstance(assignee, list) and len(assignee) != 1:
        return None

    return issue

In [61]:
def filter_unfrequent_commiters(issue, threshold=1, commit_counts=None):
    """
    Filters out issues whose assignee appears too rarely in the dataset.

    The counts are looked up per assignee. By default they come from the
    module-level `commit_no_by_author` dictionary, which is filled while the
    issues are loaded with one increment per issue assigned to that person
    (so, despite the name, it holds issue counts rather than commit counts).

    Parameters:
        issue: object exposing an `assignee` attribute.
        threshold: assignees whose count is less than or equal to this value
            are filtered out. Defaults to 1, the value previously hard-coded.
        commit_counts: optional mapping assignee -> count to use instead of
            the module-level `commit_no_by_author`.

    Returns:
        The issue itself when its assignee has more than `threshold` issues,
        None when the assignee is missing, unknown, or too infrequent.
    """
    # Resolve the mapping at call time so the global is only needed when the
    # caller did not supply one.
    counts = commit_no_by_author if commit_counts is None else commit_counts

    author = issue.assignee

    # Unknown assignees count as zero occurrences.
    if author is None or counts.get(author, 0) <= threshold:
        return None

    return issue

In [62]:
def clean_issue_title(issue):
    """
    Cleans the title (the `summary` field) of the given issue.

    Two substitutions are applied, in order:
      1. mentions of other issues ("Follow up to #123", optionally wrapped
         in square brackets) are removed;
      2. monospace markdown formatting is dropped, keeping the text that was
         between the backticks.

    The issue is modified in place and returned.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    title_rewrites = (
        (r"\[?\s*[Ff]ollow up to #?[\d]+\s*\]?", ""),
        (r"`([\s\S]*?)`", r"\1"),
    )

    cleaned = issue.summary
    for pattern, replacement in title_rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)

    issue.summary = cleaned
    return issue


In [63]:
## BE CAREFUL IF ALTERING THESE CONSTANTS.
# They should be the same used for training the model in the training notebook (.ipynb)
CODE_BEGIN_SENTINEL = "<BoC>"
CODE_END_SENTINEL = "<EoC>"

def clean_issue_body(issue):
    """
    Cleans the body field of the given issue.

    Fenced code fragments (``` ... ```) are pulled out of the text, wrapped
    in the sentinel tokens shared with the training notebook, and placed at
    the start of the cleaned body, one fragment per line. The remaining text
    is stripped of markdown headers, emphasis markers, inline-code backticks,
    html tags, markdown links/images and bare URLs, and runs of blank lines
    are collapsed.

    The issue is modified in place and returned, so that the function can be
    used as a step of `apply_steps_to_dataset`. An issue whose body is None
    gets an empty string as body.
    """

    issue_body = issue.body

    if issue_body is None:
        # A pipeline step must hand back the issue (or None to drop it);
        # returning a bare string here would put a str into the dataset.
        issue.body = ""
        return issue

    new_body = issue_body

    #  TODO: Dont just isolate all fragments, but preserve
    # their place in the issue body, surrounding them with
    # the sentinel tokens

    ## Note code fragments
    code_fragments = "".join(
        CODE_BEGIN_SENTINEL + fragment + CODE_END_SENTINEL + "\n"
        for fragment in re.findall(r"```([\s\S]*?)```", new_body)
    )

    new_body = re.sub(r"```([\s\S]*?)```", "", new_body)

    # Remove headers
    new_body = re.sub("#+ ", "", new_body)

    # Removing emphasis might interfere with code fragments
    # Watchout if you want to fill the TODO, as these lines
    # will have to be taken care of

    new_body = re.sub(r"_([\s\S]*?)_", r"\1", new_body)
    new_body = re.sub(r"\*([\s\S]*?)\*", r"\1", new_body)
    # Inline code: keep the text, drop the backticks (same pattern as the
    # one used for titles).
    new_body = re.sub(r"`([\s\S]*?)`", r"\1", new_body)

    # Remove html tags
    new_body = re.sub(r"<[\s\S]*?>", r"", new_body)

    # Remove markdown links. The link text may not contain "]" and the target
    # may not contain ")" or whitespace, so each link is matched on its own
    # and the text between two links is preserved.
    new_body = re.sub(r"\!?\[[^\]]+\]\([^\s)]+\)", "", new_body)

    # Remove attachments
    new_body = re.sub(r"https?://[\S]+", "", new_body)

    # Collapse trailing whitespace and consecutive line breaks
    new_body = re.sub(r"[\s]*\n+", "\n", new_body)

    issue.body = code_fragments + new_body

    # Return the modified issue object
    return issue


## Pre-Processing the Issue Datasets

In [64]:
# First pass: drop the issues that cannot be used for training and clean the
# title and body of the remaining ones.
clean_dataset = apply_steps_to_dataset(
    [
        filter_basic_trainingset_requirements,
        filter_unfrequent_commiters,
        clean_issue_title,
        clean_issue_body,
    ],
    issues,
)

# Second pass: split the cleaned issues by id range into the two training
# sets and the test set.
main_training_dataset = apply_steps_to_dataset(
    [filter_main_training_dataset], clean_dataset
)
recent_issues_training_dataset = apply_steps_to_dataset(
    [filter_recent_issues_training_dataset], clean_dataset
)
test_dataset = apply_steps_to_dataset(
    [filter_test_dataset], clean_dataset
)

New dataset has 142391 issues
New dataset has 135926 issues
New dataset has 10299 issues
New dataset has 3135 issues


### Saving the results to a new file

In [65]:
def save_issue_repo(new_path, issue_repo):
    """
    Writes a collection of issues to a gzip-compressed CSV file.

    Every issue of `issue_repo` is converted with its `to_dict()` method
    (with a progress bar), the resulting records are gathered in a
    DataFrame, and the DataFrame is saved to `new_path` without the index.
    """
    print("Parsing collected issues.\nThis might take a few minutes")

    serialized_issues = [entry.to_dict() for entry in tqdm(issue_repo)]

    pd.DataFrame.from_dict(serialized_issues).to_csv(
        new_path, compression='gzip', index=False
    )

In [67]:
MAIN_TRAINING_DESTINATION_PATH = "train_A.csv.gzip"
RECENT_TRAINING_DESTINATION_PATH = "train_B.csv.gzip"
TEST_DESTINATION_PATH = "test.csv.gzip"

# Write each split to its own file, in the order: main training set,
# recent-issues training set, test set.
for destination_path, issue_subset in (
    (MAIN_TRAINING_DESTINATION_PATH, main_training_dataset),
    (RECENT_TRAINING_DESTINATION_PATH, recent_issues_training_dataset),
    (TEST_DESTINATION_PATH, test_dataset),
):
    save_issue_repo(destination_path, issue_subset)


Parsing collected issues.
This might take a few minutes


100%|██████████| 135926/135926 [00:00<00:00, 3421771.07it/s]


Parsing collected issues.
This might take a few minutes


100%|██████████| 10299/10299 [00:00<00:00, 4049605.03it/s]


Parsing collected issues.
This might take a few minutes


100%|██████████| 3135/3135 [00:00<00:00, 3279088.04it/s]
