In [3]:
DIFF_HUNK_MAX_LINES = 1000
ADDED_EDITED_REMOVED_METHODS_MAX_CHARACTERS = 2000
MAX_COMMENTS = 40

In [7]:
import re
from typing import Tuple

method_pattern = re.compile(r"^\s*[\+-](public|private|protected|static|\s)*\s+\w+\s+\w+\(.*\)\s*\{")

added_methods = []
removed_methods = []

diff_hunk = """
 public int calculateSum(int a, int b) {
     return a + b;
 }
 public int subtract(int x, int y) {
-    int result = x - y;
+    int result = x - y - 2;  // Changed subtraction logic
     return result;
 }
+public void updateValue() {
     this.value = 0;
 }
-public void removeValue() {
-     this.value = 0;
- }
 public double divide(int num, int denom) {
-    return num / denom;
+    return num / (double) denom;  // Changed to floating-point division
 }
"""

for line in diff_hunk.split('\n'):
    if method_pattern.match(line):
        if line[0] == '+':
            added_methods.append(line[1:].strip())
        elif line[0] == '-':
            removed_methods.append(line[1:].strip())
            
# create a method that takes and returns string of added methods, split by newlines and a string of removed methods, split by newlines
def extract_added_and_removed_methods(diff_hunk: str) -> Tuple[str, str]:
    added_methods = []
    removed_methods = []

    for line in diff_hunk.split('\n'):
        if method_pattern.match(line):
            if line[0] == '+':
                added_methods.append(line[1:].strip())
            elif line[0] == '-':
                removed_methods.append(line[1:].strip())

    return '\n'.join(added_methods)[:ADDED_EDITED_REMOVED_METHODS_MAX_CHARACTERS], '\n'.join(removed_methods)[:ADDED_EDITED_REMOVED_METHODS_MAX_CHARACTERS]


# Test the function
added, removed = extract_added_and_removed_methods(diff_hunk)
print(added)
print('removed:')
print(removed)

public void updateValue() {
removed:
public void removeValue() {


In [8]:
print('Added methods:')
for method in added_methods:
    print(method)

Added methods:
public void updateValue() {


In [9]:
print('Removed methods:')
for method in removed_methods:
    print(method)

Removed methods:
public void removeValue() {


In [10]:
diff_hunk = """
 public int calculateSum(int a, int b) {
     return a + b;
 }
 public int subtract(int x, int y) {
-    int result = x - y;
+    int result = x - y - 2;  // Changed subtraction logic
     return result;
 }
+public void updateValue() {
     this.value = 0;
 }
 public double divide(int num, int denom) {
-    return num / denom;
+    return num / (double) denom;  // Changed to floating-point division
 }
"""

# Regex to identify method entry and method signatures
method_entry_pattern = re.compile(r"^\s*(public|private|protected|static|\s)*\s+\w+\s+\w+\(.*\)\s*\{")
method_signature_pattern = re.compile(r"^\s*(public|private|protected|static|\s)*\s+\w+\s+\w+\(.*\)")

# Store edited methods
edited_methods = set()

# Track if currently inside a method
inside_method = False
current_method = None

# Process each line in the diff hunk
for line in diff_hunk.split('\n'):
    if method_entry_pattern.search(line):
        inside_method = True
        current_method = method_signature_pattern.search(line).group()
    elif line.startswith('}'):
        inside_method = False
        current_method = None
    elif inside_method and (line.startswith('+') or line.startswith('-')):
        if current_method:
            edited_methods.add(current_method.strip())

# Print results
# print("Edited Methods:")
# for method in edited_methods:
#     print(method)

# create a method that takes a string and returns a string that contains the method names, split by newlines
def extract_edited_methods(diff_hunk: str) -> str:
    method_entry_pattern = re.compile(r"^\s*(public|private|protected|static|\s)*\s+\w+\s+\w+\(.*\)\s*\{")
    method_signature_pattern = re.compile(r"^\s*(public|private|protected|static|\s)*\s+\w+\s+\w+\(.*\)")

    edited_methods = set()
    inside_method = False
    current_method = None

    for line in diff_hunk.split('\n'):
        if method_entry_pattern.search(line):
            inside_method = True
            current_method = method_signature_pattern.search(line).group()
        elif line.startswith('}'):
            inside_method = False
            current_method = None
        elif inside_method and (line.startswith('+') or line.startswith('-')):
            if current_method:
                edited_methods.add(current_method.strip())

    # make sure not to put every character on a newline, max 50 lines
    return '\n'.join(edited_methods)[:ADDED_EDITED_REMOVED_METHODS_MAX_CHARACTERS]

# Test the function
edited_methods = extract_edited_methods(diff_hunk)
print("Edited Methods:")
print(edited_methods)


Edited Methods:
public double divide(int num, int denom)
public int subtract(int x, int y)


In [14]:
import json
import requests

headers = {
    'Accept-Encoding': 'base64',
}


def get_comments(gerrit_change_id):
    response = requests.get(f'https://git.eclipse.org/r/changes/{gerrit_change_id}/comments', headers=headers)
    file_names = json.loads(response.content[4:-1])

    comment_list = []

    # for each filename in comments
    for filename in file_names:
        # for each comment in the filename
        for comment in file_names[filename]:
            comment_dict = {
                "filename": filename,
                "message": comment["message"],
                "updated": comment["updated"],
                "commit_id": comment["commit_id"],
                "unresolved": comment["unresolved"],
            }
            comment_list.append(comment_dict)
    
    # return a max of 30 comments
    return comment_list[:MAX_COMMENTS]

cmts = get_comments(144554)
cmts
    

[{'filename': 'org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/correction/JavaCorrectionProcessor.java',
  'message': 'space',
  'updated': '2019-09-29 11:48:01.000000000',
  'commit_id': 'cd31e1e9f750e9ecacd65bf412c0d883d6aeead5',
  'unresolved': False},
 {'filename': 'org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/correction/UnresolvedElementsSubProcessor.java',
  'message': 'space',
  'updated': '2019-09-29 11:48:01.000000000',
  'commit_id': 'cd31e1e9f750e9ecacd65bf412c0d883d6aeead5',
  'unresolved': False},
 {'filename': 'org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/java/JavaCompletionProposalComputer.java',
  'message': 'space',
  'updated': '2019-09-29 11:48:01.000000000',
  'commit_id': 'cd31e1e9f750e9ecacd65bf412c0d883d6aeead5',
  'unresolved': False},
 {'filename': 'org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/java/hover/AbstractAnnotationHover.java',
  'message': 'space',
  'updated': '2019-09-29 11:48:01.000000000',
  'commit_id': 'cd31e1e

In [15]:
import base64

# extract the decoded_content for a given PR gerrit_change_id
def get_diff_hunk(gerrit_change_id):
    headers = {
        'Accept-Encoding': 'base64',
    }

    b64_response = requests.get(f'https://git.eclipse.org/r/changes/{gerrit_change_id}/revisions/current/patch', headers=headers)
    base_64_content = b64_response.content

    decoded_content = base64.b64decode(base_64_content)
    # return the first DIFF_HUNK_MAX_LINES lines of the decoded content as a string and not array
    return '\n'.join(decoded_content.decode('utf-8').split('\n')[:DIFF_HUNK_MAX_LINES])

diff_hunk = get_diff_hunk(144554)
print(diff_hunk)

From 34cd20ef9b83878cedf9b90eba0244997c08fe34 Mon Sep 17 00:00:00 2001
From: Carsten Hammer <carsten.hammer@t-online.de>
Date: Thu, 20 Jun 2019 19:30:52 +0200
Subject: [PATCH] Bug 548309 - Use jdk 5 for-each loop (org.eclipse.jdt.ui)

org.eclipse.jdt.internal.ui.text
Replace simple uses of Iterator with a corresponding for-loop. Also add
missing braces on loops as necessary.

Change-Id: Iae14ebc756f1463098c86edc0d4e93e0127fff75
Signed-off-by: Carsten Hammer <carsten.hammer@t-online.de>
---

diff --git a/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/AbstractInformationControl.java b/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/AbstractInformationControl.java
index 2c61c8b..c000b50 100644
--- a/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/AbstractInformationControl.java
+++ b/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/AbstractInformationControl.java
@@ -154,9 +154,11 @@
 				Object[] children= contentProvider instanceof ITreePathContentProvider
 	

In [16]:
# make a call to the following endpoint and get its response: https://git.eclipse.org/r/changes/144554/detail
import json

headers = {
    'Accept': 'application/json',
}

response = requests.get('https://git.eclipse.org/r/changes/144554/detail', headers=headers)
response.content

def get_gerrit_change_info(gerrit_change_id):
    headers = {
        'Accept': 'application/json',
    }

    response = requests.get(f'https://git.eclipse.org/r/changes/{gerrit_change_id}/detail', headers=headers)

    if response.status_code != 200:
        print(f"Failed to get details for change {gerrit_change_id}. Status code: {response.status_code}")
        return None

    comments = get_comments(gerrit_change_id)
    diff_hunk = get_diff_hunk(gerrit_change_id)
    added_methods, removed_methods = extract_added_and_removed_methods(diff_hunk)
    edited_methods = extract_edited_methods(diff_hunk)

    response_dict = json.loads(response.content[4:-1])

    return {
        "id": response_dict["id"],
        "gerrit_change_id": gerrit_change_id,
        "project": response_dict["project"],
        "subject": response_dict["subject"],
        "created": response_dict["created"],
        "updated": response_dict["updated"],
        "status": response_dict["status"],
        "insertions": response_dict["insertions"],
        "deletions": response_dict["deletions"],
        "total_comment_count": response_dict["total_comment_count"],
        "unresolved_comment_count": response_dict["unresolved_comment_count"],
        "messages": [
            {
                "real_author": {"name": message["real_author"]["name"], "username": message["real_author"]["username"]} if "real_author" in message else None,
                "date": message["date"],
                "message": message["message"],
                "_revision_number": message["_revision_number"],
            }
            for message in response_dict["messages"]
        ],
        "comments": comments,
        "added_methods": added_methods,
        "removed_methods": removed_methods,
        "edited_methods": edited_methods,
        "diff_hunk": diff_hunk
    }

# Test the function
gerrit_change_id = 144554
change_info = get_gerrit_change_info(gerrit_change_id)
print(change_info)



In [17]:
def find_sets(obj, path=None):
    if path is None:
        path = [] 

    if isinstance(obj, dict):
        for key, value in obj.items():
            find_sets(value, path + [key])
    
    elif isinstance(obj, (list, tuple)):
        for index, item in enumerate(obj):
            find_sets(item, path + [index])
    
    elif isinstance(obj, set):
        print("Found a set at path:", " -> ".join(map(str, path)))

# Test the function
find_sets(change_info)

In [23]:
import csv

def save_to_csv(change_info, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)

        # Write the header row
        writer.writerow([
            "id",
            "gerrit_change_id",
            "project",
            "subject",
            "created",
            "updated",
            "status",
            "insertions",
            "deletions",
            "total_comment_count",
            "unresolved_comment_count",
            "messages",
            "comments",
            "added_methods",
            "removed_methods",
            "edited_methods",
            "diff_hunk"
        ])

        # Write the data rows
        for change in change_info:
            writer.writerow([
                change["id"],
                change["gerrit_change_id"],
                change["project"],
                change["subject"],
                change["created"],
                change["updated"],
                change["status"],
                change["insertions"],
                change["deletions"],
                change["total_comment_count"],
                change["unresolved_comment_count"],
                change["messages"],
                change["comments"],
                change["added_methods"],
                change["removed_methods"],
                change["edited_methods"],
                change["diff_hunk"]
            ])

In [24]:
gerrit_change_id = 8351
change_info2 = get_gerrit_change_info(gerrit_change_id)
change_info2

{'id': 'mylyn%2Forg.eclipse.mylyn.tasks~master~Ia4b44d557afcda8ae30c8b165a28e4af10b4d47d',
 'gerrit_change_id': 8351,
 'project': 'mylyn/org.eclipse.mylyn.tasks',
 'subject': '392688: TaskAttribute support for Unset',
 'created': '2012-10-23 20:18:41.000000000',
 'updated': '2013-06-24 19:08:26.000000000',
 'status': 'MERGED',
 'insertions': 70,
 'deletions': 1,
 'total_comment_count': 14,
 'unresolved_comment_count': 0,
 'messages': [{'real_author': {'name': 'CI Bot', 'username': 'hudsonvoter'},
   'date': '2012-10-23 21:42:37.000000000',
   'message': 'Patch Set 1:\n\nBuild Started https://hudson.eclipse.org/sandbox/job/mylyn-tasks-gerrit/181/ ',
   '_revision_number': 1},
  {'real_author': {'name': 'David Green', 'username': 'dgreen'},
   'date': '2012-10-23 21:53:55.000000000',
   'message': 'Patch Set 1: (1 inline comment)\n\n',
   '_revision_number': 1},
  {'real_author': {'name': 'Miles Parker', 'username': 'mparker'},
   'date': '2012-10-23 21:59:24.000000000',
   'message': 'P

In [25]:
import sys
import csv

def read_from_csv(filename):
    csv.field_size_limit(sys.maxsize)
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader)

        change_info_list = []
        for row in reader:
            change_info_list.append({
                "id": row[0],
                "gerrit_change_id": row[1],
                "project": row[2],
                "subject": row[3],
                "created": row[4],
                "updated": row[5],
                "status": row[6],
                "insertions": row[7],
                "deletions": row[8],
                "total_comment_count": row[9],
                "unresolved_comment_count": row[10],
                "messages": row[11],
                "comments": row[12],
                "added_methods": row[13],
                "removed_methods": row[14],
                "edited_methods": row[15],
                "diff_hunk": row[16]
            })

        return change_info_list

In [27]:
# get all distinct gerrit_change_id's from the gerrit_eclipse.csv, 

# Increase the field size limit
csv.field_size_limit(sys.maxsize)

def get_distinct_gerrit_change_ids(filename):
    with open(filename, 'r', encoding='utf-8', errors='ignore', newline='') as f:
        reader = csv.reader((line.replace('\0', '') for line in f))
        next(reader)

        distinct_ids = set()
        for row in reader:
            if row and row[2]:
                distinct_ids.add(row[2])

        return distinct_ids if distinct_ids else None
    
distinct_ids = get_distinct_gerrit_change_ids('gerrit_eclipse.csv')
distinct_ids

{'53165',
 '18841',
 '18308',
 '138003',
 '148490',
 '55360',
 '47546',
 '90489',
 '97212',
 '62764',
 '15376',
 '128057',
 '134288',
 '95599',
 '117039',
 '125754',
 '144213',
 '40317',
 '65998',
 '3796',
 '78835',
 '81366',
 '1930',
 '13289',
 '79396',
 '85332',
 '29446',
 '44157',
 '26530',
 '128683',
 '134824',
 '72147',
 '135791',
 '11673',
 '85250',
 '101571',
 '111095',
 '131561',
 '148707',
 '148738',
 '84295',
 '70615',
 '23160',
 '99786',
 '86320',
 '111911',
 '8196',
 '72759',
 '159699',
 '3280',
 '19824',
 '101345',
 '7633',
 '11527',
 '62347',
 '106008',
 '136111',
 '4179',
 '157664',
 '159808',
 '160406',
 '110344',
 '31958',
 '23145',
 '20812',
 '73902',
 '53998',
 '84477',
 '3790',
 '80271',
 '117186',
 '14417',
 '23432',
 '54987',
 '91190',
 '89631',
 '153906',
 '68559',
 '32019',
 '105612',
 '84446',
 '23339',
 '67764',
 '12819',
 '30856',
 '14446',
 '13564',
 '89422',
 '21817',
 '134577',
 '154300',
 '123468',
 '63934',
 '10945',
 '49944',
 '70840',
 '132638',
 '1356

In [28]:
# count number of entries in distinct_ids
len(distinct_ids)

14883

In [53]:
# get all the additional info for the dataset and save it into a csv
change_info_list = []

for i, distinct_id in enumerate(distinct_ids):
    # if an error occurs, print the error message and continue with the next distinct_id
    try:
        change_info = get_gerrit_change_info(distinct_id)
    except Exception as e:
        print(f"Error processing {distinct_id}: {e}")
        continue

    change_info_list.append(change_info)

    if(i % 100 == 0):
        print(f"Processed {i + 1} of {len(distinct_ids)} distinct IDs ({(i + 1) / len(distinct_ids) * 100:.2f}%)")

# remove all None entries from change_info_list
change_info_list = [change_info for change_info in change_info_list if change_info is not None]

# save to CSV
save_to_csv(change_info_list, 'dataset.csv')

Processed 1 of 14883 distinct IDs (0.01%)
Failed to get details for change 1224. Status code: 404
Failed to get details for change 31081. Status code: 404
Failed to get details for change 70507. Status code: 404
Error processing 13039: 'name'
Failed to get details for change 4966. Status code: 404
Error processing 98814: 'utf-8' codec can't decode byte 0xc9 in position 384107: invalid continuation byte
Failed to get details for change 104306. Status code: 404
Failed to get details for change 44864. Status code: 404
Failed to get details for change 1578. Status code: 404
Failed to get details for change 22456. Status code: 404
Failed to get details for change 2758. Status code: 404
Failed to get details for change 3484. Status code: 404
Failed to get details for change 8204. Status code: 404
Failed to get details for change 134218. Status code: 404
Error processing 84333: 'username'
Failed to get details for change 2755. Status code: 404
Failed to get details for change 3713. Status cod

In [54]:
len(change_info_list)

10663

In [55]:
change_info_list[10]

{'id': 'osee%2Forg.eclipse.osee~0.11.1~I283aa38761ad0678c6d2a36f646948cfe2bc0e1c',
 'gerrit_change_id': '13665',
 'project': 'osee/org.eclipse.osee',
 'subject': 'bug[ats_V5JWA]: Publish with Diff BLAM not working',
 'created': '2013-06-09 04:40:04.000000000',
 'updated': '2013-06-24 19:08:26.000000000',
 'status': 'MERGED',
 'insertions': 417,
 'deletions': 83,
 'total_comment_count': 8,
 'unresolved_comment_count': 0,
 'messages': [{'real_author': {'name': 'Marc Potter',
    'username': 'mpotterc0k'},
   'date': '2013-06-12 14:24:12.000000000',
   'message': "Patch Set 1: I would prefer that you didn't submit this\n\n(3 inline comments)\n\n",
   '_revision_number': 1},
  {'real_author': {'name': 'Megumi Telles', 'username': 'mtelles'},
   'date': '2013-06-17 16:45:57.000000000',
   'message': "Patch Set 1: I would prefer that you didn't submit this; IP review completed\n\n(5 inline comments)\n\n",
   '_revision_number': 1},
  {'real_author': {'name': 'Mark Joy', 'username': 'mjoy'},
