# Redo graph.pkl

In [1]:
from src.inspector_git import IGLogReader, GitLogDTO
from src.jira_miner.models import JsonFileFormatJira
from src.github_miner import JsonFileFormatGithub
import json
from pathlib import Path
from pprint import pprint
import re
from datetime import datetime
import pickle
from graph import *

# JSON
path_jira = "/home/alex/Work/BachelorThesis/Vortex/test-input/jira-miner/ZEPPELIN-detailed-issues.json"
path_github = "/home/alex/Work/BachelorThesis/Vortex/test-input/github-miner/githubProject.json"
# IGLOG
path_inspector_git = "/home/alex/Work/BachelorThesis/Vortex/test-input/inspector-git/zeppelin.iglog"



def load_from_json(model_cls, file_path: str):
    """Load JSON from a given path and validate it with the provided model class."""
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    with file_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    return model_cls.model_validate(data)

def load_jsons():
    jira_data_loaded = load_from_json(JsonFileFormatJira, path_jira)
    github_data_loaded = load_from_json(JsonFileFormatGithub, path_github)
    return jira_data_loaded, github_data_loaded

IGLogReader = IGLogReader()



jira_data, github_data = load_jsons()
inspector_git_data = IGLogReader.read(path_inspector_git)
graph = Graph()

def add_inspector_git_data(graph:Graph, inspector_git_data: GitLogDTO):
    git_date_format = "%a %b %d %H:%M:%S %Y %z"

    for commitDTO in inspector_git_data.commits:
        commit = GitCommit(
            sha=commitDTO.id,
            message=commitDTO.message,
            author_date=datetime.strptime(commitDTO.author_date, git_date_format),
            committer_date=datetime.strptime(commitDTO.committer_date, git_date_format)
        )
        graph.add_commit(commit)

        author = GitUser(email=commitDTO.author_email, name=commitDTO.author_name)
        committer = GitUser(email=commitDTO.committer_email, name=commitDTO.committer_name)
        graph.add_user_git(author)
        graph.add_user_git(committer)

        author_edge = GitCommitGitUserEdge(commit=commit, git_user=author, role="author")
        committer_edge = GitCommitGitUserEdge(commit=commit, git_user=committer, role="committer")
        graph.add_edge(author_edge)
        graph.add_edge(committer_edge)

        for change in commitDTO.changes:
            file = File(path = change.new_file_name)
            graph.add_file(file = file , old_name = change.old_file_name)

            file_commit_edge = GitCommitFileEdge(commit=commit, file=file)
            file_writer_edge = GitUserFileEdge(git_user=committer, file=file, role="writer")
            graph.add_edge(file_commit_edge)
            graph.add_edge(file_writer_edge)

            if author.email != committer.email:
                file_reviewer_edge = GitUserFileEdge(git_user=author, file=file, role="reviewer")
                graph.add_edge(file_reviewer_edge)

add_inspector_git_data(graph, inspector_git_data)

def add_jira_data(graph:Graph, jira_data: JsonFileFormatJira):
    def add_issue_statuses():
        for status in jira_data.issueStatuses:
            category = IssueStatusCategory(
                key=status.statusCategory.key,
                name=status.statusCategory.name
            )
            status = IssueStatus(
                id=status.id,
                name=status.name,
            )

            graph.add_issue_status_category(category)
            graph.add_issue_status(status)

            edge = IssueStatusIssueStatusCategoryEdge(
                issue_status=status,
                issue_status_category=category,
            )
            graph.add_edge(edge)

    def add_issue_types():
        for issue_type in jira_data.issueTypes:
            issue_type = IssueType(
                id=issue_type.id,
                name=issue_type.name,
                description=issue_type.description,
                isSubTask=issue_type.isSubTask,
            )
            graph.add_issue_type(issue_type)

    def add_users():
        for user in jira_data.users:
            jira_user = JiraUser(
                key=user.key,
                name=user.name,
                link=user.self_,
            )
            graph.add_jira_user(jira_user)

    def add_issues():
        for issue in jira_data.issues:
            i = Issue(
                id=issue.id,
                key=issue.key,
                summary=issue.summary,
                createdAt=issue.created,
                updatedAt=issue.updated,
            )
            graph.add_issue(i)

            issue_status = graph.get_issue_status(issue.status.id)
            issue_issue_status_edge = IssueIssueStatusEdge(
                issue_status=issue_status,
                issue=i
            )
            graph.add_edge(issue_issue_status_edge)

            issue_type = graph.get_issue_type(issue.issueType)
            issue_issue_type_edge = IssueIssueTypeEdge(
                issue_type=issue_type,
                issue=i
            )
            graph.add_edge(issue_issue_type_edge)

            reporter = graph.get_jira_user(issue.reporterId)
            reporter_edge = IssueJiraUserEdge(
                jira_user=reporter,
                issue=i,
                role= "reporter"
            )
            graph.add_edge(reporter_edge)
            if issue.creatorId is not None:
                creator = graph.get_jira_user(issue.creatorId)
                creator_edge = IssueJiraUserEdge(
                jira_user=creator,
                issue=i,
                role= "creator"
                )
                graph.add_edge(creator_edge)

            if issue.assigneeId is not None:
                assignee = graph.get_jira_user(issue.assigneeId)
                assignee_edge = IssueJiraUserEdge(
                    jira_user=assignee,
                    issue=i,
                    role= "assignee"
                )
                graph.add_edge(assignee_edge)

    def add_edge_if_absent(graph, edge: IssueIssueEdge):
        """Add an IssueIssueEdge only if it doesn't already exist."""
        existing_edges = graph.adjacency.get(edge.child.dict_key(), {}).get("issues", [])
        if not any(
            isinstance(e, IssueIssueEdge) and e.normalized_key() == edge.normalized_key()
            for e in existing_edges
        ):
            graph.add_edge(edge)

    def make_issue_parent_connections():
        for jira_issue in jira_data.issues:
            current_issue = graph.get_issue(jira_issue.key)

            # 🔹 Connect to parent
            if jira_issue.parent is not None:
                parent_issue = graph.get_issue(jira_issue.parent)
                edge = IssueIssueEdge(child=current_issue, parent=parent_issue)
                add_edge_if_absent(graph, edge)

            # 🔹 Connect to subtasks
            for subtask_id in jira_issue.subTasks or []:
                child_issue = graph.get_issue(subtask_id)
                edge = IssueIssueEdge(child=child_issue, parent=current_issue)
                add_edge_if_absent(graph, edge)


    add_issue_statuses()
    add_issue_types()
    add_users()
    add_issues()
    make_issue_parent_connections()

add_jira_data(graph, jira_data)

def add_github_data(graph:Graph, github_data: JsonFileFormatGithub):
    a = 0
    cre = 0
    m = 0
    for pr in github_data.pullRequests:
        # ADD PR
        pull_request = PullRequest(
            number=pr.number,
            title=pr.title,
            state=pr.state,
            changedFiles=pr.changedFiles,
            createdAt=pr.createdAt,
            updatedAt=pr.updatedAt,
            body=pr.body,
            mergedAt=pr.mergedAt,
            closedAt=pr.closedAt,
        )
        graph.add_pull_request(pull_request)

        # ADD ALL USERS
        for assignee in pr.assignees:
            assignee_git_hub_user = GitHubUser(
                url=assignee.url,
                login=assignee.login,
                name=assignee.name,
            )
            graph.add_git_hub_user(assignee_git_hub_user)

            edge = PullRequestGitHubUserEdge(
                pr = pull_request,
                git_hub_user= assignee_git_hub_user,
                role = "assignee"
            )
            graph.add_edge(edge)
            a += 1

        if pr.createdBy:
            creator_git_hub_user = GitHubUser(
                url=pr.createdBy.url,
                login=pr.createdBy.login,
                name=pr.createdBy.name,
            )
            graph.add_git_hub_user(creator_git_hub_user)

            edge = PullRequestGitHubUserEdge(
                pr = pull_request,
                git_hub_user= creator_git_hub_user,
                role = "creator"
            )
            graph.add_edge(edge)
            cre += 1

        if pr.mergedBy:
            merger_git_hub_user = GitHubUser(
                name=pr.mergedBy.name,
                url=pr.mergedBy.url,
                login=pr.mergedBy.login,
            )
            graph.add_git_hub_user(merger_git_hub_user)

            edge = PullRequestGitHubUserEdge(
                pr = pull_request,
                git_hub_user= merger_git_hub_user,
                role = "merger"
            )
            graph.add_edge(edge)
            m += 1

        # ADD ALL COMMITS
        for c in pr.commits:
            commit = GitHubCommit(
                sha=c.sha,
                date=c.date,
                message=c.message,
                changedFiles=c.changedFiles,
            )
            graph.add_git_hub_commit(commit)

            edge = PullRequestGitHubCommitEdge(
                commit = commit,
                pr = pull_request,
            )
            graph.add_edge(edge)

add_github_data(graph, github_data)

def save_pickle(obj, var_name: str, base_dir: str = "pickle_data") -> Path:
    """
    Save a Python object using pickle in a directory relative to the notebook.

    Args:
        obj: The Python object to save.
        var_name: Name of the variable (used as filename).
        base_dir: Relative directory to save pickles (default: "pickle_data").

    Returns:
        Path to the saved pickle file.
    """
    # Create the directory if it doesn't exist
    save_dir = Path(base_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    # Build full path
    pickle_path = save_dir / f"{var_name}.pkl"

    # Save object
    with open(pickle_path, "wb") as f:
        pickle.dump(obj, f)

    print(f"Saved {var_name} to {pickle_path}")
    return pickle_path

# Save your loaded data
save_pickle(graph, "graph")

Saved graph to pickle_data/graph.pkl


PosixPath('pickle_data/graph.pkl')

# Imports

In [1]:
from src.inspector_git import IGLogReader, GitLogDTO
from src.jira_miner.models import JsonFileFormatJira
from src.github_miner import JsonFileFormatGithub
import json
from pathlib import Path
from pprint import pprint
import re
from datetime import datetime
import pickle
from graph import *

In [2]:
def load_pickle(pickle_file: str):
    with open(pickle_file, "rb") as f:
        return pickle.load(f)

graph = load_pickle("pickle_data/graph.pkl")

# Quick check
print("Graph data type:", type(graph))
print(graph)

Graph data type: <class 'graph.Graph'>
~~~~ Graph summary ~~~~
commits: 5512
git_users: 602
files: 5644

issue_statuses: 48
issue_types: 7
issue_status_categories: 3
jira_users: 2008
issues: 6202

pull_requests: 5022
git_hub_users: 690
git_hub_commits: 17869

edges: 177967


# Load data from SERIALIZED formats

In [2]:
# JSON
path_jira = "/home/vortex/Work/BachelorThesis/Vortex/test-input/jira-miner/ZEPPELIN-detailed-issues.json"
path_github = "/home/vortex/Work/BachelorThesis/Vortex/test-input/github-miner/githubProject.json"
# IGLOG
path_inspector_git = "/home/vortex/Work/BachelorThesis/Vortex/test-input/inspector-git/zeppelin.iglog"



def load_from_json(model_cls, file_path: str):
    """Load JSON from a given path and validate it with the provided model class."""
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    with file_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    return model_cls.model_validate(data)

def load_jsons():
    jira_data_loaded = load_from_json(JsonFileFormatJira, path_jira)
    github_data_loaded = load_from_json(JsonFileFormatGithub, path_github)
    return jira_data_loaded, github_data_loaded

IGLogReader = IGLogReader()



jira_data, github_data = load_jsons()
inspector_git_data = IGLogReader.read(path_inspector_git)
graph = Graph()

# Add data from inspector Git

In [3]:
def add_inspector_git_data(graph:Graph, inspector_git_data: GitLogDTO):
    git_date_format = "%a %b %d %H:%M:%S %Y %z"

    for commitDTO in inspector_git_data.commits:
        commit = GitCommit(
            sha=commitDTO.id,
            message=commitDTO.message,
            author_date=datetime.strptime(commitDTO.author_date, git_date_format),
            committer_date=datetime.strptime(commitDTO.committer_date, git_date_format)
        )
        graph.add_commit(commit)

        author = GitUser(email=commitDTO.author_email, name=commitDTO.author_name)
        committer = GitUser(email=commitDTO.committer_email, name=commitDTO.committer_name)
        graph.add_user_git(author)
        graph.add_user_git(committer)

        author_edge = GitCommitGitUserEdge(commit=commit, git_user=author, role="author")
        committer_edge = GitCommitGitUserEdge(commit=commit, git_user=committer, role="committer")
        graph.add_edge(author_edge)
        graph.add_edge(committer_edge)

        for change in commitDTO.changes:
            file = File(path = change.new_file_name)
            graph.add_file(file = file , old_name = change.old_file_name)

            file_commit_edge = GitCommitFileEdge(commit=commit, file=file)
            file_writer_edge = GitUserFileEdge(git_user=committer, file=file, role="writer")
            graph.add_edge(file_commit_edge)
            graph.add_edge(file_writer_edge)

            if author.email != committer.email:
                file_reviewer_edge = GitUserFileEdge(git_user=author, file=file, role="reviewer")
                graph.add_edge(file_reviewer_edge)

add_inspector_git_data(graph, inspector_git_data)

print(graph)

~~~~ Graph summary ~~~~
commits: 5512
git_users: 602
files: 5644

issue_statuses: 0
issue_types: 0
issue_status_categories: 0
jira_users: 0
issues: 0

pull_requests: 0
git_hub_users: 0
git_hub_commits: 0

edges: 100670


# Adding data from Jira to the graph

In [4]:
def add_jira_data(graph:Graph, jira_data: JsonFileFormatJira):
    def add_issue_statuses():
        for status in jira_data.issueStatuses:
            category = IssueStatusCategory(
                key=status.statusCategory.key,
                name=status.statusCategory.name
            )
            status = IssueStatus(
                id=status.id,
                name=status.name,
            )

            graph.add_issue_status_category(category)
            graph.add_issue_status(status)

            edge = IssueStatusIssueStatusCategoryEdge(
                issue_status=status,
                issue_status_category=category,
            )
            graph.add_edge(edge)

    def add_issue_types():
        for issue_type in jira_data.issueTypes:
            issue_type = IssueType(
                id=issue_type.id,
                name=issue_type.name,
                description=issue_type.description,
                isSubTask=issue_type.isSubTask,
            )
            graph.add_issue_type(issue_type)

    def add_users():
        for user in jira_data.users:
            jira_user = JiraUser(
                key=user.key,
                name=user.name,
                link=user.self_,
            )
            graph.add_jira_user(jira_user)

    def add_issues():
        for issue in jira_data.issues:
            i = Issue(
                id=issue.id,
                key=issue.key,
                summary=issue.summary,
                createdAt=issue.created,
                updatedAt=issue.updated,
            )
            graph.add_issue(i)

            issue_status = graph.get_issue_status(issue.status.id)
            issue_issue_status_edge = IssueIssueStatusEdge(
                issue_status=issue_status,
                issue=i
            )
            graph.add_edge(issue_issue_status_edge)

            issue_type = graph.get_issue_type(issue.issueType)
            issue_issue_type_edge = IssueIssueTypeEdge(
                issue_type=issue_type,
                issue=i
            )
            graph.add_edge(issue_issue_type_edge)

            reporter = graph.get_jira_user(issue.reporterId)
            reporter_edge = IssueJiraUserEdge(
                jira_user=reporter,
                issue=i,
                role= "reporter"
            )
            graph.add_edge(reporter_edge)
            if issue.creatorId is not None:
                creator = graph.get_jira_user(issue.creatorId)
                creator_edge = IssueJiraUserEdge(
                jira_user=creator,
                issue=i,
                role= "creator"
                )
                graph.add_edge(creator_edge)

            if issue.assigneeId is not None:
                assignee = graph.get_jira_user(issue.assigneeId)
                assignee_edge = IssueJiraUserEdge(
                    jira_user=assignee,
                    issue=i,
                    role= "assignee"
                )
                graph.add_edge(assignee_edge)

    def add_edge_if_absent(graph, edge: IssueIssueEdge):
        """Add an IssueIssueEdge only if it doesn't already exist."""
        existing_edges = graph.adjacency.get(edge.child.dict_key(), {}).get("issues", [])
        if not any(
            isinstance(e, IssueIssueEdge) and e.normalized_key() == edge.normalized_key()
            for e in existing_edges
        ):
            graph.add_edge(edge)

    def make_issue_parent_connections():
        for jira_issue in jira_data.issues:
            current_issue = graph.get_issue(jira_issue.key)

            # 🔹 Connect to parent
            if jira_issue.parent is not None:
                parent_issue = graph.get_issue(jira_issue.parent)
                edge = IssueIssueEdge(child=current_issue, parent=parent_issue)
                add_edge_if_absent(graph, edge)

            # 🔹 Connect to subtasks
            for subtask_id in jira_issue.subTasks or []:
                child_issue = graph.get_issue(subtask_id)
                edge = IssueIssueEdge(child=child_issue, parent=current_issue)
                add_edge_if_absent(graph, edge)


    add_issue_statuses()
    add_issue_types()
    add_users()
    add_issues()
    make_issue_parent_connections()

add_jira_data(graph, jira_data)

print(graph)

~~~~ Graph summary ~~~~
commits: 5512
git_users: 602
files: 5644

issue_statuses: 48
issue_types: 7
issue_status_categories: 3
jira_users: 2008
issues: 6202

pull_requests: 0
git_hub_users: 0
git_hub_commits: 0

edges: 129581


# Add data from GitHub

In [5]:
def add_github_data(graph:Graph, github_data: JsonFileFormatGithub):
    a = 0
    cre = 0
    m = 0
    for pr in github_data.pullRequests:
        # ADD PR
        pull_request = PullRequest(
            number=pr.number,
            title=pr.title,
            state=pr.state,
            changedFiles=pr.changedFiles,
            createdAt=pr.createdAt,
            updatedAt=pr.updatedAt,
            body=pr.body,
            mergedAt=pr.mergedAt,
            closedAt=pr.closedAt,
        )
        graph.add_pull_request(pull_request)

        # ADD ALL USERS
        for assignee in pr.assignees:
            assignee_git_hub_user = GitHubUser(
                url=assignee.url,
                login=assignee.login,
                name=assignee.name,
            )
            graph.add_git_hub_user(assignee_git_hub_user)

            edge = PullRequestGitHubUserEdge(
                pr = pull_request,
                git_hub_user= assignee_git_hub_user,
                role = "assignee"
            )
            graph.add_edge(edge)
            a += 1

        if pr.createdBy:
            creator_git_hub_user = GitHubUser(
                url=pr.createdBy.url,
                login=pr.createdBy.login,
                name=pr.createdBy.name,
            )
            graph.add_git_hub_user(creator_git_hub_user)

            edge = PullRequestGitHubUserEdge(
                pr = pull_request,
                git_hub_user= creator_git_hub_user,
                role = "creator"
            )
            graph.add_edge(edge)
            cre += 1

        if pr.mergedBy:
            merger_git_hub_user = GitHubUser(
                name=pr.mergedBy.name,
                url=pr.mergedBy.url,
                login=pr.mergedBy.login,
            )
            graph.add_git_hub_user(merger_git_hub_user)

            edge = PullRequestGitHubUserEdge(
                pr = pull_request,
                git_hub_user= merger_git_hub_user,
                role = "merger"
            )
            graph.add_edge(edge)
            m += 1

        # ADD ALL COMMITS
        for c in pr.commits:
            commit = GitHubCommit(
                sha=c.sha,
                date=c.date,
                message=c.message,
                changedFiles=c.changedFiles,
            )
            graph.add_git_hub_commit(commit)

            edge = PullRequestGitHubCommitEdge(
                commit = commit,
                pr = pull_request,
            )
            graph.add_edge(edge)

    print("assignees:", a)
    print("creators:", cre)
    print("mergers:", m)
    print("\n\n")

add_github_data(graph, github_data)

print(graph)

assignees: 60
creators: 5003
mergers: 488



~~~~ Graph summary ~~~~
commits: 5512
git_users: 602
files: 5644

issue_statuses: 48
issue_types: 7
issue_status_categories: 3
jira_users: 2008
issues: 6202

pull_requests: 5022
git_hub_users: 690
git_hub_commits: 17869

edges: 155428


# Save graph with pickle

In [3]:
def save_pickle(obj, var_name: str, base_dir: str = "pickle_data") -> Path:
    """
    Save a Python object using pickle in a directory relative to the notebook.

    Args:
        obj: The Python object to save.
        var_name: Name of the variable (used as filename).
        base_dir: Relative directory to save pickles (default: "pickle_data").

    Returns:
        Path to the saved pickle file.
    """
    # Create the directory if it doesn't exist
    save_dir = Path(base_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    # Build full path
    pickle_path = save_dir / f"{var_name}.pkl"

    # Save object
    with open(pickle_path, "wb") as f:
        pickle.dump(obj, f)

    print(f"Saved {var_name} to {pickle_path}")
    return pickle_path

# Save your loaded data
save_pickle(graph, "graph")

Saved graph_safe to pickle_data/graph_safe.pkl


PosixPath('pickle_data/graph_safe.pkl')

# Load graph

# Make graph fully connected

In [2]:
from datetime import timedelta
from difflib import SequenceMatcher
import bisect

def link_issues_with_git_commits(graph: Graph):
    # Build one regex to match all issue keys
    issue_keys = [re.escape(issue.key) for issue in graph.issues.values()]
    issue_pattern = re.compile(r'\b(' + '|'.join(issue_keys) + r')\b', re.IGNORECASE)

    links = 0
    commits_liked_with_issues = 0
    for commit in graph.commits.values():
        if not commit.message:
            continue

        matches = issue_pattern.findall(commit.message)

        if len(matches) > 0:
            commits_liked_with_issues += 1
        for match in set(matches):
            issue = graph.get_issue(match.upper())
            edge = GitCommitIssueEdge(
                git_commit = commit,
                issue = issue,
            )
            graph.add_edge(edge)
            links += 1

    print(f"There are {links} Issue–Commit edges")
    print(f"Commits liked with issues: {commits_liked_with_issues}")

def link_pull_request_with_issue(graph: Graph):
    # Build one regex for all issue keys
    issue_keys = [re.escape(issue.key) for issue in graph.issues.values()]
    issue_pattern = re.compile(r'\b(' + '|'.join(issue_keys) + r')\b', re.IGNORECASE)

    links = 0
    prs_with_issues = 0

    for pr in graph.pull_requests.values():
        text = (pr.title or "") + " " + (pr.body or "")
        matches = issue_pattern.findall(text)

        if matches:
            prs_with_issues += 1
        for match in set(matches):
            issue = graph.get_issue(match.upper())
            if issue:
                edge = PullRequestIssueEdge(
                    pr=pr,
                    issue=issue,
                )
                graph.add_edge(edge)
                links += 1

    print(f"Created {links} PR–Issue edges")
    print(f"PRs linked with issues: {prs_with_issues}")

# Pre-sort commits by author_date
all_commits = sorted(graph.commits.values(), key=lambda c: c.author_date)
commit_dates = [c.author_date for c in all_commits]

def candidates_in_window(target_date, days=14):
    lo = bisect.bisect_left(commit_dates, target_date - timedelta(days=days))
    hi = bisect.bisect_right(commit_dates, target_date + timedelta(days=days))
    return all_commits[lo:hi]

def clean(msg: str) -> str:
    if not msg:
        return ""
    msg = msg.strip().lower()

    # remove PR references like "(#123)" or "[#123]"
    msg = re.sub(r'\(#\d+\)', '', msg)
    msg = re.sub(r'\[#\d+\]', '', msg)

    # remove GitHub squash markers like "squash commit", "merge pull request"
    msg = re.sub(r'merge pull request.*', '', msg)
    msg = re.sub(r'squash commit.*', '', msg)

    # collapse multiple spaces
    msg = re.sub(r'\s+', ' ', msg)

    return msg.strip()

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def link_pull_requests_with_git_commits(graph: Graph, min_similarity = 0.85):
    counts = {
        "merged_as": 0,
        "contains_commit": 0,
        "linked_via_issue": 0
    }
    for pr in graph.pull_requests.values():
        #TODO change with graph traversal functions
        pr_commits = [c.commit for c in graph.adjacency.get(pr.dict_key(), {}).get("git_hub_commits", [])]

        for pr_commit in pr_commits:
            # 1. Exact SHA match
            if git_commit := graph.commits.get(f"GitCommit:{pr_commit.sha}"):
                graph.add_edge(GitCommitPullRequestEdge(
                    git_commit=git_commit,
                    pr=pr,
                    relation="merged_as"
                ))
                counts["merged_as"] += 1
                continue

            # 2 Fuzzy match within date window
            candidates = candidates_in_window(pr_commit.date)
            best_candidate = None
            best_score = 0.0

            for candidate in candidates:
                score = similar(clean(pr_commit.message), clean(candidate.message))
                if score > min_similarity and score > best_score:
                    best_score = score
                    best_candidate = candidate

            if best_candidate:
                graph.add_edge(GitCommitPullRequestEdge(
                    git_commit=best_candidate,
                    pr=pr,
                    relation="contains_commit",
                ))
                counts["contains_commit"] += 1

        # 3. Issue-based linking (fallback)
        issues_linked_to_pr = [
            edge.issue for edge in graph.adjacency.get(pr.dict_key(), {}).get("issues", [])
        ]
        for issue in issues_linked_to_pr:
            git_commits_for_issue = [
                edge.git_commit for edge in graph.adjacency.get(issue.dict_key(), {}).get("git_commits", [])
            ]

            existing_edges = set()

            for git_commit in git_commits_for_issue:
                edge = GitCommitPullRequestEdge(
                    git_commit=git_commit,
                    pr=pr,
                    relation="linked_via_issue"
                )
                if edge not in existing_edges:
                    graph.add_edge(edge)
                    counts["linked_via_issue"] += 1
                    existing_edges.add(edge)

    for key in counts.keys():
        print(f"{key}: {counts[key]}")




def save_pickle(obj, var_name: str, base_dir: str = "pickle_data") -> Path:
    """
    Save a Python object using pickle in a directory relative to the notebook.

    Args:
        obj: The Python object to save.
        var_name: Name of the variable (used as filename).
        base_dir: Relative directory to save pickles (default: "pickle_data").

    Returns:
        Path to the saved pickle file.
    """
    # Create the directory if it doesn't exist
    save_dir = Path(base_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    # Build full path
    pickle_path = save_dir / f"{var_name}.pkl"

    # Save object
    with open(pickle_path, "wb") as f:
        pickle.dump(obj, f)

    print(f"Saved {var_name} to {pickle_path}")
    return pickle_path



link_issues_with_git_commits(graph)
link_pull_request_with_issue(graph)
# 8 min 33 sec
link_pull_requests_with_git_commits(graph)

save_pickle(graph, "graph")

There are 3642 Issue–Commit edges
Commits liked with issues: 3209
Created 4385 PR–Issue edges
PRs linked with issues: 3974
merged_as: 916
contains_commit: 2414
linked_via_issue: 11182
Saved graph to pickle_data/graph.pkl


PosixPath('pickle_data/graph.pkl')

# ChatGPT web-browser

In [3]:
# Commit with most modified files

most_modified = max(
    graph.commits.values(),
    key=lambda c: len(graph.adjacency.get(c.dict_key(), {}).get("files", []))
)
print(most_modified.sha, most_modified.message)

1c19ce2f393ca15e431e372b10e44163caa72d08 Merge pull request #87 from Leemoonsoo/master
Rename zeppelin-web2 -> zeppelin-web


In [4]:
# Most seen files changes for issues of type "Bug"

from collections import Counter

bug_type = graph.get_issue_type("Bug")

file_counter = Counter()

for issue in graph.issues.values():
    if bug_type and any(
        isinstance(e, IssueIssueTypeEdge) and e.issue_type.id == bug_type.id
        for e in graph.adjacency.get(issue.dict_key(), {}).get("issue_types", [])
    ):
        # get all commits linked to this issue
        for edge in graph.adjacency.get(issue.dict_key(), {}).get("git_commits", []):
            commit = edge.git_commit
            # get all files linked to this commit
            for file_edge in graph.adjacency.get(commit.dict_key(), {}).get("files", []):
                file_counter[file_edge.file.path] += 1

# print the top 5 most frequent files
for file, count in file_counter.most_common(5):
    print(f"{file}: seen in {count} bug issues")


/dev/null: seen in 111 bug issues
zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java: seen in 97 bug issues
zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js: seen in 88 bug issues
zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java: seen in 70 bug issues
pom.xml: seen in 66 bug issues


In [5]:
from collections import Counter

# 1. Find the bug type node (by name)
bug_types = [
    it for it in graph.issue_types.values()
    if it.name == "Bug"
]


# 2. Define traversal steps:
#    IssueType → Issues → Commits → Files
steps = [
    ("issues", None),                      # IssueType → Issues
    ("git_commits", None),                 # Issues → Commits
    ("files", None),                       # Commits → Files
]

# 3. Traverse
all_files = graph.filtered_traversal(
    start_nodes=bug_types,
    steps=steps,
)

# 4. Count by file path
file_counter = Counter(f.path for f in all_files)

# 5. Show top 5
for file, count in file_counter.most_common(5):
    print(f"{file}: seen in {count} bug issues")


/dev/null: seen in 111 bug issues
zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java: seen in 97 bug issues
zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js: seen in 88 bug issues
zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java: seen in 70 bug issues
pom.xml: seen in 66 bug issues


# Ai queries in code

In [3]:
from collections import defaultdict

# 1) Find the IssueType node for "Bug"
bug_types = []
get_type = getattr(graph, "get_issue_type", None)
if callable(get_type):
    bug_type = get_type("Bug")
    if bug_type:
        bug_types = [bug_type]
else:
    bug_types = [it for it in graph.issue_types.values() if it.name.lower() == "bug"]

if not bug_types:
    print("No 'Bug' issue type found.")
else:
    # 2) Traverse: IssueType -> Issues -> Commits
    bug_commits = graph.filtered_traversal(
        start_nodes=bug_types,
        steps=[
            ("issues", None),        # IssueType -> Issues
            ("git_commits", None),   # Issues -> GitCommits
        ],
    )

    # 3) Deduplicate commits by SHA
    unique_commits = {c.sha: c for c in bug_commits}

    # 4) For each commit, collect files it touched
    file_to_commits = defaultdict(set)  # file path -> set of commit SHAs
    for commit in unique_commits.values():
        for edge in graph.adjacency.get(commit.dict_key(), {}).get("files", []):
            file_node = edge.other_node(commit.dict_key())  # neighbor is the File node
            file_to_commits[file_node.path].add(commit.sha)

    # 5) Filter files with > 10 unique commits linked to Bug issues
    files_gt_10 = {
        path: shas for path, shas in file_to_commits.items()
        if len(shas) > 10
    }

    # Optional: print sorted by number of commits descending
    for path, shas in sorted(files_gt_10.items(), key=lambda kv: len(kv[1]), reverse=True):
        print(f"{path}: {len(shas)} commits linked to Bug issues")

zeppelin-server/src/main/java/org/apache/zeppelin/socket/NotebookServer.java: 83 commits linked to Bug issues
zeppelin-web/src/app/notebook/paragraph/paragraph.controller.js: 75 commits linked to Bug issues
zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Note.java: 66 commits linked to Bug issues
pom.xml: 58 commits linked to Bug issues
zeppelin-web/src/app/notebook/notebook.controller.js: 54 commits linked to Bug issues
zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/NotebookTest.java: 52 commits linked to Bug issues
jdbc/src/main/java/org/apache/zeppelin/jdbc/JDBCInterpreter.java: 48 commits linked to Bug issues
zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/Notebook.java: 47 commits linked to Bug issues
zeppelin-server/src/main/java/org/apache/zeppelin/server/ZeppelinServer.java: 41 commits linked to Bug issues
.travis.yml: 40 commits linked to Bug issues
zeppelin-zengine/src/main/java/org/apache/zeppelin/interpreter/InterpreterSetting.java: 39 c