# Redo graph.pkl

In [1]:
import sys, os
from gc import garbage

sys.path.append(os.path.abspath("../.."))
from src.inspector_git import IGLogReader, GitLogDTO
from src.jira_miner.models import JsonFileFormatJira
from src.github_miner import JsonFileFormatGithub
import json
from pathlib import Path
from pprint import pprint
import re
from datetime import datetime
import pickle
from flat_graph import *

# JSON exports consumed by load_jsons(); paths are relative to the working directory
path_jira = "../../test-input/jira-miner/ZEPPELIN-detailed-issues.json"
path_github = "../../test-input/github-miner/githubProject.json"
# Inspector Git log (.iglog) parsed with IGLogReader.read()
path_inspector_git = "../../test-input/inspector-git/zeppelin.iglog"



def load_from_json(model_cls, file_path: str):
    """Read a JSON document from disk and validate it against ``model_cls``.

    Args:
        model_cls: Class exposing ``model_validate``; it receives the parsed JSON.
        file_path: Location of the JSON file to read (decoded as UTF-8).

    Returns:
        Whatever ``model_cls.model_validate`` returns for the parsed payload.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    payload = json.loads(source.read_text(encoding="utf-8"))
    return model_cls.model_validate(payload)

def load_jsons():
    """Load the Jira and GitHub exports and return them as a ``(jira, github)`` pair.

    Both files are read from the module-level ``path_jira`` / ``path_github``
    locations and validated with their respective file-format models.
    """
    return (
        load_from_json(JsonFileFormatJira, path_jira),
        load_from_json(JsonFileFormatGithub, path_github),
    )

# Reader used below to parse the Inspector Git log file
ig_log_reader = IGLogReader()



# Load all three inputs up front; the graph starts empty and is filled by later cells
jira_data, github_data = load_jsons()
inspector_git_data = ig_log_reader.read(path_inspector_git)
graph = Graph()

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x735167b22ea0>>
Traceback (most recent call last):
  File "/home/vortex/.conda/envs/Vortex/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [2]:
import importlib
import flat_graph  # <-- matches the filename flat_graph.py

# Pick up edits made to flat_graph.py without restarting the kernel.
importlib.reload(flat_graph)

# reload() re-executes the module but does not touch names that were bound
# earlier with `from flat_graph import ...`, nor objects already instantiated
# from the old classes. Re-import the names and rebuild the graph so the
# reloaded definitions are the ones actually used from here on.
from flat_graph import *
from flat_graph import Graph  # or whatever you need

# Nothing has been added to the graph before this point, so starting from a
# fresh instance loses no data and also makes re-running this cell safe.
graph = Graph()
graph.add_inspector_git_data(inspector_git_data)


In [3]:
# Print the per-kind node counts and node/edge totals for the graph built so far
print(graph.summary())

~~~~ Graph summary ~~~~
commits: 5512
git_users: 602
files: 2448

issue_statuses: 0
issue_types: 0
issue_status_categories: 0
jira_users: 0
issues: 0

pull_requests: 0
git_hub_users: 0
git_hub_commits: 0

nodes: 8562
edges: 66499


In [4]:
# Dump every file path known to the graph, sorted, one per line, for manual inspection.
# ("fisiere" is Romanian for "files"; the name is kept because it is a notebook-level variable.)
fisiere = [f.path for f in graph.files.values()]
strings_sorted = sorted(fisiere)
# Save to file. The encoding is explicit so non-ASCII paths are written the
# same way regardless of the platform's default locale encoding.
with open("flat.txt", "w", encoding="utf-8") as f:
    f.writelines(item + "\n" for item in strings_sorted)

In [1]:
def add_jira_data(graph:Graph, jira_data: JsonFileFormatJira):
    """Populate ``graph`` with the Jira export.

    Adds issue statuses (with their categories), issue types, users and issues,
    then wires parent/sub-task relations between issues. The phases run in a
    fixed order because later phases look up nodes through ``graph.get_*``
    that earlier phases registered.

    Args:
        graph: Graph that receives the nodes and edges (mutated in place).
        jira_data: Parsed Jira export to read from.
    """

    def register_statuses():
        # One status node, one category node and one edge between them per Jira status.
        for raw_status in jira_data.issueStatuses:
            status_category = IssueStatusCategory(
                key=raw_status.statusCategory.key,
                name=raw_status.statusCategory.name,
            )
            status_node = IssueStatus(id=raw_status.id, name=raw_status.name)

            graph.add_issue_status_category(status_category)
            graph.add_issue_status(status_node)
            graph.add_edge(
                IssueStatusIssueStatusCategoryEdge(
                    issue_status=status_node,
                    issue_status_category=status_category,
                )
            )

    def register_issue_types():
        for raw_type in jira_data.issueTypes:
            graph.add_issue_type(
                IssueType(
                    id=raw_type.id,
                    name=raw_type.name,
                    description=raw_type.description,
                    isSubTask=raw_type.isSubTask,
                )
            )

    def register_users():
        for raw_user in jira_data.users:
            graph.add_jira_user(
                JiraUser(key=raw_user.key, name=raw_user.name, link=raw_user.self_)
            )

    def connect_user(issue_node, user_id, role):
        # Resolve the user through the graph and attach it to the issue under `role`.
        graph.add_edge(
            IssueJiraUserEdge(
                jira_user=graph.get_jira_user(user_id),
                issue=issue_node,
                role=role,
            )
        )

    def register_issues():
        for raw_issue in jira_data.issues:
            issue_node = Issue(
                id=raw_issue.id,
                key=raw_issue.key,
                summary=raw_issue.summary,
                createdAt=raw_issue.created,
                updatedAt=raw_issue.updated,
            )
            graph.add_issue(issue_node)

            graph.add_edge(
                IssueIssueStatusEdge(
                    issue_status=graph.get_issue_status(raw_issue.status.id),
                    issue=issue_node,
                )
            )
            graph.add_edge(
                IssueIssueTypeEdge(
                    issue_type=graph.get_issue_type(raw_issue.issueType),
                    issue=issue_node,
                )
            )

            # The reporter is always linked; creator and assignee only when present.
            connect_user(issue_node, raw_issue.reporterId, "reporter")
            if raw_issue.creatorId is not None:
                connect_user(issue_node, raw_issue.creatorId, "creator")
            if raw_issue.assigneeId is not None:
                connect_user(issue_node, raw_issue.assigneeId, "assignee")

    def add_hierarchy_edge(child, parent):
        """Add a child/parent IssueIssueEdge unless an equivalent one is already present."""
        candidate = IssueIssueEdge(child=child, parent=parent)
        known_edges = graph.adjacency.get(candidate.child.dict_key(), {}).get("issues", [])
        for existing in known_edges:
            if (
                isinstance(existing, IssueIssueEdge)
                and existing.normalized_key() == candidate.normalized_key()
            ):
                return
        graph.add_edge(candidate)

    def connect_hierarchy():
        for raw_issue in jira_data.issues:
            issue_node = graph.get_issue(raw_issue.key)

            # Connect to parent
            if raw_issue.parent is not None:
                add_hierarchy_edge(
                    child=issue_node,
                    parent=graph.get_issue(raw_issue.parent),
                )

            # Connect to subtasks
            for subtask_ref in raw_issue.subTasks or []:
                add_hierarchy_edge(
                    child=graph.get_issue(subtask_ref),
                    parent=issue_node,
                )

    for phase in (
        register_statuses,
        register_issue_types,
        register_users,
        register_issues,
        connect_hierarchy,
    ):
        phase()

# Merge the Jira export (statuses, types, users, issues, hierarchy) into the graph
add_jira_data(graph, jira_data)

def add_github_data(graph:Graph, github_data: JsonFileFormatGithub):
    """Populate ``graph`` with the GitHub export.

    For every pull request this adds the PR node, one user node plus a
    role-tagged edge for each assignee, the creator and the merger (the latter
    two only when present), and one commit node plus edge per PR commit.

    Args:
        graph: Graph that receives the nodes and edges (mutated in place).
        github_data: Parsed GitHub export to read from.
    """

    def link_user(pull_request, raw_user, role):
        # Shared by assignee/creator/merger: register the user, then tie it to the PR.
        user_node = GitHubUser(
            url=raw_user.url,
            login=raw_user.login,
            name=raw_user.name,
        )
        graph.add_git_hub_user(user_node)
        graph.add_edge(
            PullRequestGitHubUserEdge(
                pr=pull_request,
                git_hub_user=user_node,
                role=role,
            )
        )

    for pr in github_data.pullRequests:
        # ADD PR
        pull_request = PullRequest(
            number=pr.number,
            title=pr.title,
            state=pr.state,
            changedFiles=pr.changedFiles,
            createdAt=pr.createdAt,
            updatedAt=pr.updatedAt,
            body=pr.body,
            mergedAt=pr.mergedAt,
            closedAt=pr.closedAt,
        )
        graph.add_pull_request(pull_request)

        # ADD ALL USERS
        for assignee in pr.assignees:
            link_user(pull_request, assignee, "assignee")
        if pr.createdBy:
            link_user(pull_request, pr.createdBy, "creator")
        if pr.mergedBy:
            link_user(pull_request, pr.mergedBy, "merger")

        # ADD ALL COMMITS
        for c in pr.commits:
            commit = GitHubCommit(
                sha=c.sha,
                date=c.date,
                message=c.message,
                changedFiles=c.changedFiles,
            )
            graph.add_git_hub_commit(commit)
            graph.add_edge(
                PullRequestGitHubCommitEdge(
                    commit=commit,
                    pr=pull_request,
                )
            )

# Merge the GitHub export (pull requests, users, PR commits) into the graph
add_github_data(graph, github_data)

def save_pickle(obj, var_name: str, base_dir: str = "pickle_data") -> Path:
    """
    Pickle ``obj`` to ``<base_dir>/<var_name>.pkl`` and report where it went.

    Args:
        obj: The Python object to serialize.
        var_name: Stem of the output file (``.pkl`` is appended).
        base_dir: Directory for the pickle; created, including parents, when
            missing (default: "pickle_data").

    Returns:
        Path of the pickle file that was written.
    """
    target_dir = Path(base_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir / f"{var_name}.pkl"
    with destination.open("wb") as sink:
        pickle.dump(obj, sink)

    print(f"Saved {var_name} to {destination}")
    return destination

# Persist the graph to pickle_data/graph.pkl, then print its node/edge counts
save_pickle(graph, "graph")
print(graph.summary())

Saved graph to pickle_data/graph.pkl
~~~~ Graph summary ~~~~
commits: 5512
git_users: 602
files: 5644

issue_statuses: 48
issue_types: 7
issue_status_categories: 3
jira_users: 2008
issues: 6202

pull_requests: 5022
git_hub_users: 690
git_hub_commits: 17869

nodes: 43607
edges: 122314


# Make graph fully connected

In [3]:
def link_issues_with_git_commits(graph: Graph):
    """Add a GitCommitIssueEdge for every known issue key mentioned in a commit message.

    Issue keys are matched case-insensitively as whole words. Each commit is
    linked at most once to a given issue, even when the key appears several
    times or in different letter cases. Prints the number of edges created and
    the number of commits that mention at least one issue key.

    Args:
        graph: Graph holding the issues and git commits (mutated in place).
    """
    issue_keys = [re.escape(issue.key) for issue in graph.issues.values()]

    links = 0
    commits_linked_with_issues = 0

    # With no issues the alternation would be empty and match the empty string
    # at every word boundary, so skip the scan entirely in that case.
    if issue_keys:
        # Build one regex to match all issue keys
        issue_pattern = re.compile(r'\b(' + '|'.join(issue_keys) + r')\b', re.IGNORECASE)

        for commit in graph.commits.values():
            if not commit.message:
                continue

            # Normalize before de-duplicating: "zeppelin-1" and "ZEPPELIN-1"
            # refer to the same issue and must produce a single edge.
            mentioned_keys = {match.upper() for match in issue_pattern.findall(commit.message)}

            if mentioned_keys:
                commits_linked_with_issues += 1
            for key in mentioned_keys:
                issue = graph.get_issue(key)
                # Same guard as the PR linker; presumably get_issue yields a
                # falsy value for an unknown key -- TODO confirm.
                if not issue:
                    continue
                graph.add_edge(
                    GitCommitIssueEdge(
                        git_commit=commit,
                        issue=issue,
                    )
                )
                links += 1

    print(f"There are {links} Issue–Commit edges")
    print(f"Commits liked with issues: {commits_linked_with_issues}")

def link_pull_request_with_issue(graph: Graph, jira_data: JsonFileFormatJira):
    """Add PullRequestIssueEdge links between pull requests and Jira issues.

    Two sources are used:

    1. Issue keys mentioned (case-insensitively, as whole words) in a PR's
       title or body.
    2. Jira change-log items whose ``toString`` contains "Pull Request #",
       from which the PR number is extracted.

    A given (pull request, issue) pair is linked at most once across both
    sources. Prints the number of edges and PRs found by the first source, and
    one line per change-log reference whose pull request is not in the graph.

    Args:
        graph: Graph holding the issues and pull requests (mutated in place).
        jira_data: Parsed Jira export providing the issue change logs.
    """
    issue_keys = [re.escape(issue.key) for issue in graph.issues.values()]

    links = 0
    prs_with_issues = 0
    # (PR number, issue key) pairs already linked, shared by both passes.
    linked_pairs = set()

    # With no issues the alternation would be empty and match the empty string
    # at every word boundary, so skip the text scan in that case.
    if issue_keys:
        # Build one regex for all issue keys
        issue_pattern = re.compile(r'\b(' + '|'.join(issue_keys) + r')\b', re.IGNORECASE)

        for pr in graph.pull_requests.values():
            text = (pr.title or "") + " " + (pr.body or "")
            # Normalize before de-duplicating so mixed-case mentions of the
            # same key yield a single edge.
            mentioned_keys = {match.upper() for match in issue_pattern.findall(text)}

            if mentioned_keys:
                prs_with_issues += 1
            for key in mentioned_keys:
                issue = graph.get_issue(key)
                if issue:
                    graph.add_edge(
                        PullRequestIssueEdge(
                            pr=pr,
                            issue=issue,
                        )
                    )
                    linked_pairs.add((pr.number, issue.key))
                    links += 1

    print(f"Created {links} PR–Issue edges")
    print(f"PRs linked with issues: {prs_with_issues}")

    # Anchored to the marker text so an unrelated "#123" earlier in the
    # string cannot be mistaken for the PR number.
    pr_number_pattern = re.compile(r'Pull Request #(\d+)')

    def extract_pr_number(text: str) -> int | None:
        match = pr_number_pattern.search(text)
        return int(match.group(1)) if match else None

    for issue in jira_data.issues:
        issues_pr_links = {
            item.toString
            for change in issue.changes
            for item in change.items
            if item.toString and "Pull Request #" in item.toString
        }
        if not issues_pr_links:
            continue

        i = graph.get_issue(issue.key)
        for link in issues_pr_links:
            # A reference without a number yields None here; the lookup is then
            # expected to come back empty and is reported below.
            pr = graph.get_pull_request(extract_pr_number(link))
            if not pr:
                print(f"Unknown reference in {i.key} of a pull request: {link}")
                continue

            pair = (pr.number, i.key)
            if pair in linked_pairs:
                # Already linked via the PR text or an earlier change-log item.
                continue

            graph.add_edge(
                PullRequestIssueEdge(
                    pr=pr,
                    issue=i,
                )
            )
            linked_pairs.add(pair)

def link_pull_requests_with_git_commits(graph: Graph, min_similarity = 0.85):
    """Add GitCommitPullRequestEdge links between pull requests and git commits.

    Two relations are produced for each pull request:

    * ``"merged_as"``: a commit attached to the PR has the same SHA as a git
      commit in the graph.
    * ``"linked_via_issue"``: a git commit is linked to an issue that is in
      turn linked to the PR. Each commit is linked to a PR at most once this
      way, even when it is reachable through several of the PR's issues.

    Prints the number of edges created per relation.

    Args:
        graph: Graph holding PRs, issues and commits (mutated in place). The
            PR-issue and issue-commit edges must already exist for the
            issue-based relation to find anything.
        min_similarity: Not used by this implementation; retained so existing
            callers keep working.
    """
    counts = {
        "merged_as": 0,
        "linked_via_issue": 0
    }
    for pr in graph.pull_requests.values():
        pr_commits = [c.commit for c in graph.adjacency.get(pr.dict_key(), {}).get("git_hub_commits", [])]

        for pr_commit in pr_commits:
            # 1. Exact SHA match
            if git_commit := graph.commits.get(f"GitCommit:{pr_commit.sha}"):
                graph.add_edge(GitCommitPullRequestEdge(
                    git_commit=git_commit,
                    pr=pr,
                    relation="merged_as"
                ))
                counts["merged_as"] += 1

        # 2. Issue-based linking
        # NOTE(review): originally labelled a fallback, but it runs for every
        # PR regardless of SHA matches -- confirm that is intended.
        # Snapshot the lists first: adding edges may touch the adjacency data.
        issues_linked_to_pr = [
            edge.issue for edge in graph.adjacency.get(pr.dict_key(), {}).get("issues", [])
        ]

        # Tracked per PR (not per issue) so a commit reachable through two of
        # the PR's issues is still linked only once.
        existing_edges = set()

        for issue in issues_linked_to_pr:
            git_commits_for_issue = [
                edge.git_commit for edge in graph.adjacency.get(issue.dict_key(), {}).get("git_commits", [])
            ]

            for git_commit in git_commits_for_issue:
                edge = GitCommitPullRequestEdge(
                    git_commit=git_commit,
                    pr=pr,
                    relation="linked_via_issue"
                )
                if edge not in existing_edges:
                    graph.add_edge(edge)
                    counts["linked_via_issue"] += 1
                    existing_edges.add(edge)

    for relation, total in counts.items():
        print(f"{relation}: {total}")




def save_pickle(obj, var_name: str, base_dir: str = "pickle_data") -> Path:
    """
    Serialize ``obj`` with pickle into ``base_dir`` and return the file's path.

    Args:
        obj: The Python object to serialize.
        var_name: Name used for the output file, which becomes ``<var_name>.pkl``.
        base_dir: Output directory; it and any missing parents are created
            (default: "pickle_data").

    Returns:
        Path to the pickle file that was written.
    """
    out_dir = Path(base_dir)
    # Nothing to do if the directory is already there.
    out_dir.mkdir(parents=True, exist_ok=True)

    out_file = out_dir / f"{var_name}.pkl"
    with out_file.open("wb") as handle:
        pickle.dump(obj, handle)

    print(f"Saved {var_name} to {out_file}")
    return out_file



# Cross-link the data sources. Order matters: link_pull_requests_with_git_commits
# walks the PR-issue and issue-commit edges that the first two calls create.
link_issues_with_git_commits(graph)
link_pull_request_with_issue(graph, jira_data)
# 8 min 33 sec  (recorded run time; NOTE(review): unclear which step(s) it covers)
link_pull_requests_with_git_commits(graph)

# Write the fully linked graph to pickle_data/graph.pkl (an existing file is overwritten)
save_pickle(graph, "graph")


There are 3642 Issue–Commit edges
Commits liked with issues: 3209
Created 4385 PR–Issue edges
PRs linked with issues: 3974
Unknown reference in ZEPPELIN-6282 of a pull request: This issue links to "GitHub Pull Request #5026 (Web Link)"
Unknown reference in ZEPPELIN-6283 of a pull request: This issue links to "GitHub Pull Request #5030 (Web Link)"
Unknown reference in ZEPPELIN-6280 of a pull request: This issue links to "GitHub Pull Request #5028 (Web Link)"
Unknown reference in ZEPPELIN-6281 of a pull request: This issue links to "GitHub Pull Request #5029 (Web Link)"
Unknown reference in ZEPPELIN-6276 of a pull request: This issue links to "GitHub Pull Request #5027 (Web Link)"
Unknown reference in ZEPPELIN-6164 of a pull request: This issue links to "GitHub Pull Request #5031 (Web Link)"
Unknown reference in ZEPPELIN-1326 of a pull request: This issue links to "GitHub Pull Request # (Web Link)"
Unknown reference in ZEPPELIN-6181 of a pull request: This issue links to "GitHub Pull Req

PosixPath('pickle_data/graph.pkl')

# Others