In [1]:
from datetime import datetime
from typing import Dict, Literal, NamedTuple, Set, List
from collections import defaultdict

import pydriller
import pathlib

test_modules_path = pathlib.Path("test_submodules")
production_modules_path = pathlib.Path("production_submodules")
InstanceType = Literal["tdd", "same", "non-tdd", "no-match"]


class Instance(NamedTuple):
    code_commit_index: int
    test_commit_index: int
    code_name: str
    test_name: str


def is_test_file(filename: str) -> bool:
    return filename.endswith("Test.java") or filename.startswith("Test")


def get_code_file_name(filename: str) -> str:
    return filename.replace("Test", "")


def get_test_file_name(filename: str) -> str:
    return filename.replace(".java", "Test.java")


from typing import Tuple
from pydriller import ModificationType
from collections import deque

start_commit_index = (datetime(1900,1,1), "0" * 40)
future_commit_index = (datetime(3000,1,1), "0" * 40)

def traverse(path, all_java_files, to_add_files):
    latest = list(pydriller.Repository(path).traverse_commits())[-1]
    traverse_commit_linearly(path, latest, set(), all_java_files, to_add_files, "")
    
    for (to_add_name, _), index in to_add_files.items():
        all_java_files[to_add[0]].append((start_commit_index, index))

def traverse_commit_linearly(path, commit: pydriller.Commit, hist: Set[pydriller.Commit], all_java_files: Dict[str, deque[(int, int)]], to_add_files: Dict[Tuple[str, str], Tuple[datetime, str]], branch) -> None:
    commits = [commit]
    while len(commits) == 1:
        commit = commits[0]
        if commit in hist:
            return
        else:
            hist.add(commit)
        
        commit_time = commit.committer_date
        commit_hash = commit.hash

        commit_index = (commit_hash, commit_time)
        record_files(commit.modified_files, all_java_files, to_add_files, commit_index, branch)
        
        commits = [next(pydriller.Repository(path, only_commits=[parent]).traverse_commits()) for parent in commit.parents]
        
    if len(commits) > 1:
        for index, commit in enumerate(commits):
            traverse_commit_linearly(path, commit, hist, all_java_files, to_add_files, branch + str(index))
    
            
def record_files(modified_files: List[pydriller.ModifiedFile], all_java_files, to_add_files, commit_index, branch):
    for commit_file in reversed(modified_files):
        if commit_file.filename.endswith(".java"):
            # if it's a java file
            match commit_file.change_type:
                case ModificationType.ADD | ModificationType.RENAME:
                    if commit_file.change_type == ModificationType.RENAME:
                        old_file_name = pathlib.Path(commit_file.old_path).name
                        new_file_name = pathlib.Path(commit_file.new_path).name

                        if old_file_name == new_file_name:
                            # if the file is renamed but the name is the same, it's just a change of directory
                            continue

                    if commit_file.filename in to_add_files:
                        raise Exception(f"{commit_file.filename} added twice")

                    to_add_files[(commit_file.filename, branch)] = commit_index

                    if commit_file.change_type == ModificationType.RENAME:
                        # mark the old file as deleted
                        old_file_name = pathlib.Path(commit_file.old_path).name
                        to_add_info = (old_file_name, branch)
                        if to_add_info in to_add_files:
                            all_java_files[old_file_name].append((commit_index, to_add_files[to_add_info]))
                            del to_add_files[to_add_info]
                        else:
                            all_java_files[to_add_info].append((commit_index, future_commit_index))
                        # print(f"{file.old_path} => {file.new_path}")
                case ModificationType.COPY:
                    # I don't think anyone is using copy in git?
                    raise Exception("Do not support copy")
                case ModificationType.DELETE:
                    # if it's deleted, add it to the deleted_files
                    to_add_info = (commit_file.filename, branch)
                    if to_add_info in to_add_files:
                        all_java_files[commit_file.filename].append((commit_index, to_add_files[to_add_info]))
                        del to_add_files[to_add_info]
                    else:
                        all_java_files[commit_file.filename].append((commit_index, future_commit_index))
                case _:
                    # modify, copy, unmerged, unknown
                    # we don't care in this case
                    pass

# 
# def traverse_commit(path: str, commit: pydriller.Commit, all_java_files:Dict[str, deque[(int, int)]], to_add_files: Dict[Tuple[str, str], Tuple[datetime, str]]) -> None:
#     traverse_queue: deque[Tuple[pydriller.Commit, str]] = deque([(commit, "")])
# 
#     commit_hist: Set[pydriller.Commit] = set()
#         
#     while traverse_queue:
#         commit_to_analyze, branch = traverse_queue.popleft()
#         
#         if commit_to_analyze in commit_hist:
#             continue
#         else:
#             commit_hist.add(commit_to_analyze)
#         
#         commit_time = commit_to_analyze.committer_date
#         commit_hash = commit_to_analyze.hash
#         
#         for index, parent in enumerate(commit_to_analyze.parents):
#             traverse_queue.append(
#                 (next(pydriller.Repository(path, single=parent).traverse_commits()), str(index) + branch)
#             )
#             
#         commit_index = (commit_hash, commit_time)
# 
#         for commit_file in reversed(list(commit_to_analyze.modified_files)):
#             if commit_file.filename.endswith(".java"):
#                 # if it's a java file
#                 match commit_file.change_type:
#                     case ModificationType.ADD | ModificationType.RENAME:
#                         if commit_file.change_type == ModificationType.RENAME:
#                             old_file_name = pathlib.Path(commit_file.old_path).name
#                             new_file_name = pathlib.Path(commit_file.new_path).name
# 
#                             if old_file_name == new_file_name:
#                                 # if the file is renamed but the name is the same, it's just a change of directory
#                                 continue
#                         
#                         if commit_file.filename in to_add_files:
#                             raise Exception(f"{commit_file.filename} added twice")
#                         
#                         to_add_files[(commit_file.filename, branch)] = commit_index
# 
#                         if commit_file.change_type == ModificationType.RENAME:
#                             # mark the old file as deleted
#                             old_file_name = pathlib.Path(commit_file.old_path).name
#                             to_add_info = (old_file_name, branch)
#                             if to_add_info in to_add_files:
#                                 all_java_files[old_file_name].append((commit_index, to_add_files[to_add_info]))
#                                 del to_add_files[to_add_info]
#                             else:
#                                 all_java_files[to_add_info].append((commit_index, future_commit_index))
#                             # print(f"{file.old_path} => {file.new_path}")
#                     case ModificationType.COPY:
#                         # I don't think anyone is using copy in git?
#                         raise Exception("Do not support copy")
#                     case ModificationType.DELETE:
#                         # if it's deleted, add it to the deleted_files
#                         to_add_info = (commit_file.filename, branch)
#                         if to_add_info in to_add_files:
#                             all_java_files[commit_file.filename].append((commit_index, to_add_files[to_add_info]))
#                             del to_add_files[to_add_info]
#                         else:
#                             all_java_files[commit_file.filename].append((commit_index, future_commit_index))
#                     case _:
#                         # modify, copy, unmerged, unknown
#                         # we don't care in this case
#                         pass
#     
#     for (added_file, branch), add_commit_index in to_add_files.items():
#         all_java_files[added_file].append((start_commit_index, add_commit_index))

In [2]:
repo_path = test_modules_path / "dubboj"
full_path = str(repo_path.absolute())
newest_commit = list(pydriller.Repository(full_path).traverse_commits())[-1]
result = defaultdict(deque)
to_add = {}

traverse(full_path, result, to_add)

OSError: [Errno 24] Too many open files

In [None]:

# 
# deleted_files: Dict[str, int] = {}
# all_java_files: Dict[str, list[(int, int)]] = defaultdict(list)
# full_path = str(repo_path.absolute())
# merge_commits = {}
# repo = pydriller.Repository(full_path, only_in_branch="3.2")
# index = 0
# 
# 
# 
# for index, commit in enumerate(repo.traverse_commits()):
#     if commit.merge:
#         merge_commits[index] = commit
# 
#     for file in commit.modified_files:
#         if file.filename.endswith(".java"):
#             # if it's a java file
#             match file.change_type:
#                 case ModificationType.ADD | ModificationType.RENAME:
#                     if file.change_type == ModificationType.RENAME:
#                         old_file_name = pathlib.Path(file.old_path).name
#                         new_file_name = pathlib.Path(file.new_path).name
# 
#                         if old_file_name == new_file_name:
#                             # if the file is renamed but the name is the same, it's just a change of directory
#                             continue
# 
#                     if file.filename in all_java_files:
#                         # seen before
#                         if all_java_files[file.filename] == index:
#                             # in the same commit, which shouldn't happen
#                             raise Exception("what")
# 
#                         # if it's not in the same commit, check if it's deleted before
#                         if file.filename in deleted_files:
#                             # and has been deleted before, add a new record to it
# 
#                             # add a new entry, with the deleted index, and newly added index
#                             all_java_files[file.filename].append((deleted_files[file.filename], index))
# 
#                             # remove the deleted index
#                             del deleted_files[file.filename]
#                     else:
#                         # not seen before
#                         # -1 as deleted index means it's a new file and is seen for the first time
#                         all_java_files[file.filename].append((-1, index))
# 
#                     if file.change_type == ModificationType.RENAME:
#                         # mark the old file as deleted
#                         old_file_name = pathlib.Path(file.old_path).name
#                         deleted_files[old_file_name] = index
#                         # print(f"{file.old_path} => {file.new_path}")
#                 case ModificationType.COPY:
#                     # I don't think anyone is using copy in git?
#                     raise Exception("Do not support copy")
#                 case ModificationType.DELETE:
#                     # if it's deleted, add it to the deleted_files
#                     deleted_files[file.filename] = index
#                 case _:
#                     # modify, copy, unmerged, unknown
#                     # we don't care in this case
#                     pass
# 
# for deleted_file in deleted_files:
#     all_java_files[deleted_file].append((deleted_files[deleted_file], float("inf")))


In [None]:

# test_repo = pydriller.Repository("/Users/flicker_soul/Documents/Developer/tmp/sdpcw2")
# for commit in test_repo.traverse_commits():
#     print(list(map(lambda x: (x.filename, x.change_type), commit.modified_files)))
