In [1]:
from typing import Dict, Literal, NamedTuple, Set
from collections import defaultdict

import pydriller
import pathlib

# Directory (relative to the notebook's working directory) from which the
# analysed repository path is built further down in this cell.
test_modules_path = pathlib.Path("test_submodules")
# Sibling directory; not referenced by any of the visible cells.
production_modules_path = pathlib.Path("production_submodules")
# Classification labels; they are the top-level keys of the `counters` mapping
# filled in by the classification cell.
InstanceType = Literal["tdd", "same", "non-tdd", "no-match"]


class Instance(NamedTuple):
    """Record pairing a code file with a test file and one commit index each.

    NOTE(review): this type is not referenced by any visible cell; the field
    notes below are inferred from the field names only - confirm before use.
    """

    code_commit_index: int  # presumably the commit index at which the code file was added
    test_commit_index: int  # presumably the commit index at which the test file was added
    code_name: str  # presumably the base name of the code file
    test_name: str  # presumably the base name of the test file


def is_test_file(filename: str) -> bool:
    """Tell whether ``filename`` follows one of the two test naming patterns.

    A name counts as a test when it ends with ``Test.java`` or when it starts
    with ``Test`` (whatever its extension). The check is case-sensitive.
    """
    has_test_suffix = filename.endswith("Test.java")
    has_test_prefix = filename.startswith("Test")
    return has_test_suffix or has_test_prefix


def get_code_file_name(filename: str) -> str:
    """Derive the name of the code file that a test file is expected to cover.

    Only the marker that makes the name a test name is removed: the ``Test``
    directly in front of ``.java`` when the name ends with ``Test.java``,
    otherwise a leading ``Test``. Any other occurrence of ``Test`` inside the
    name is kept, so ``TestableServiceTest.java`` maps to
    ``TestableService.java`` instead of being mangled to ``ableService.java``.

    Args:
        filename: base name of a (test) file.

    Returns:
        The derived code file name. A name without either marker is returned
        unchanged, and ``Test.java`` yields the bare extension ``.java``.
    """
    if filename.endswith("Test.java"):
        # Suffix convention: FooTest.java -> Foo.java
        return filename.removesuffix("Test.java") + ".java"
    # Prefix convention: TestFoo.java -> Foo.java (no-op without the prefix)
    return filename.removeprefix("Test")


def get_test_file_name(filename: str) -> str:
    """Derive the suffix-style test file name for a Java code file.

    ``Test`` is inserted in front of the trailing ``.java`` extension only.
    A ``.java`` substring elsewhere in the name (for example in
    ``my.javadoc.Foo.java``) is left alone.

    Args:
        filename: base name of a code file.

    Returns:
        ``<stem>Test.java`` for a name ending in ``.java``; any other name is
        returned unchanged.
    """
    if not filename.endswith(".java"):
        return filename
    return filename.removesuffix(".java") + "Test.java"


from typing import Tuple
from pydriller import ModificationType

repo_path = test_modules_path / "dubboj"

deleted_files: Dict[str, int] = {}
all_java_files: Dict[str, list[(int, int)]] = defaultdict(list)
full_path = str(repo_path.absolute())
merge_commits = {}
repo = pydriller.Repository(full_path, only_in_branch="3.2")
index = 0

for index, commit in enumerate(repo.traverse_commits()):
    if commit.merge:
        merge_commits[index] = commit

    for file in commit.modified_files:
        if file.filename.endswith(".java"):
            # if it's a java file
            match file.change_type:
                case ModificationType.ADD | ModificationType.RENAME:
                    if file.change_type == ModificationType.RENAME:
                        old_file_name = pathlib.Path(file.old_path).name
                        new_file_name = pathlib.Path(file.new_path).name
                        
                        if old_file_name == new_file_name:
                            # if the file is renamed but the name is the same, it's just a change of directory
                            continue
                        
                    if file.filename in all_java_files:
                        # seen before
                        if all_java_files[file.filename] == index:
                            # in the same commit, which shouldn't happen
                            raise Exception("what")

                        # if it's not in the same commit, check if it's deleted before
                        if file.filename in deleted_files:
                            # and has been deleted before, add a new record to it

                            # add a new entry, with the deleted index, and newly added index
                            all_java_files[file.filename].append((deleted_files[file.filename], index))

                            # remove the deleted index
                            del deleted_files[file.filename]
                    else:
                        # not seen before
                        # -1 as deleted index means it's a new file and is seen for the first time
                        all_java_files[file.filename].append((-1, index))

                    if file.change_type == ModificationType.RENAME:
                        # mark the old file as deleted
                        old_file_name = pathlib.Path(file.old_path).name
                        deleted_files[old_file_name] = index
                        # print(f"{file.old_path} => {file.new_path}")
                case ModificationType.COPY:
                    # I don't think anyone is using copy in git?
                    raise Exception("Do not support copy")
                case ModificationType.DELETE:
                    # if it's deleted, add it to the deleted_files
                    deleted_files[file.filename] = index
                case _:
                    # modify, copy, unmerged, unknown
                    # we don't care in this case
                    pass

for deleted_file in deleted_files:
    all_java_files[deleted_file].append((deleted_files[deleted_file], float("inf")))


In [2]:

# instance type -> test file name -> [(test_add_index or None, code_add_index), ...]
# "no-match" entries always map to an empty list.
counters: Dict[InstanceType, Dict[str, list[Tuple[int | None, int]]]] = defaultdict(lambda: defaultdict(list))
test_files = set()

for file, file_indices in all_java_files.items():
    if not is_test_file(file):
        continue
    test_files.add(file)

    code_file_name = get_code_file_name(file)

    # A bare ".java" means the test name consisted of nothing but the marker,
    # so there is no code file to look for.
    code_file_indices = None if code_file_name == ".java" else all_java_files.get(code_file_name)

    if code_file_indices is None:
        # no code file
        counters["no-match"][file] = []
        continue

    for _, code_file_add_index in code_file_indices:
        if code_file_add_index == float("inf"):
            # trailing (deleted, inf) marker of the code file: not an addition
            break

        # Walk the test file's history up to the moment the code was added.
        # Every entry must be looked at because the history can be
        #   -1 ... test_add ... test_del ... code_add ... test_add
        # where the first entry alone would suggest the test was there first.
        has_test = None
        for test_file_del_index, test_file_add_index in file_indices:
            if test_file_del_index >= code_file_add_index:
                # code_add ... test_del ...: everything from here on happens
                # after the code was added and cannot change the verdict
                break
            if test_file_add_index <= code_file_add_index:
                # test_del ... test_add ... code_add: the test exists before the code
                has_test = test_file_add_index
            else:
                # test_del ... code_add ... test_add: no test at that moment.
                # This also covers test_add == inf, i.e. a test deleted before
                # the code was added and never restored.
                has_test = None

        if has_test is None:
            counters["non-tdd"][file].append((has_test, code_file_add_index))
        elif has_test == code_file_add_index:
            counters["same"][file].append((has_test, code_file_add_index))
        elif has_test < code_file_add_index:
            counters["tdd"][file].append((has_test, code_file_add_index))
        else:
            raise Exception("Impossible")

print("tdd: ", len(counters["tdd"]))
print("same: ", len(counters["same"]))
print("non-tdd: ", len(counters["non-tdd"]))
print("no-match: ", len(counters["no-match"]))
print("test file", len(test_files))
print("all java files", len(all_java_files))


# test_repo = pydriller.Repository("/Users/flicker_soul/Documents/Developer/tmp/sdpcw2")
# for commit in test_repo.traverse_commits():
#     print(list(map(lambda x: (x.filename, x.change_type), commit.modified_files)))


tdd:  35
same:  426
non-tdd:  466
no-match:  213
test file 1089
all java files 4759


In [3]:
# TDD instances per test file, ordered by file name; each pair is
# (test_add_index, code_add_index).
sorted(counters["tdd"].items())
# all_java_files["AbsentConfiguratorTest.java"]

[('AbstractStreamTest.java', [(5201, 5362)]),
 ('AnnotationUtilsTest.java', [(3975, 3977)]),
 ('ClassUtilsTest.java', [(2435, 3381), (2435, 3980)]),
 ('ClientStreamTest.java', [(5201, 5362)]),
 ('ConfigTest.java', [(231, 1530), (231, 5170)]),
 ('ConsumerContextFilterTest.java', [(1, 3965)]),
 ('DubboApplicationContextInitializerTest.java', [(2448, 4570)]),
 ('DubboBootstrapTest.java', [(3519, 3966)]),
 ('GenericProtobufObjectOutputTest.java', [(3400, 3841)]),
 ('GenericProtobufSerializationTest.java', [(3400, 3841)]),
 ('GrpcProtocolTest.java', [(4731, 6288)]),
 ('JSONTest.java', [(229, 5512)]),
 ('LoadBalanceTest.java', [(1, 2386)]),
 ('MetadataTest.java', [(2669, 4577)]),
 ('MethodTest.java', [(2585, 3255)]),
 ('MetricsFilterTest.java', [(3254, 3345)]),
 ('NacosMetadataReportTest.java', [(3418, 4638)]),
 ('NettyServerTest.java', [(4518, 6571)]),
 ('ParametersTest.java', [(1, 216)]),
 ('PerformanceConsumerTest.java', [(1, 1439)]),
 ('PerformanceProviderTest.java', [(1, 1439)]),
 ('Pro

In [4]:
# Test files that have a "tdd" or "same" instance and also a "non-tdd" one.
(counters["tdd"].keys() | counters["same"].keys()) & counters["non-tdd"].keys()

{'AbstractStreamTest.java',
 'AnnotatedBeanDefinitionRegistryUtilsTest.java',
 'AnnotationUtilsTest.java',
 'CacheableFailbackRegistryTest.java',
 'ClassUtilsTest.java',
 'ClientStreamTest.java',
 'CompilerTest.java',
 'DubboBootstrapTest.java',
 'ExecutionListTest.java',
 'GrpcProtocolTest.java',
 'JettyHttpBinderTest.java',
 'NettyServerTest.java',
 'ParseUtilsTest.java',
 'ProtobufUtilsTest.java',
 'ReferenceConfigCacheTest.java',
 'ReferenceConfigTest.java',
 'RegistryStatCompositeTest.java',
 'RestServiceTest.java',
 'RouteRuleTest.java',
 'RouteRuleUtilsTest.java',
 'RouteUtilsTest.java',
 'RpcResultTest.java',
 'ServerStreamTest.java',
 'ServiceConfigTest.java',
 'ServiceDefinitionBuilderTest.java',
 'StatusTest.java',
 'TagRouterTest.java',
 'TestConsumerConfiguration.java',
 'TestTypeBuilder.java',
 'TomcatHttpBinderTest.java',
 'TraceFilterTest.java',
 'UrlUtilsTest.java',
 'ZkClientWrapperTest.java',
 'ZkclientZookeeperClientTest.java',
 'ZkclientZookeeperTransporterTest.jav

In [5]:
# .get() instead of indexing: the inner mapping is a defaultdict, so indexing
# a missing name would silently add an empty entry and skew later counts.
counters["non-tdd"].get('AbstractStreamTest.java', [])


[(None, 4518)]

In [6]:
# .get() instead of indexing: the inner mapping is a defaultdict, so indexing
# a missing name would silently add an empty entry and skew later counts.
counters["same"].get('AbstractStreamTest.java', [])


[]

In [7]:
# .get() instead of indexing: the inner mapping is a defaultdict, so indexing
# a missing name would silently add an empty entry and skew later counts.
counters["tdd"].get('AbstractStreamTest.java', [])

[(5201, 5362)]

In [8]:
# .get() instead of indexing: all_java_files is a defaultdict, so indexing a
# missing name would add an empty history for it.
all_java_files.get("AbstractStreamTest.java", [])

[(-1, 5201), (5362, inf)]

In [9]:
# .get() instead of indexing: all_java_files is a defaultdict, so indexing a
# missing name would add an empty history for it.
all_java_files.get("AbstractStream.java", [])

[(-1, 4518), (5362, 5362)]

In [21]:
# Dump one entry per commit: "<message> => <file>|<change type> ...".
# The encoding is fixed to UTF-8 so that commit messages with non-ASCII
# characters are written the same way on every platform instead of depending
# on the locale's default encoding.
with open("dubbo.txt", "w", encoding="utf-8") as f:
    f.write(
        "\n".join(
            commit.msg + " => " + " ".join(f"{x.filename}|{x.change_type}" for x in commit.modified_files)
            for commit in repo.traverse_commits()
        )
    )

In [15]:
# Merge commit recorded at traversal index 5361; raises KeyError if that index
# was not a merge commit.
# NOTE(review): presumably picked because it directly precedes index 5362,
# which shows up in the AbstractStream histories above - confirm.
mg = merge_commits[5361]

In [18]:
# Load two commits individually by hash; each name is bound to the list of
# commits that the traversal yields for that hash.
# NOTE(review): presumably the two parents of the merge commit `mg` - confirm.
p1, p2 = (
    list(pydriller.Repository(full_path, only_commits=[commit_hash]).traverse_commits())
    for commit_hash in (
        "8686d8d54a0653f4be50ab15c7b372a1cbc00c68",
        "e4c036d287c33a47a1d4f44d66e959e18a5b3ab8",
    )
)

<pydriller.domain.commit.Commit at 0x111ff9910>