In [1]:
import pandas as pd

In [2]:
# Raw commit log read by the parsing cell: "<[...]>" header lines, each
# followed by that commit's tab-separated per-file lines.
INPUT_FILE = "data/commits.txt"
# Output: one ";;"-separated row of totals per retained commit.
COMMITS_CSV_FILE = "data/commits.csv"
# Output: additions/deletions aggregated per file, written with DataFrame.to_csv.
FILES_CSV_FILE = "data/files.csv"

In [3]:
class FileInfo:
    """Per-file change statistics parsed from one tab-separated line.

    The line must hold exactly three tab-separated fields: additions,
    deletions and the file path. (This looks like ``git log --numstat``
    output -- TODO confirm against whatever produces the input file.)

    Attributes:
        file_path: The path field exactly as it appears in the line.
        file_name: The last ``/``-separated component of ``file_path``.
        extension: Text after the last ``.`` in ``file_name``, or ``None``
            when the name contains no dot.
        additions: Added-line count, or ``None`` when the field is not an
            integer.
        deletions: Deleted-line count, or ``None`` when the field is not an
            integer.
    """

    def __init__(self, line):
        """Parse ``line`` into path, name, extension and line counts.

        Args:
            line: A string of the form ``"<additions>\\t<deletions>\\t<path>"``.

        Raises:
            ValueError: If ``line`` does not split into exactly three
                tab-separated fields.
        """
        additions, deletions, self.file_path = line.split("\t")
        self.file_name = self.file_path.split("/")[-1]
        file_name_parts = self.file_name.split(".")
        self.extension = file_name_parts[-1] if len(file_name_parts) > 1 else None
        self.additions = self._parse_count(additions)
        self.deletions = self._parse_count(deletions)

    @staticmethod
    def _parse_count(raw):
        """Return ``raw`` as an int, or ``None`` when it is not an integer literal."""
        try:
            return int(raw)
        except ValueError:
            # Only a non-numeric field is expected here; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            return None

    def to_tuple(self):
        """Return ``(file_path, file_name, extension, additions, deletions)``."""
        return (self.file_path, self.file_name, self.extension, self.additions, self.deletions)

In [4]:
class CommitInfo:
    """One commit: its header fields plus the statistics of the files it touched.

    Attributes:
        hash, author, date, message: The four ``;;``-separated header fields.
        file_infos: One ``FileInfo`` per non-blank line of ``files_info``.
    """

    _FIELD_SEPARATOR = ";;"
    _HEADER_PREFIX = "<["
    _HEADER_SUFFIX = "]>"

    def __init__(self, commit_raw_info, files_info):
        """Parse a header line and the per-file lines that follow it.

        Args:
            commit_raw_info: Header of the form
                ``<[hash;;author;;date;;message]>``; surrounding whitespace
                is ignored.
            files_info: Iterable of per-file lines; lines that are blank
                after stripping are skipped.

        Raises:
            ValueError: If the header holds fewer than four fields.
        """
        header = commit_raw_info.strip()
        # Remove only the enclosing markers; a "<[" or "]>" inside the
        # message is part of the message and must survive.
        if header.startswith(self._HEADER_PREFIX):
            header = header[len(self._HEADER_PREFIX):]
        if header.endswith(self._HEADER_SUFFIX):
            header = header[:-len(self._HEADER_SUFFIX)]
        # maxsplit=3: the message is the last field and may itself contain
        # the separator, so everything after the third ";;" belongs to it.
        self.hash, self.author, self.date, self.message = header.split(self._FIELD_SEPARATOR, 3)
        stripped_lines = (line.strip() for line in files_info)
        self.file_infos = [FileInfo(line) for line in stripped_lines if line]

    def _total(self, field, extension=None):
        """Sum ``field`` over the files, skipping ``None`` counts.

        When ``extension`` is given, only files with exactly that extension
        are counted; ``None`` means every file.
        """
        return sum(
            getattr(file_info, field)
            for file_info in self.file_infos
            if (extension is None or file_info.extension == extension)
            and getattr(file_info, field) is not None
        )

    def to_general_info(self):
        """Return the commit as one ``;;``-separated row.

        Fields, in order: hash, author, date, message, then total / ``kt`` /
        ``py`` additions followed by total / ``kt`` / ``py`` deletions.
        The message is emitted verbatim, so a message containing ``;;``
        yields extra fields in the row.
        """
        return "{hash};;{author};;{date};;{message};;{additions};;{additions_kt};;{additions_py};;{deletions};;{deletions_kt};;{deletions_py}".format(
            hash=self.hash,
            author=self.author,
            date=self.date,
            message=self.message,
            additions=self._total("additions"),
            additions_kt=self._total("additions", "kt"),
            additions_py=self._total("additions", "py"),
            deletions=self._total("deletions"),
            deletions_kt=self._total("deletions", "kt"),
            deletions_py=self._total("deletions", "py"),
        )

In [5]:
# Group the lines of INPUT_FILE into commits: every "<[...]>" header line
# starts a new commit and the lines up to the next header are its file lines.
commit_infos = []
with open(INPUT_FILE, "r") as f:
    info = []  # header line followed by the file lines of the commit being read
    for line in f:
        # Test without the line terminator so that a header on the final
        # line of a file with no trailing newline is still recognised.
        stripped = line.rstrip("\n")
        if stripped.startswith("<[") and stripped.endswith("]>"):
            if info:
                commit_infos.append(CommitInfo(info[0], info[1:]))
            info = [line]
        elif info:
            info.append(line)
        # Otherwise the line precedes the first header and belongs to no
        # commit; it is ignored rather than mistaken for a header.
    if info:
        # Flush the last commit, which has no following header to trigger it.
        commit_infos.append(CommitInfo(info[0], info[1:]))

In [6]:
# Write one ";;"-separated row per commit whose date falls in TARGET_YEAR.
# The year is taken as the second-to-last space-separated token of the date
# (assumes the date ends in "<year> <timezone>" -- TODO confirm).
TARGET_YEAR = "2024"
# Mode "w" truncates any previous file, so no separate truncate-then-append
# step is needed.
with open(COMMITS_CSV_FILE, "w") as f:
    f.write("hash;;author;;date;;message;;additions;;additions_kt;;additions_py;;deletions;;deletions_kt;;deletions_py\n")
    for commit_info in commit_infos:
        if commit_info.date.split(" ")[-2] != TARGET_YEAR:
            continue
        f.write("{0}\n".format(commit_info.to_general_info()))

In [7]:
# Aggregate additions/deletions per file over all commits and save to CSV.
data = [file_info.to_tuple() for commit_info in commit_infos for file_info in commit_info.file_infos]
df = pd.DataFrame.from_records(data, columns=["file_path", "file_name", "extension", "additions", "deletions"])
# dropna=False: files without an extension carry a missing `extension` key,
# and groupby's default (dropna=True) would silently drop them from the result.
agg_df = (
    df.groupby(["file_path", "file_name", "extension"], dropna=False)
    .sum()
    .reset_index()
    .sort_values(by=["additions", "deletions"], ascending=[False, False])
)
# Exclude paths containing "=>" (matched literally) and files with no
# counted change at all.
has_arrow = agg_df["file_path"].str.contains("=>", regex=False)
has_changes = (agg_df["additions"] > 0) | (agg_df["deletions"] > 0)
filtered_df = agg_df[~has_arrow & has_changes].reset_index(drop=True)
filtered_df.to_csv(FILES_CSV_FILE, index=False)