# Grab Repository Data

In [1]:
from dotenv import load_dotenv
from helper.general import split_by_date, generate_value_in_buckets, truncate_to_same_length, aggregate_by_date, get_repository_paths
from helper.significance import check_normality_of_buckets, use_normality_results_for_significance_independent
import logging
import pandas as pd
import ast
import os
import matplotlib.pyplot as plt

load_dotenv(override=True)

True

In [2]:
# Notebook configuration, read from environment variables.
# STORAGE_DIRECTORIES is parsed with ast.literal_eval; an unset or empty
# value falls back to an empty list.
REPO_PATHS = os.getenv('STORAGE_DIRECTORIES')
if REPO_PATHS:
    repository_directories = ast.literal_eval(REPO_PATHS)
else:
    repository_directories = []

INTRO_DATE = os.getenv('INTRO_DATE')
# int() raises here when BUCKET_SIZE is not present in the environment.
BUCKET_SIZE = int(os.getenv('BUCKET_SIZE'))

# Destination of the aggregated metric; the bucket size is part of the path.
storage_path = f'../results/metric_calculation_{BUCKET_SIZE}/relative_churned_LOC_per_{BUCKET_SIZE}_days.csv'

# Resolve the repositories to analyse (per the original note: only the
# first-level subfolders of the configured directories).
repositories = get_repository_paths(repository_directories)

print(repositories)

['/Users/annemariewittig/Master/repositoryanalysis/HRE/reflex-cep']


# Generate File Level Code Churn per Repository

## Amount of changes per file per commit

In [3]:
import pandas as pd
import os
import ijson
import datetime

import numpy as np

def calculate_relative_churned_LOC(dataframe):
    """Add the M1 relative churned-LOC columns to ``dataframe`` in place.

    Two columns are written, each a ratio against ``line_count``:

    * ``M1_relative_churned_LOC_manually_parsed``:
      ``(calculated_loc_changed + calculated_loc_added) / line_count``
    * ``M1_relative_churned_LOC_auto``: ``loc_added / line_count``

    Rows whose ``line_count`` equals 0 receive ``None`` instead of a ratio.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    total_lines = dataframe["line_count"]
    has_lines = total_lines != 0

    churned_manual = dataframe["calculated_loc_changed"] + dataframe["calculated_loc_added"]
    dataframe['M1_relative_churned_LOC_manually_parsed'] = np.where(
        has_lines, churned_manual / total_lines, None
    )

    dataframe['M1_relative_churned_LOC_auto'] = np.where(
        has_lines, dataframe["loc_added"] / total_lines, None
    )
    return dataframe

def calculate_relative_deleted_LOC(dataframe):
    """Add the M2 relative deleted-LOC columns to ``dataframe`` in place.

    Each output column is the matching "removed" column divided by
    ``line_count``:

    * ``M2_relative_deleted_LOC_manually_parsed`` from ``calculated_loc_removed``
    * ``M2_relative_deleted_LOC_auto`` from ``loc_removed``

    Rows whose ``line_count`` equals 0 receive ``None`` instead of a ratio.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    # (output column, source column holding the removed-line count)
    metric_sources = (
        ('M2_relative_deleted_LOC_manually_parsed', "calculated_loc_removed"),
        ('M2_relative_deleted_LOC_auto', "loc_removed"),
    )
    for metric_column, removed_column in metric_sources:
        nonzero_lines = dataframe["line_count"] != 0
        ratio = dataframe[removed_column] / dataframe["line_count"]
        dataframe[metric_column] = np.where(nonzero_lines, ratio, None)
    return dataframe

def calculate_relative_churned_vs_deleted_LOC(dataframe):
    """Add the M7 churned-vs-deleted LOC ratio columns to ``dataframe`` in place.

    * ``M7_relative_churned_deleted_LOC_manually_parsed``:
      ``(calculated_loc_changed + calculated_loc_added) / calculated_loc_removed``;
      ``None`` where ``calculated_loc_removed`` is 0.
    * ``M7_relative_churned_deleted_LOC_auto``: ``loc_added / loc_removed``;
      ``None`` where ``loc_removed`` is 0.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    dataframe['M7_relative_churned_deleted_LOC_manually_parsed'] = np.where(
        dataframe["calculated_loc_removed"] != 0,
        (dataframe["calculated_loc_changed"] + dataframe["calculated_loc_added"]) / dataframe["calculated_loc_removed"],
        None
    )
    # The "auto" variant uses only the auto columns (loc_added / loc_removed),
    # like the other "_auto" metrics; it previously mixed in the manually
    # parsed calculated_loc_added as the numerator.
    dataframe['M7_relative_churned_deleted_LOC_auto'] = np.where(
        dataframe["loc_removed"] != 0,
        dataframe["loc_added"] / dataframe["loc_removed"],
        None
    )
    return dataframe

# For every repository: flatten the per-commit file changes from files.json
# into one row per (commit, file), compute the M1/M2/M7 metrics, and write the
# result next to commits.csv. Repositories with missing inputs or an existing
# output file are skipped.
for repository in repositories:
    commit_file = f"{repository}/commits.csv"
    if not os.path.exists(commit_file):
        print(f"File not found: {commit_file}. Skipping repository.")
        continue

    print(f"Processing repository: {repository}")
    commit_df = pd.read_csv(commit_file)
    commit_df = commit_df[['sha', 'date']]
    print(f"Max commit date: {commit_df['date'].max()}")

    commit_files_file = f"{repository}/files.json"
    # Build the output path directly instead of str.replace('.csv', ...),
    # which would also rewrite a '.csv' occurring inside the repository path.
    output_file = f"{repository}/commits_file_level_changes.csv"

    if os.path.exists(output_file):
        print(f"{output_file} already exists. Skipping.")
        continue

    # Treat a missing files.json like a missing commits.csv: skip this
    # repository instead of aborting the whole run with FileNotFoundError.
    if not os.path.exists(commit_files_file):
        print(f"File not found: {commit_files_file}. Skipping repository.")
        continue

    # Commit date lookup by sha; commits absent from commits.csv get date None.
    sha_to_date = dict(zip(commit_df['sha'], commit_df['date']))
    all_new_rows = []
    counter = 0

    with open(commit_files_file, 'rb') as f:
        # ijson.items(f, 'item') streams each item in the top-level array
        for commit in ijson.items(f, 'item'):
            sha = commit.get("commit_sha")
            files_changed = commit.get("commit_files", [])

            for file in files_changed:
                file_path = file.get("file_path")
                new_row = {
                    'sha': sha,
                    'date': sha_to_date.get(sha, None),
                    # Text after the last '.' in the path; None when no path.
                    # NOTE(review): a path without any '.' yields the whole
                    # path as file_type - confirm this is intended.
                    'file_type': file_path.split('.')[-1] if file_path else None,
                    'loc_added': file.get("loc_added"),
                    'loc_removed': file.get("loc_removed"),
                    'calculated_loc_added': file.get("calculated_loc_added"),
                    'calculated_loc_removed': file.get("calculated_loc_removed"),
                    'calculated_loc_changed': file.get("calculated_loc_changed"),
                    'line_count': file.get("line_count")
                }
                all_new_rows.append(new_row)

            counter += 1
            if counter % 10000 == 0:
                print(f"Transformed {counter} commits")

    # Timestamp separator added so this line matches the "Saved results" line.
    print(f"{datetime.datetime.now()}: Finished transforming {counter} commits")

    if all_new_rows:
        new_dataframe = pd.DataFrame(all_new_rows)
        # Combine original commit_df with file-level changes.
        # NOTE(review): commit_df only has 'sha' and 'date', so its rows have
        # no file-level values and are removed again by the dropna below;
        # kept as-is so the written CSV does not change - confirm whether
        # this concat is still needed.
        full_df = pd.concat([commit_df, new_dataframe], ignore_index=True)
        # Drop empty files and rows with any missing metric input.
        full_df = full_df[full_df['line_count'] != 0]
        full_df = full_df.dropna(subset=['file_type', 'loc_added', 'loc_removed', 'calculated_loc_added', 'calculated_loc_removed', 'calculated_loc_changed', 'line_count'])

        # Calculate metrics
        full_df = calculate_relative_churned_LOC(full_df)
        full_df = calculate_relative_deleted_LOC(full_df)
        full_df = calculate_relative_churned_vs_deleted_LOC(full_df)

        # Write to CSV
        full_df.to_csv(output_file, index=False)
        print(f"{datetime.datetime.now()}: Saved results to {output_file}")
    else:
        print(f"No data to append for {repository}")


Processing repository: /Users/annemariewittig/Master/repositoryanalysis/HRE/reflex-cep
Max commit date: 2025-05-15 09:34:20+02:00
Transformed 10000 commits
2025-07-08 13:50:37.592152Finished transforming 15326 commits
2025-07-08 13:50:38.346389: Saved results to /Users/annemariewittig/Master/repositoryanalysis/HRE/reflex-cep/commits_file_level_changes.csv
