# Imports

In [1]:
from dotenv import load_dotenv
import logging

# Read settings from the ".env" file in the working directory. override=True is
# requested so values from the file take precedence over variables that are
# already set in the process environment (per python-dotenv's documented option).
load_dotenv(dotenv_path=".env", verbose=True, override=True)
# Configure the root logger at DEBUG level for every cell that follows.
logging.basicConfig(level=logging.DEBUG)

In [2]:
import os
import json
import random
import re
from collections import namedtuple
from typing import Any
import jsonpickle
import statistics

from autocommit_evaluation.core.enums import EnvironmentKey
from autocommit_evaluation.cmg.evaluators import CommitMessageGenerator
from autocommit_evaluation.cmg import evaluator
from autocommit_evaluation.core import (
    main_few_shot_high_level_context_cmg_chain,
    main_zero_shot_low_level_context_cmg_chain,
    main_few_shot_low_level_context_cmg_chain,
    main_zero_shot_high_level_context_cmg_chain,
    main_high_level_context_chain
)
from autocommit.core.models import CommitDataModel
from autocommit_evaluation.datapreparation import context_generator, example_generator

# Initialization

In [3]:
# Directory prefixes shared by the path constants below.
_EVALUATION_DATA_DIR = os.path.join("autocommit_evaluation", "data")
_CMG_DATA_DIR = os.path.join(_EVALUATION_DATA_DIR, "cmg")
_RESULT_DATA_DIR = os.path.join(_EVALUATION_DATA_DIR, "result")
_OUT_RESULT_DIR = os.path.join("out", "result")

# Input commit collections.
COMMIT_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.json")
EVALUATION_COMMIT_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.evaluation.json")
TEST_COMMIT_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.test.json")
EXAMPLE_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.example.json")

# Evaluation results and the scores collected for them.
RESULT_DATA_JSON_FILE_PATH = os.path.join(_RESULT_DATA_DIR, "evaluation.json")
SCORE_DATA_JSON_FILE_PATH = os.path.join(_RESULT_DATA_DIR, "score.json")

CONTEXT_DATA_PATH = os.path.join(_EVALUATION_DATA_DIR, "context")

# Defaults used when the matching environment variable is not set.
# NOTE: the context-generation default resolves to the same directory as
# CONTEXT_DATA_PATH.
DEFAULT_CONTEXT_GENERATION_OUTPUT_PATH = os.path.join(_EVALUATION_DATA_DIR, "context")
DEFAULT_HIGH_LEVEL_CONTEXT_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "highlevelcontext")
DEFAULT_CMG_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "cmg")
DEFAULT_DIFF_CLASSIFICATION_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "diffclassification")
DEFAULT_EXAMPLE_GENERATION_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "example")
DEFAULT_CLEANING_RESULT_OUTPUT_PATH = os.path.join(_RESULT_DATA_DIR, "evaluation.cleaned.json")
DEFAULT_SCORE_SUMMARY_OUTPUT_PATH = os.path.join(_RESULT_DATA_DIR, "score.summary.json")

# NOTE(review): neither chain list below is referenced by any cell visible in
# this notebook section; confirm they are still needed.
DIFF_CLASSIFIER_CHAINS = list((
    main_zero_shot_low_level_context_cmg_chain,
    main_zero_shot_high_level_context_cmg_chain,
))

HIGH_LEVEL_CONTEXT_CHAINS = list((
    main_high_level_context_chain,
))

# One CommitMessageGenerator per (display name, chain) pair, in evaluation order.
GENERATORS = [
    CommitMessageGenerator(generator_name, generator_chain)
    for generator_name, generator_chain in (
        ("Main Few-Shot Low-Level Context Generator", main_few_shot_low_level_context_cmg_chain),
        ("Main Zero-Shot High-Level Context Generator", main_zero_shot_high_level_context_cmg_chain),
        ("Main Few-Shot High-Level Context Generator", main_few_shot_high_level_context_cmg_chain),
    )
]

In [4]:
# Every output location below is taken from the environment variable named by
# the matching EnvironmentKey member, falling back to the DEFAULT_* constant
# when that variable is not set.
CONTEXT_GENERATION_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.CONTEXT_GENERATION_OUTPUT_PATH.value,
    DEFAULT_CONTEXT_GENERATION_OUTPUT_PATH,
)

HIGH_LEVEL_CONTEXT_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.HIGH_LEVEL_CONTEXT_OUTPUT_PATH.value,
    DEFAULT_HIGH_LEVEL_CONTEXT_OUTPUT_PATH,
)

CMG_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.CMG_OUTPUT_PATH.value,
    DEFAULT_CMG_OUTPUT_PATH,
)

DIFF_CLASSIFICATION_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.DIFF_CLASSIFICATION_OUTPUT_PATH.value,
    DEFAULT_DIFF_CLASSIFICATION_OUTPUT_PATH,
)

EXAMPLE_GENERATION_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.EXAMPLE_GENERATION_OUTPUT_PATH.value,
    DEFAULT_EXAMPLE_GENERATION_OUTPUT_PATH,
)

In [5]:
def get_commits(path: str) -> list[CommitDataModel]:
    """Read a UTF-8 JSON file and parse it with ``CommitDataModel.from_json``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``CommitDataModel.from_json`` produces for the file's text
        (annotated as a list of ``CommitDataModel``).
    """
    with open(path, mode="r", encoding="utf-8") as commit_file:
        raw_json = commit_file.read()

    # Parsing happens after the file handle has been released.
    return CommitDataModel.from_json(raw_json)

# Load the four commit collections in a fixed order; map() is consumed lazily
# by the unpacking, so files are read one after another, left to right.
COMMITS, EVALUATION_COMMITS, TEST_COMMITS, EXAMPLE_COMMITS = map(
    get_commits,
    (
        COMMIT_DATA_JSON_FILE_PATH,
        EVALUATION_COMMIT_DATA_JSON_FILE_PATH,
        TEST_COMMIT_DATA_JSON_FILE_PATH,
        EXAMPLE_DATA_JSON_FILE_PATH,
    ),
)

# Generate Context

In [6]:
# all_commits = COMMITS + EVALUATION_COMMITS + TEST_COMMITS + EXAMPLE_COMMITS
# repo_name_filters = ["camel", "kafka"]

# context_generator.generate_context(all_commits, CONTEXT_GENERATION_OUTPUT_PATH, None)

# Generate Examples

In [7]:
# example_generator.generate_examples(EXAMPLE_COMMITS, EXAMPLE_GENERATION_OUTPUT_PATH)

# Generate Commit Message

In [8]:
# evaluator.evaluate(GENERATORS, COMMITS, CONTEXT_DATA_PATH, CMG_OUTPUT_PATH)

# CMG Cleaning

In [9]:
# def calculate_commit_subject_length(commit_message: str):
#     return len(commit_message.split("\n")[0])

# data = None

# with open(RESULT_DATA_JSON_FILE_PATH, "r", encoding="utf-8") as file:
#     json_string = file.read()
#     data = json.loads(json_string)

# random_state = random.getstate()

# for commit in data:

#     commit["generation_results"] = [
#         result for result in commit["generation_results"] if result["generator_id"] != "Main Zero-Shot Low-Level Context Generator"
#     ]
    
#     for result in commit["generation_results"]:    
#         commit_message = result.get("cleaned_commit_message") or result["commit_message"]
#         result["commit_subject_length"] = calculate_commit_subject_length(commit_message)

#     seed_value = int(commit["evaluation_id"][2:]) + 42
#     random.seed(seed_value)
#     random.shuffle(commit["generation_results"])
    
# random.setstate(random_state)

# with open(DEFAULT_CLEANING_RESULT_OUTPUT_PATH, "w", encoding="utf-8") as file:
#     json.dump(data, file)

In [10]:
# Form Result Processing

class CommitMessageScore:
    """Four integer criterion scores given to one commit message.

    Attributes (all ``int``, initialised to 0):
        rationality_score, comprehensiveness_score,
        conciseness_score, correctness_score
    """

    def __init__(self):
        # Chained assignment binds targets left to right, so attribute order
        # is rationality, comprehensiveness, conciseness, correctness.
        self.rationality_score = self.comprehensiveness_score = 0
        self.conciseness_score = self.correctness_score = 0

class GeneratorScore:
    """All scores collected for one generator.

    Attributes:
        generator_id (str): Identifier of the generator; starts empty.
        scores (list[CommitMessageScore]): Collected scores; starts as a
            fresh empty list per instance.
    """

    def __init__(self):
        self.generator_id, self.scores = "", []

class TestCaseScore:
    """Per-generator scores belonging to one evaluation test case.

    Attributes:
        evaluation_id (str): Identifier of the test case; starts empty.
        scores (list[GeneratorScore]): One entry per generator; starts as a
            fresh empty list per instance.
    """

    def __init__(self):
        self.evaluation_id, self.scores = "", []

class ScoreSummary:
    """Aggregated criterion scores for one generator.

    Attributes:
        generator_id (str): Identifier of the generator; starts empty.
        rationality_score, comprehensiveness_score, conciseness_score,
        correctness_score (float): Aggregates, each initialised to 0.
    """

    def __init__(self):
        # Attribute creation order is kept stable because the instance
        # dictionary order is what a serializer will see.
        self.generator_id = ""
        self.rationality_score = self.comprehensiveness_score = 0
        self.conciseness_score = self.correctness_score = 0

def json_to_object(name: str, data: Any) -> Any:
    """Recursively turn decoded JSON into attribute-accessible objects.

    Args:
        name: Class name given to the object built for ``data`` when it is a
            dict; for a list it is passed on unchanged to every element.
        data: A dict, list, or any other (scalar) value.

    Returns:
        * dict  -> an instance of a freshly created type called ``name``; each
          key becomes a class attribute whose value is converted recursively
          (nested dicts are named after their key).
        * list  -> a new list with every element converted.
        * other -> ``data`` itself, unchanged.
    """
    if isinstance(data, list):
        return list(map(lambda element: json_to_object(name, element), data))

    if not isinstance(data, dict):
        return data

    namespace = {}
    for key, value in data.items():
        namespace[key] = json_to_object(key, value)

    dynamic_class = type(name, (object,), namespace)
    return dynamic_class()

def is_rationality_score_valid(
        commit_message_score: CommitMessageScore, commit_message: str) -> bool:
    """Check a rationality score against the ticket IDs present in the message.

    A "ticket ID" is any match of ``\\b[A-Z]+-\\d+\\b`` (e.g. ``ABC-123``).

    Args:
        commit_message_score: Object exposing ``rationality_score``.
        commit_message: Text of the commit message that was scored.

    Returns:
        * score 3 -> valid only when the message contains no ticket ID.
        * score 4 -> valid only when the message contains a ticket ID.
        * any other score -> always valid (the message is not inspected).
    """
    rating = commit_message_score.rationality_score

    # Only ratings 3 and 4 are tied to the presence of a ticket ID.
    if rating not in (3, 4):
        return True

    mentions_ticket = re.search(r'\b[A-Z]+-\d+\b', commit_message) is not None
    return mentions_ticket if rating == 4 else not mentions_ticket

def is_conciseness_score_valid(
        commit_message_score: CommitMessageScore, commit_subject_length: int) -> bool:
    """Check a conciseness score against the length of the commit subject.

    Args:
        commit_message_score: Object exposing ``conciseness_score``.
        commit_subject_length: Length of the commit subject line.

    Returns:
        False when the subject is longer than 100 and the score is anything
        other than 1; True otherwise.
    """
    max_subject_length = 100

    # A score of 1 is accepted for any length; every other score requires
    # the subject to stay within the limit.
    return (commit_message_score.conciseness_score == 1
            or not commit_subject_length > max_subject_length)

def is_correctness_score_valid(
        commit_message_score: CommitMessageScore, 
        commit_message: str,
        jira_url: str) -> bool:
    """Check a correctness score of 4 against the ticket IDs in the message.

    The expected ticket ID is the text after the last ``/`` of ``jira_url``.
    Ticket IDs in the message are all matches of ``\\b[A-Z]+-\\d+\\b``;
    repeated mentions of the same ID are counted separately.

    Args:
        commit_message_score: Object exposing ``correctness_score``.
        commit_message: Text of the commit message that was scored.
        jira_url: URL whose last path segment is the expected ticket ID.

    Returns:
        For a score of 4: True when the message mentions no ticket ID, or
        exactly one that equals the expected ID; False otherwise.
        For any other score: always True.
    """
    expected_ticket_id = jira_url.rsplit("/", 1)[-1]
    mentioned_ticket_ids = re.findall(r'\b[A-Z]+-\d+\b', commit_message)

    if commit_message_score.correctness_score != 4:
        return True

    if not mentioned_ticket_ids:
        return True

    return (len(mentioned_ticket_ids) == 1
            and mentioned_ticket_ids[0] == expected_ticket_id)

def clean_scores(test_case_scores: list[TestCaseScore]) -> list[TestCaseScore]:
    """Drop individual responses whose scores contradict the commit message.

    The generated messages are loaded from ``DEFAULT_CLEANING_RESULT_OUTPUT_PATH``.
    For every test case, each response (identified by its position in a
    generator's ``scores`` list) is checked with the rationality, conciseness
    and correctness validators. A position that fails for *any* generator is
    removed from *every* generator of that test case.

    Test cases whose ``evaluation_id`` is not present in the loaded file are
    left out of the result. A generator with no matching generation result is
    not validated but is still filtered by positions flagged elsewhere.

    Args:
        test_case_scores: Scores to clean; the input objects are not modified.

    Returns:
        New ``TestCaseScore`` objects containing only the responses that
        passed validation.
    """
    print("Cleaning scores...")

    with open(DEFAULT_CLEANING_RESULT_OUTPUT_PATH, "r", encoding="utf-8") as file:
        data: list[Any] = json.load(file)

    # Index commits once instead of scanning the list for every test case.
    # setdefault keeps the first entry per id, like a front-to-back scan would.
    commits_by_evaluation_id: dict[str, Any] = {}
    for commit in data:
        commits_by_evaluation_id.setdefault(commit["evaluation_id"], commit)

    cleaned_test_case_scores: list[TestCaseScore] = []
    total_individual_responses = 0
    total_invalid_individual_responses = 0

    for test_case_score in test_case_scores:
        commit = commits_by_evaluation_id.get(test_case_score.evaluation_id)

        if commit is None:
            continue

        invalid_indexes = set()

        # The first generator's score count is taken as the number of
        # responses; a test case without generators contributes none.
        if test_case_score.scores:
            total_individual_responses += len(test_case_score.scores[0].scores)

        for generator_score in test_case_score.scores:
            if not generator_score.scores:
                continue

            # The generation result depends only on the generator, so it is
            # looked up once rather than once per response.
            generation_result = next(
                (result for result in commit["generation_results"]
                 if result["generator_id"] == generator_score.generator_id),
                None)

            if generation_result is None:
                continue

            message_text = (generation_result.get("cleaned_commit_message")
                            or generation_result["commit_message"])
            subject_length = generation_result["commit_subject_length"]

            for idx, commit_message_score in enumerate(generator_score.scores):
                # Comprehensiveness has no automatic check and is always
                # treated as valid.
                checks = (
                    is_rationality_score_valid(commit_message_score, message_text),
                    is_conciseness_score_valid(commit_message_score, subject_length),
                    is_correctness_score_valid(
                        commit_message_score, message_text, commit["jira_url"]),
                )

                if not all(checks):
                    invalid_indexes.add(idx)

        valid_test_case_score = TestCaseScore()
        valid_test_case_score.evaluation_id = test_case_score.evaluation_id
        valid_test_case_score.scores = []

        for generator_score in test_case_score.scores:
            valid_generator_score = GeneratorScore()
            valid_generator_score.generator_id = generator_score.generator_id
            valid_generator_score.scores = [
                commit_message_score
                for idx, commit_message_score in enumerate(generator_score.scores)
                if idx not in invalid_indexes
            ]
            valid_test_case_score.scores.append(valid_generator_score)

        cleaned_test_case_scores.append(valid_test_case_score)
        total_invalid_individual_responses += len(invalid_indexes)

    # Guard the ratio: with no matched test cases there is nothing to divide by.
    invalid_percentage = (
        total_invalid_individual_responses / total_individual_responses * 100
        if total_individual_responses else 0.0
    )

    print(f"Total invalid individual responses: {total_invalid_individual_responses}")
    print(f"Percentage of invalid individual responses: {invalid_percentage:.2f}%")

    print("Finished cleaning scores.\n")
    return cleaned_test_case_scores

def get_outlier_indexes(samples: list[int]) -> set[int]:
    """Return the positions of values lying far from the median.

    The spread is measured as ``1.483 * MAD``, where MAD is the median of the
    absolute deviations from the median. A value is flagged when it is more
    than three such units below or above the median.

    Note: when MAD is 0 (e.g. more than half of the samples are identical)
    both bounds equal the median, so every value different from the median is
    flagged.

    Args:
        samples: Non-empty list of numeric samples.

    Returns:
        Set of indexes into ``samples`` that fall outside the bounds.
    """
    center = statistics.median(samples)
    absolute_deviations = [abs(value - center) for value in samples]
    made = 1.483 * statistics.median(absolute_deviations)

    lower_bound = center - 3 * made
    upper_bound = center + 3 * made

    flagged_positions = set()
    for position, value in enumerate(samples):
        if value < lower_bound or value > upper_bound:
            flagged_positions.add(position)

    return flagged_positions

def remove_outliers(test_case_scores: list[TestCaseScore]) -> list[TestCaseScore]:
    """Drop individual responses that are outliers on any criterion.

    For every generator with at least four responses, each of the four
    criteria (rationality, comprehensiveness, conciseness, correctness) is
    passed to ``get_outlier_indexes``. A response position flagged for *any*
    criterion of *any* generator is removed from *every* generator of that
    test case. Generators with fewer than four responses are not analysed but
    are still filtered by positions flagged elsewhere.

    Args:
        test_case_scores: Scores to filter; the input objects are not modified.

    Returns:
        New ``TestCaseScore`` objects without the flagged responses.
    """
    print("Removing outliers...")

    # Outlier detection is skipped below this many responses.
    min_samples_for_outlier_check = 4

    cleaned_test_case_scores: list[TestCaseScore] = []
    total_individual_responses = 0
    total_outlier_individual_responses = 0

    for test_case_score in test_case_scores:
        outlier_indexes = set()

        # The first generator's score count is taken as the number of
        # responses; a test case without generators contributes none.
        if test_case_score.scores:
            total_individual_responses += len(test_case_score.scores[0].scores)

        for generator_score in test_case_score.scores:
            if len(generator_score.scores) < min_samples_for_outlier_check:
                continue

            # One sample list per criterion, in a fixed order.
            samples_collection = [
                [score.rationality_score for score in generator_score.scores],
                [score.comprehensiveness_score for score in generator_score.scores],
                [score.conciseness_score for score in generator_score.scores],
                [score.correctness_score for score in generator_score.scores],
            ]

            print(f"Generator ID: {generator_score.generator_id}")
            print(f"Evaluation ID: {test_case_score.evaluation_id}")
            print(f"Samples: {samples_collection}")
            for samples in samples_collection:
                new_outlier_indexes = get_outlier_indexes(samples)
                print(f"Outlier indexes: {new_outlier_indexes}")
                outlier_indexes |= new_outlier_indexes

        valid_test_case_score = TestCaseScore()
        valid_test_case_score.evaluation_id = test_case_score.evaluation_id
        valid_test_case_score.scores = []

        for generator_score in test_case_score.scores:
            valid_generator_score = GeneratorScore()
            valid_generator_score.generator_id = generator_score.generator_id
            valid_generator_score.scores = [
                commit_message_score
                for idx, commit_message_score in enumerate(generator_score.scores)
                if idx not in outlier_indexes
            ]
            valid_test_case_score.scores.append(valid_generator_score)

        cleaned_test_case_scores.append(valid_test_case_score)
        total_outlier_individual_responses += len(outlier_indexes)

    # Guard the ratio: an empty input has nothing to divide by.
    outlier_percentage = (
        total_outlier_individual_responses / total_individual_responses * 100
        if total_individual_responses else 0.0
    )

    print(f"Total outlier individual responses: {total_outlier_individual_responses}")
    print(f"Percentage of outlier individual responses: {outlier_percentage:.2f}%")

    # Previously this reported "Finished cleaning scores.", copied from clean_scores.
    print("Finished removing outliers.\n")
    return cleaned_test_case_scores


# Load the raw scores, filter them, then average each criterion per generator
# and write the summary to DEFAULT_SCORE_SUMMARY_OUTPUT_PATH.
with open(SCORE_DATA_JSON_FILE_PATH, "r", encoding="utf-8") as file:
    score_data = json.load(file)

test_case_scores: list[TestCaseScore] = json_to_object("TestCaseScore", score_data)
test_case_scores = clean_scores(test_case_scores)
test_case_scores = remove_outliers(test_case_scores)

# Summaries are kept in first-seen generator order.
score_summaries: list[ScoreSummary] = []
# Number of responses accumulated per generator id, used as the divisor below.
score_counts = {}

for test_case_score in test_case_scores:
    for generator_score in test_case_score.scores:
        score_summary = next(
            (summary for summary in score_summaries
             if summary.generator_id == generator_score.generator_id),
            None)

        if score_summary is None:
            score_summary = ScoreSummary()
            score_summary.generator_id = generator_score.generator_id
            score_summaries.append(score_summary)
            score_counts[score_summary.generator_id] = 0

        score_counts[score_summary.generator_id] += len(generator_score.scores)

        # At this point the summary fields hold running sums.
        for commit_message_score in generator_score.scores:
            score_summary.rationality_score += commit_message_score.rationality_score
            score_summary.comprehensiveness_score += commit_message_score.comprehensiveness_score
            score_summary.conciseness_score += commit_message_score.conciseness_score
            score_summary.correctness_score += commit_message_score.correctness_score

for score_summary in score_summaries:
    score_count = score_counts[score_summary.generator_id]

    # Filtering can remove every response of a generator; avoid dividing by
    # zero and leave its (zero) sums untouched in that case.
    if score_count == 0:
        print(f"No scores left for generator '{score_summary.generator_id}'; averages left at 0.")
        continue

    # Turn the running sums into averages.
    score_summary.rationality_score /= score_count
    score_summary.comprehensiveness_score /= score_count
    score_summary.conciseness_score /= score_count
    score_summary.correctness_score /= score_count

json_string = jsonpickle.encode(score_summaries, unpicklable=False, indent=4)

# Explicit encoding, consistent with every other file access in this notebook.
with open(DEFAULT_SCORE_SUMMARY_OUTPUT_PATH, "w", encoding="utf-8") as file:
    file.write(json_string)

Cleaning scores...
Total invalid individual responses: 159
Percentage of invalid individual responses: 57.19%
Finished cleaning scores.

Removing outliers...
Generator ID: Main Few-Shot Low-Level Context Generator
Evaluation ID: TC001
Samples: [[3, 3, 1, 3], [4, 4, 3, 3], [4, 3, 4, 2], [4, 4, 4, 3]]
Outlier indexes: {2}
Outlier indexes: set()
Outlier indexes: set()
Outlier indexes: {3}
Generator ID: Main Zero-Shot High-Level Context Generator
Evaluation ID: TC001
Samples: [[4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4], [4, 4, 4, 4]]
Outlier indexes: set()
Outlier indexes: set()
Outlier indexes: set()
Outlier indexes: set()
Generator ID: Main Few-Shot High-Level Context Generator
Evaluation ID: TC001
Samples: [[4, 4, 1, 4], [4, 4, 3, 3], [4, 4, 4, 4], [4, 2, 4, 4]]
Outlier indexes: {2}
Outlier indexes: set()
Outlier indexes: set()
Outlier indexes: {1}
Generator ID: Main Few-Shot Low-Level Context Generator
Evaluation ID: TC007
Samples: [[3, 2, 3, 1], [2, 4, 3, 3], [4, 3, 4, 4], [2, 3, 4, 4]]