# Imports

In [1]:
from dotenv import load_dotenv
import logging

# Load settings from the local ".env" file before anything reads the environment.
# override=True is passed so values from the file take precedence over variables
# that are already set in the process (per python-dotenv's documented behaviour).
load_dotenv(dotenv_path=".env", verbose=True, override=True)
# Root logger at DEBUG level: every library's debug output will appear in cell output.
logging.basicConfig(level=logging.DEBUG)

In [2]:
import os
import json
import random
from collections import namedtuple
from typing import Any
import jsonpickle

from autocommit_evaluation.core.enums import EnvironmentKey
from autocommit_evaluation.cmg.evaluators import CommitMessageGenerator
from autocommit_evaluation.cmg import evaluator
from autocommit_evaluation.core import (
    main_few_shot_high_level_context_cmg_chain,
    main_zero_shot_low_level_context_cmg_chain,
    main_few_shot_low_level_context_cmg_chain,
    main_zero_shot_high_level_context_cmg_chain,
    main_high_level_context_chain
)
from autocommit.core.models import CommitDataModel
from autocommit_evaluation.datapreparation import context_generator, example_generator
from autocommit_evaluation.result.models import TestCaseScore
from autocommit_evaluation.result.processors import (
    RuleBasedCleaner,
    OutlierCleaner,
    ResultSummarizer
)

# Initialization

In [3]:
# Shared directory prefixes; every public constant below is derived from these.
_DATA_DIR = os.path.join("autocommit_evaluation", "data")
_CMG_DATA_DIR = os.path.join(_DATA_DIR, "cmg")
_RESULT_DATA_DIR = os.path.join(_DATA_DIR, "result")
_OUT_RESULT_DIR = os.path.join("out", "result")

# Commit datasets.
COMMIT_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.json")
EVALUATION_COMMIT_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.evaluation.json")
TEST_COMMIT_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.test.json")
EXAMPLE_DATA_JSON_FILE_PATH = os.path.join(_CMG_DATA_DIR, "commits.example.json")

# Evaluation result and score files.
RESULT_DATA_JSON_FILE_PATH = os.path.join(_RESULT_DATA_DIR, "evaluation.json")
SCORE_DATA_JSON_FILE_PATH = os.path.join(_RESULT_DATA_DIR, "score.json")

# Directory holding context data.
CONTEXT_DATA_PATH = os.path.join(_DATA_DIR, "context")

# Fallback locations for the configurable output paths.
DEFAULT_CONTEXT_GENERATION_OUTPUT_PATH = os.path.join(_DATA_DIR, "context")
DEFAULT_HIGH_LEVEL_CONTEXT_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "highlevelcontext")
DEFAULT_CMG_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "cmg")
DEFAULT_DIFF_CLASSIFICATION_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "diffclassification")
DEFAULT_EXAMPLE_GENERATION_OUTPUT_PATH = os.path.join(_OUT_RESULT_DIR, "example")
DEFAULT_CLEANING_RESULT_OUTPUT_PATH = os.path.join(_RESULT_DATA_DIR, "evaluation.cleaned.json")
DEFAULT_SCORE_SUMMARY_OUTPUT_PATH = os.path.join(_RESULT_DATA_DIR, "score.summary.json")

# Chains grouped under the diff-classification label.
DIFF_CLASSIFIER_CHAINS = [main_zero_shot_low_level_context_cmg_chain, main_zero_shot_high_level_context_cmg_chain]

# Chains grouped under the high-level-context label.
HIGH_LEVEL_CONTEXT_CHAINS = [main_high_level_context_chain]

# One CommitMessageGenerator per (display name, chain) pair, in this order.
GENERATORS = [
    CommitMessageGenerator(generator_name, generator_chain)
    for generator_name, generator_chain in (
        ("Main Few-Shot Low-Level Context Generator", main_few_shot_low_level_context_cmg_chain),
        ("Main Zero-Shot High-Level Context Generator", main_zero_shot_high_level_context_cmg_chain),
        ("Main Few-Shot High-Level Context Generator", main_few_shot_high_level_context_cmg_chain),
    )
]

In [4]:
# Each output path is read from the environment variable named by the matching
# EnvironmentKey member, falling back to the DEFAULT_* constant when it is unset.
CONTEXT_GENERATION_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.CONTEXT_GENERATION_OUTPUT_PATH.value,
    DEFAULT_CONTEXT_GENERATION_OUTPUT_PATH,
)

HIGH_LEVEL_CONTEXT_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.HIGH_LEVEL_CONTEXT_OUTPUT_PATH.value,
    DEFAULT_HIGH_LEVEL_CONTEXT_OUTPUT_PATH,
)

CMG_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.CMG_OUTPUT_PATH.value,
    DEFAULT_CMG_OUTPUT_PATH,
)

DIFF_CLASSIFICATION_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.DIFF_CLASSIFICATION_OUTPUT_PATH.value,
    DEFAULT_DIFF_CLASSIFICATION_OUTPUT_PATH,
)

EXAMPLE_GENERATION_OUTPUT_PATH = os.environ.get(
    EnvironmentKey.EXAMPLE_GENERATION_OUTPUT_PATH.value,
    DEFAULT_EXAMPLE_GENERATION_OUTPUT_PATH,
)

In [5]:
def get_commits(path: str) -> list[CommitDataModel]:
    """Read a UTF-8 JSON file and parse it with ``CommitDataModel.from_json``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``CommitDataModel.from_json`` produces for the file's text
        (annotated as a list of ``CommitDataModel``).
    """
    with open(path, encoding="utf-8") as commit_file:
        raw_json = commit_file.read()

    # Parsing happens after the file handle has been closed.
    return CommitDataModel.from_json(raw_json)

# Parse each commit dataset once at start-up so later cells can reuse the results.
COMMITS = get_commits(COMMIT_DATA_JSON_FILE_PATH)  # commits.json
EVALUATION_COMMITS = get_commits(EVALUATION_COMMIT_DATA_JSON_FILE_PATH)  # commits.evaluation.json
TEST_COMMITS = get_commits(TEST_COMMIT_DATA_JSON_FILE_PATH)  # commits.test.json
EXAMPLE_COMMITS = get_commits(EXAMPLE_DATA_JSON_FILE_PATH)  # commits.example.json

# Generate Context

In [6]:
# all_commits = COMMITS + EVALUATION_COMMITS + TEST_COMMITS + EXAMPLE_COMMITS
# repo_name_filters = ["camel", "kafka"]

# context_generator.generate_context(all_commits, CONTEXT_GENERATION_OUTPUT_PATH, None)

# Generate Examples

In [7]:
# example_generator.generate_examples(EXAMPLE_COMMITS, EXAMPLE_GENERATION_OUTPUT_PATH)

# Generate Commit Message

In [8]:
# evaluator.evaluate(GENERATORS, COMMITS, CONTEXT_DATA_PATH, CMG_OUTPUT_PATH)

# CMG Cleaning

In [9]:
# def calculate_commit_subject_length(commit_message: str):
#     return len(commit_message.split("\n")[0])

# data = None

# with open(RESULT_DATA_JSON_FILE_PATH, "r", encoding="utf-8") as file:
#     json_string = file.read()
#     data = json.loads(json_string)

# random_state = random.getstate()

# for commit in data:

#     commit["generation_results"] = [
#         result for result in commit["generation_results"] if result["generator_id"] != "Main Zero-Shot Low-Level Context Generator"
#     ]
    
#     for result in commit["generation_results"]:    
#         commit_message = result.get("cleaned_commit_message") or result["commit_message"]
#         result["commit_subject_length"] = calculate_commit_subject_length(commit_message)

#     seed_value = int(commit["evaluation_id"][2:]) + 42
#     random.seed(seed_value)
#     random.shuffle(commit["generation_results"])
    
# random.setstate(random_state)

# with open(DEFAULT_CLEANING_RESULT_OUTPUT_PATH, "w", encoding="utf-8") as file:
#     json.dump(data, file)

In [None]:
# Form Result Processing
#
# Pipeline: load the cleaned evaluation data and the raw test-case scores, run the
# scores through the configured cleaners via ResultSummarizer, then write the
# resulting summaries as indented JSON.

# Forwarded unchanged as the second argument of ResultSummarizer.summarize.
partial_cleaning = False

# Cleaned evaluation records (evaluation.cleaned.json); handed to RuleBasedCleaner.
with open(DEFAULT_CLEANING_RESULT_OUTPUT_PATH, "r", encoding="utf-8") as file:
    evaluation_data: list[Any] = json.load(file)

# Raw score file contents; parsed by TestCaseScore.from_json below.
with open(SCORE_DATA_JSON_FILE_PATH, "r", encoding="utf-8") as file:
    data_json_string = file.read()

test_case_scores: list[TestCaseScore] = TestCaseScore.from_json(data_json_string)

# Cleaners are passed to the summarizer in this order: rule-based, then outlier.
cleaners = [
    RuleBasedCleaner(evaluation_data),
    OutlierCleaner(),
]
summarizer = ResultSummarizer(cleaners)
score_summaries = summarizer.summarize(test_case_scores, partial_cleaning)

# unpicklable=False asks jsonpickle for plain JSON without its py/object metadata.
json_string = jsonpickle.encode(score_summaries, unpicklable=False, indent=4)

# Write with an explicit UTF-8 encoding, matching every read in this notebook,
# so the output does not depend on the platform's locale default.
with open(DEFAULT_SCORE_SUMMARY_OUTPUT_PATH, "w", encoding="utf-8") as file:
    file.write(json_string)

Cleaning scores based on rules...
Total removed individual responses: 159
Percentage of removed individual responses: 57.19%
Total remaining individual responses: 119
Cleaning completed.
Cleaning outliers...
Generator ID: Main Few-Shot Low-Level Context Generator
Evaluation ID: TC001
Samples: [[3, 3, 1, 3], [4, 4, 4, 3, 3], [4, 4, 3, 4, 2], [4, 4, 4, 4, 3]]
Outlier indices: [3]
Outlier indices: [3, 4]
Outlier indices: [2, 4]
Outlier indices: [4]
Generator ID: Main Zero-Shot High-Level Context Generator
Evaluation ID: TC001
Samples: [[4, 4, 4, 4, 4], [4, 4, 4, 4, 4], [4, 4, 4, 4, 4], [4, 4, 4, 4, 4]]
Outlier indices: []
Outlier indices: []
Outlier indices: []
Outlier indices: []
Generator ID: Main Few-Shot High-Level Context Generator
Evaluation ID: TC001
Samples: [[4, 4, 4, 1, 4], [4, 4, 4, 3, 3], [4, 4, 4, 4, 4], [4, 4, 2, 4, 4]]
Outlier indices: [3]
Outlier indices: [3, 4]
Outlier indices: []
Outlier indices: [2]
Generator ID: Main Few-Shot Low-Level Context Generator
Evaluation ID: 