In [1]:
# Clone the codeclarity repository; the cells below read the model summaries stored in it.
!git clone https://github.com/DrishtiShrrrma/codeclarity.git


Cloning into 'codeclarity'...
remote: Enumerating objects: 166, done.[K
remote: Counting objects: 100% (166/166), done.[K
remote: Compressing objects: 100% (159/159), done.[K
remote: Total 166 (delta 66), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (166/166), 3.39 MiB | 6.08 MiB/s, done.
Resolving deltas: 100% (66/66), done.


In [2]:
# Work from the gemma-2-9b-it summaries folder. Relative paths used later in the
# notebook (e.g. the "evaluations" output directory) resolve against this directory.
%cd /content/codeclarity/generated_code_summaries/gemma-2-9b-it

/content/codeclarity/generated_code_summaries/gemma-2-9b-it


In [3]:
# List the per-language summary JSON files present in this folder.
!ls -la

total 3080
drwxr-xr-x 2 root root   4096 Apr 28 23:42 .
drwxr-xr-x 6 root root   4096 Apr 28 23:42 ..
-rw-r--r-- 1 root root 442068 Apr 28 23:42 go_summary_all_languages_gemma-2-9b-it.json
-rw-r--r-- 1 root root 546182 Apr 28 23:42 javascript_summary_all_languages_gemma-2-9b-it.json
-rw-r--r-- 1 root root 541392 Apr 28 23:42 java_summary_all_languages_gemma-2-9b-it.json
-rw-r--r-- 1 root root 499648 Apr 28 23:42 php_summary_all_languages_gemma-2-9b-it.json
-rw-r--r-- 1 root root 565978 Apr 28 23:42 python_summary_all_languages_gemma-2-9b-it.json
-rw-r--r-- 1 root root 538047 Apr 28 23:42 ruby_summary_all_languages_gemma-2-9b-it.json


In [4]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whatever `pip` is first on the shell PATH.
# NOTE(review): consider pinning the cohere version for reproducible runs.
%pip install -qqq cohere

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import getpass
import json
import os

import cohere

In [6]:
# Input: directory holding the per-language summary JSON files from the cloned repo.
folder_path = "/content/codeclarity/generated_code_summaries/gemma-2-9b-it/"

# Output: a relative path, so the directory is created under the current working directory.
output_dir = "evaluations"
os.makedirs(name=output_dir, exist_ok=True)

In [7]:
# Never embed the API key in the notebook: read it from the CO_API_KEY environment
# variable, falling back to a hidden interactive prompt when the variable is unset.
# NOTE(review): the key that used to be hard-coded here was stored in plain text and
# should be revoked and replaced.
cohere_api_key = os.environ.get("CO_API_KEY") or getpass.getpass("Cohere API key: ")
co = cohere.ClientV2(cohere_api_key)

In [8]:
def query_cohere(prompt):
    """Send ``prompt`` to Cohere as a single user message and return the reply text.

    Uses the module-level ``co`` client with the ``command-a-03-2025`` model.
    Only the first content item of the reply is read; its text is returned
    with leading and trailing whitespace removed.
    """
    user_message = {"role": "user", "content": prompt}
    reply = co.chat(model="command-a-03-2025", messages=[user_message])
    first_item = reply.message.content[0]
    return first_item.text.strip()

In [9]:
def clean_response_text(text):
    """Remove a surrounding Markdown code fence from a model reply.

    The text is stripped of outer whitespace, then a leading "```json" or
    "```" marker and a trailing "```" marker are dropped (each followed by
    another whitespace strip). Text without fences is returned stripped only.
    """
    cleaned = text.strip()

    # Try the longer opener first so "```json" is not consumed as a bare "```".
    for opener in ("```json", "```"):
        if cleaned.startswith(opener):
            cleaned = cleaned[len(opener):].strip()
            break

    closer = "```"
    if cleaned.endswith(closer):
        cleaned = cleaned[:-len(closer)].strip()

    return cleaned


In [10]:
def build_comparison_prompt(code, english_summary, target_summary, target_language):
    """Build the evaluation prompt comparing an English and a non-English summary.

    Args:
        code: Source code snippet both summaries describe.
        english_summary: Reference summary written in English.
        target_summary: Summary written in ``target_language``.
        target_language: Name of the non-English language; inserted as given in
            the instructions and upper-cased for the summary's section heading.

    Returns:
        The full prompt text, ending with the JSON template the model is asked
        to fill in.
    """
    # Static reply template, kept outside the f-string so its braces need no escaping.
    response_schema = """{
  "score_english": {
    "accuracy": 0,
    "completeness": 0,
    "terminology": 0,
    "language_quality": 0,
    "overall_score": 0
  },
  "score_non_english": {
    "accuracy": 0,
    "completeness": 0,
    "terminology": 0,
    "language_quality": 0,
    "overall_score": 0
  },
  "comparison": "better" | "equal" | "worse",
  "justification": "your explanation here"
}
"""
    language_heading = target_language.upper()

    return f"""
You are a multilingual software expert evaluating two summaries of the same code snippet:
- One summary is written in English (reference).
- The other summary is written in {target_language}.

You are given:

CODE:
{code}

ENGLISH SUMMARY:
{english_summary}

{language_heading} SUMMARY:
{target_summary}

Evaluate each summary based on:
- Accuracy (captures the functionality)
- Completeness (includes all important elements)
- Terminology Fidelity (correct use of programming terms)
- Language Quality (grammar and natural phrasing)

Instructions:
- Score 1–5 for each criterion, for both summaries.
- Compute an overall score (1–5) for each summary.
- Indicate whether the {target_language} summary is better, equal, or worse than the English one.
- Provide a short justification (1–3 sentences).

Respond **only** in this strict JSON format:
{response_schema}"""


In [None]:
# Main processing loop
#
# For every summary JSON file in `folder_path`, ask the model to compare each
# non-English summary of an entry against the English one, and write the parsed
# evaluations to `output_dir` as "eval_<input filename>".
def _save_evaluations(path, evaluations):
    """Write ``evaluations`` to ``path`` as indented UTF-8 JSON (non-ASCII kept as-is)."""
    with open(path, 'w', encoding='utf-8') as f_out:
        json.dump(evaluations, f_out, ensure_ascii=False, indent=2)


# Sorted so files are processed in the same order on every run; os.listdir
# returns directory entries in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    print(f"\nProcessing file: {filename}")

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    output_filename = f"eval_{filename}"
    output_path = os.path.join(output_dir, output_filename)
    evaluations_all_entries = []

    try:
        for idx, entry in enumerate(data):
            code = entry.get("code", "")
            english_summary = entry.get("summary_english", "")

            if not code or not english_summary:
                print(f"Skipping Entry {idx}: Missing code or English summary")
                continue

            evaluations_per_entry = []

            for key, target_summary in entry.items():
                # Only the non-English "summary_<language>" fields are evaluated.
                if not key.startswith("summary_") or key == "summary_english":
                    continue

                target_language = key.replace("summary_", "")

                # An empty summary cannot be scored; skip it instead of spending an API call.
                if not target_summary:
                    print(f"Skipping Entry {idx} - Language: {target_language}: Missing summary")
                    continue

                print(f"Evaluating Entry {idx} - Language: {target_language}")

                prompt = build_comparison_prompt(
                    code=code,
                    english_summary=english_summary,
                    target_summary=target_summary,
                    target_language=target_language
                )

                # The API call stays outside the try: only a malformed reply is
                # tolerated here, request failures propagate to the handler below.
                response_text = query_cohere(prompt)

                try:
                    cleaned_text = clean_response_text(response_text)
                    parsed_response = json.loads(cleaned_text)
                except json.JSONDecodeError:
                    print(f"Failed to parse JSON for Entry {idx}, Language {target_language}")
                    print(response_text)
                    continue

                # Print parsed evaluation
                print(json.dumps(parsed_response, indent=2))

                evaluations_per_entry.append({
                    "target_language": target_language,
                    "evaluation": parsed_response
                })

            evaluations_all_entries.append({
                "entry_index": idx,
                "evaluations": evaluations_per_entry
            })
    except BaseException:
        # An API/network error or a manual interrupt would otherwise discard every
        # evaluation collected for this file. Persist the fully evaluated entries
        # first, then let the original exception continue.
        _save_evaluations(output_path, evaluations_all_entries)
        print(f"Run stopped early; saved partial evaluations in: {output_filename}")
        raise

    # Save once per file
    _save_evaluations(output_path, evaluations_all_entries)

    print(f"Saved all evaluations in: {output_filename}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
}
Evaluating Entry 47 - Language: hindi
{
  "score_english": {
    "accuracy": 5,
    "completeness": 5,
    "terminology": 5,
    "language_quality": 5,
    "overall_score": 5
  },
  "score_non_english": {
    "accuracy": 4,
    "completeness": 4,
    "terminology": 4,
    "language_quality": 5,
    "overall_score": 4
  },
  "comparison": "worse",
  "justification": "The English summary is more comprehensive, accurately detailing the dynamic method invocation, exception handling, and return value logic. While the Hindi summary is grammatically correct and natural, it omits key details like specific exceptions and their handling, making it less complete and accurate compared to the English version."
}
Evaluating Entry 47 - Language: urdu
{
  "score_english": {
    "accuracy": 5,
    "completeness": 5,
    "terminology": 5,
    "language_quality": 5,
    "overall_score": 5
  },
  "score_non_english": {
    "accuracy": 4,
 