In [None]:
!pip install openai langchain tiktoken pdfplumber streamlit google-generativeai


Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting streamlit
  Downloading streamlit-1.52.2-py3-none-any.whl.metadata (9.8 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pd

In [None]:
!pip install -U openai


Collecting openai
  Downloading openai-2.15.0-py3-none-any.whl.metadata (29 kB)
Downloading openai-2.15.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 2.12.0
    Uninstalling openai-2.12.0:
      Successfully uninstalled openai-2.12.0
Successfully installed openai-2.15.0


In [None]:
import os
from openai import OpenAI

# SECURITY: the API key must never be hard-coded in a notebook. Any key that
# was previously committed here is exposed and should be revoked.
# The key is taken from the OPENROUTER_API_KEY environment variable; if it is
# missing, the user is prompted once (input is not echoed) and the value is
# stored in the environment for later cells.
import getpass

if not os.environ.get("OPENROUTER_API_KEY"):
    os.environ["OPENROUTER_API_KEY"] = getpass.getpass("OpenRouter API key: ")

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1"
)


In [None]:
# Make the key available to later cells without embedding it in the notebook:
# keep an existing OPENROUTER_API_KEY, otherwise prompt once without echoing.
import getpass

if not os.environ.get("OPENROUTER_API_KEY"):
    os.environ["OPENROUTER_API_KEY"] = getpass.getpass("OpenRouter API key: ")

In [None]:
def load_file(path, encoding="utf-8"):
    """Read a text file and return its whole content as one string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        str: The decoded file content.
    """
    with open(path, "r", encoding=encoding) as f:
        return f.read()

# Read the three evaluation inputs (problem statement, generated Java code,
# generated test cases) from the Colab workspace.
problem_description, generated_code, test_cases = [
    load_file(input_path)
    for input_path in (
        "/content/problem_description.txt",
        "/content/sample_code.java",
        "/content/testcases.txt",
    )
]

SRS Document

In [None]:
# Rubric used to judge an SRS document. Each entry pairs a metric name with the
# guiding question that is pasted into the evaluation prompt.
GEVAL_METRICS_SRS = [
    dict(
        name="Requirements Completeness",
        description=(
            "Does the SRS comprehensively cover all necessary functional and non-functional "
            "requirements, including system features, constraints, assumptions, and dependencies?"
        ),
    ),
    dict(
        name="Clarity & Unambiguity",
        description=(
            "Are the requirements clearly stated, precise, and free from ambiguity, vague terms, "
            "or subjective language that could lead to multiple interpretations?"
        ),
    ),
    dict(
        name="Consistency",
        description=(
            "Are there any conflicting, duplicated, or contradictory requirements within the SRS, "
            "or between functional and non-functional sections?"
        ),
    ),
    dict(
        name="Verifiability & Testability",
        description=(
            "Can each requirement be objectively verified or tested through inspection, analysis, "
            "demonstration, or test cases?"
        ),
    ),
    dict(
        name="Structure & Standard Compliance",
        description=(
            "Is the SRS well-structured and logically organized, following standard SRS formats "
            "(e.g., IEEE 830 / IEEE 29148) with proper sections and hierarchy?"
        ),
    ),
]


In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, one trailing newline per page.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts concatenated in page order, each followed by "\n".
        A page for which no text is returned contributes only the newline.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against extract_text() giving None (e.g. a page without a
            # text layer); concatenating None with "\n" would raise TypeError.
            page_texts.append((page.extract_text() or "") + "\n")
    return "".join(page_texts)

# Use the same absolute Colab path as the SRS evaluation cell further down,
# so both reads are guaranteed to hit the same file regardless of the cwd.
srs_text = extract_text_from_pdf("/content/generated_srs.pdf")

In [None]:
def build_geval_prompt_srs_pdf(srs_pdf_text, metrics):
    """Assemble the evaluation prompt for an SRS document.

    Args:
        srs_pdf_text: SRS text to be judged (inserted verbatim into the prompt).
        metrics: Iterable of dicts with "name" and "description" keys; they are
            rendered as a 1-based numbered list.

    Returns:
        str: The complete prompt, asking for per-metric scores as JSON.
    """
    numbered_metrics = []
    for position, metric in enumerate(metrics, start=1):
        numbered_metrics.append(
            f"{position}. {metric['name']}: {metric['description']}"
        )
    metric_text = "\n".join(numbered_metrics)

    # Doubled braces below are literal braces in the rendered prompt.
    prompt = f"""
You are an expert software requirements analyst and SRS reviewer.

Evaluate the following Software Requirements Specification (SRS) document
generated by a Large Language Model. The SRS content has been extracted
from a PDF file.

SRS DOCUMENT (Extracted from PDF):
{srs_pdf_text}

EVALUATION METRICS:
{metric_text}

For each metric:
- Assign a score from 1 to 10 (higher is better).
- Provide a concise and technically sound justification.
- Evaluate strictly based on SRS best practices and standards
  (e.g., IEEE 830 / IEEE 29148).

Return STRICTLY in valid JSON format.
Do NOT include markdown, explanations, or additional text.

Expected JSON format:
{{
  "Metric Name": {{
      "score": <int>,
      "reason": "<short justification>"
  }},
  ...
  "Overall Score": <float>
}}
"""
    return prompt


In [None]:
def run_geval_srs(prompt, model="meta-llama/llama-3.1-8b-instruct", max_tokens=900):
    """
    Executes GEVAL evaluation for an SRS document (text extracted from PDF).

    Sends ``prompt`` as a single user message through the module-level
    ``client`` with temperature 0.

    Args:
        prompt (str): Fully constructed GEVAL prompt for SRS evaluation.
        model (str): Model identifier passed to the chat-completions call.
        max_tokens (int): Upper bound on the length of the generated reply.

    Returns:
        str: The reply text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0,
        max_tokens=max_tokens
    )

    # message.content may be None; normalise to "" so callers always get a str.
    content = response.choices[0].message.content
    return (content or "").strip()


In [None]:
# Read the SRS PDF and wrap its text, together with the SRS rubric, in the
# evaluation prompt.
srs_pdf_text = extract_text_from_pdf("/content/generated_srs.pdf")
prompt_srs = build_geval_prompt_srs_pdf(srs_pdf_text, GEVAL_METRICS_SRS)

# Query the judge model and show its raw (still unparsed) reply.
raw_output_srs = run_geval_srs(prompt_srs)
print(raw_output_srs)


Here is the evaluation of the SRS document in JSON format:

```json
{
  "Requirements Completeness": {
    "score": 6,
    "reason": "The SRS covers the main functional and non-functional requirements, but lacks details on book management, user authentication, and error handling."
  },
  "Clarity & Unambiguity": {
    "score": 8,
    "reason": "The requirements are generally clear and concise, but some terms like 'unique Book ID' could be clarified."
  },
  "Consistency": {
    "score": 9,
    "reason": "The SRS is well-organized and consistent in its structure, but there is a minor inconsistency in the use of 'shall' and 'should' in the functional and non-functional requirements."
  },
  "Verifiability & Testability": {
    "score": 7,
    "reason": "Most requirements can be objectively verified or tested, but some, like NFR1, are too vague to be tested directly."
  },
  "Structure & Standard Compliance": {
    "score": 5,
    "reason": "The SRS does not follow the standard IEEE 830/I

Code Generation

In [None]:
# Rubric used to judge generated code. Each entry pairs a metric name with the
# guiding question that is pasted into the evaluation prompt.
GEVAL_METRICS_CG = [
    dict(
        name="Functional Correctness",
        description="Does the code correctly solve the problem for all valid inputs and edge cases?",
    ),
    dict(
        name="Compilation / Execution Validity",
        description="Is the code free from syntax and runtime errors?",
    ),
    dict(
        name="Algorithmic Efficiency",
        description="Is the time and space complexity appropriate for the problem constraints?",
    ),
    dict(
        name="Readability & Maintainability",
        description="Is the code clean, readable, and well-structured?",
    ),
    dict(
        name="Edge Case Handling",
        description="Does the code correctly handle boundary and corner cases?",
    ),
]


In [None]:
def build_geval_prompt_cg(problem, code, metrics):
    """Assemble the evaluation prompt for a piece of generated code.

    Args:
        problem: Problem statement the code is supposed to solve.
        code: Source code to be judged (inserted verbatim).
        metrics: Iterable of dicts with "name" and "description" keys; they are
            rendered as a 1-based numbered list.

    Returns:
        str: The complete prompt, asking for per-metric scores as JSON.
    """
    numbered_metrics = (
        f"{position}. {metric['name']}: {metric['description']}"
        for position, metric in enumerate(metrics, start=1)
    )
    metric_text = "\n".join(numbered_metrics)

    # Doubled braces below are literal braces in the rendered prompt.
    prompt = f"""
You are an expert software engineer and code reviewer.

Evaluate the following generated code using the metrics below.

PROBLEM DESCRIPTION:
{problem}

GENERATED CODE:
{code}

EVALUATION METRICS:
{metric_text}

For each metric:
- Give a score (1-10) and short reason for each metric.
- Return ONLY valid JSON in this format:

Return STRICTLY in valid JSON format:
{{
  "Metric Name": {{
      "score": <int>,
      "reason": "<text>"
  }},
  ...
  "Overall Score": <float>
}}
"""
    return prompt


In [None]:
def run_geval_cg(prompt, model="meta-llama/llama-3.1-8b-instruct", max_tokens=700):
    """Run the code-generation evaluation prompt through the judge model.

    Sends ``prompt`` as a single user message through the module-level
    ``client`` with temperature 0.

    Args:
        prompt (str): Fully constructed prompt for code evaluation.
        model (str): Model identifier passed to the chat-completions call.
        max_tokens (int): Upper bound on the length of the generated reply.

    Returns:
        str: The reply text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=max_tokens
    )
    # message.content may be None; normalise to "" so callers always get a str.
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
# Build the code-evaluation prompt from the loaded problem and code, query the
# judge model, and show its raw (still unparsed) reply.
prompt_cg = build_geval_prompt_cg(
    problem=problem_description,
    code=generated_code,
    metrics=GEVAL_METRICS_CG,
)
raw_output_cg = run_geval_cg(prompt_cg)
print(raw_output_cg)


Here is the evaluation of the generated code:

```json
{
  "Functional Correctness": {
    "score": 8,
    "reason": "The code correctly implements the required functionality, but it does not handle the case where the number of copies is set to a negative value when adding a book."
  },
  "Compilation / Execution Validity": {
    "score": 10,
    "reason": "The code is free from syntax and runtime errors."
  },
  "Algorithmic Efficiency": {
    "score": 6,
    "reason": "The time complexity of the code is O(n) for the listBooks method, which is acceptable for a small library. However, for a large library, this could be improved by using a data structure that allows for faster lookup, such as a HashSet or a database."
  },
  "Readability & Maintainability": {
    "score": 9,
    "reason": "The code is well-structured and easy to read, but it could benefit from more comments and documentation to explain the purpose of each method and the logic behind the code."
  },
  "Edge Case Handling

Test Cases

In [None]:
# Rubric used to judge generated test cases. Each entry pairs a metric name
# with the guiding question that is pasted into the evaluation prompt.
GEVAL_METRICS_T = [
    dict(
        name="Test Case Coverage",
        description=(
            "Do the generated test cases adequately cover normal, boundary, and extreme input scenarios "
            "defined by the problem constraints?"
        ),
    ),
    dict(
        name="Edge Case Effectiveness",
        description=(
            "Do the test cases include meaningful edge and corner cases that are likely to expose defects "
            "in incorrect or incomplete implementations?"
        ),
    ),
    dict(
        name="Input Validity & Constraints Compliance",
        description=(
            "Are all test inputs valid, well-formed, and compliant with the stated input constraints "
            "(data types, ranges, formats, and sizes)?"
        ),
    ),
    dict(
        name="Expected Output Correctness",
        description=(
            "Are the expected outputs accurate and logically consistent with the problem specification "
            "for each generated test case?"
        ),
    ),
    dict(
        name="Redundancy & Diversity",
        description=(
            "Do the test cases avoid unnecessary duplication while maintaining sufficient diversity "
            "to test different execution paths?"
        ),
    ),
]


In [None]:
def build_geval_prompt_tc(problem, code, testcases, metrics):
    """Assemble the evaluation prompt for a set of generated test cases.

    Args:
        problem: Problem statement the code is supposed to solve.
        code: Reference code the test cases are meant to exercise.
        testcases: The generated test cases (inserted verbatim).
        metrics: Iterable of dicts with "name" and "description" keys; they are
            rendered as a 1-based numbered list.

    Returns:
        str: The complete prompt, asking for per-metric scores as JSON.
    """
    numbered_metrics = []
    for position, metric in enumerate(metrics, start=1):
        numbered_metrics.append(
            f"{position}. {metric['name']}: {metric['description']}"
        )
    metric_text = "\n".join(numbered_metrics)

    # Doubled braces below are literal braces in the rendered prompt.
    prompt = f"""
You are an expert software testing engineer and quality analyst.

Your task is to evaluate the quality of test cases generated by a Large Language Model (LLM).
The goal is to assess whether these test cases are effective at validating the correctness
and robustness of the given code with respect to the problem description.

PROBLEM DESCRIPTION:
{problem}

REFERENCE CODE UNDER TEST:
{code}

GENERATED TEST CASES:
{testcases}

EVALUATION METRICS:
{metric_text}

For each metric:
- Assign a score between 1 and 10
- Provide a concise, technical justification focused on test effectiveness

Return STRICTLY valid JSON in the following format:
{{
  "Metric Name": {{
      "score": <int>,
      "reason": "<concise explanation>"
  }},
  ...
  "Overall Score": <float>
}}

Important:
- Do NOT evaluate code style or performance
- Focus ONLY on test coverage, correctness, diversity, and defect-detection capability
- Penalize redundant, invalid, or weak test cases
"""
    return prompt


In [None]:
def run_testcase_geval_tc(prompt, model="meta-llama/llama-3.1-8b-instruct", max_tokens=700):
    """Run the test-case evaluation prompt through the judge model.

    Sends ``prompt`` as a single user message through the module-level
    ``client`` with temperature 0.

    Args:
        prompt (str): Fully constructed prompt for test-case evaluation.
        model (str): Model identifier passed to the chat-completions call.
        max_tokens (int): Upper bound on the length of the generated reply.

    Returns:
        str: The reply text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0,
        max_tokens=max_tokens
    )

    # message.content may be None; calling .strip() on it directly would raise
    # AttributeError, so normalise to "" first.
    content = response.choices[0].message.content
    return (content or "").strip()


In [None]:
# Build the test-case evaluation prompt from the loaded inputs, query the
# judge model, and show its raw (still unparsed) reply.
prompt_tc = build_geval_prompt_tc(
    problem_description,
    generated_code,
    test_cases,
    GEVAL_METRICS_T,
)
raw_output_tc = run_testcase_geval_tc(prompt_tc)
print(raw_output_tc)


Here's the evaluation of the generated test cases based on the provided metrics:

```json
{
  "Test Case Coverage": {
    "score": 8,
    "reason": "The test cases cover most normal and boundary input scenarios, but lack extreme input scenarios, such as adding a book with a negative number of copies or issuing a book with a non-existent ID."
  },
  "Edge Case Effectiveness": {
    "score": 6,
    "reason": "The test cases include some meaningful edge cases, such as adding a duplicate book and issuing a book when no copies are available, but miss others, like returning a book with no copies available."
  },
  "Input Validity & Constraints Compliance": {
    "score": 9,
    "reason": "The test inputs are generally valid and well-formed, but the input for Test Case 4 (issueBook(999)) is not compliant with the problem constraints, as the book ID is not present in the system."
  },
  "Expected Output Correctness": {
    "score": 9,
    "reason": "The expected outputs are mostly accurate and

In [None]:
import json
import re

def extract_json_from_string(text):
    """Recover a JSON value from a model reply.

    Strategies are tried in this order; the first one that parses wins:
      1. The whole string is JSON.
      2. A JSON object inside a ```json fenced block (tag matched
         case-insensitively); every such block is tried in turn.
      3. A JSON object inside an untagged ``` fenced block.
      4. The first JSON object embedded anywhere in the text, e.g. when the
         model wraps it in prose without a fence.

    Args:
        text: Raw reply text. Non-string input (such as None) yields None.

    Returns:
        The parsed JSON value, or None when nothing could be parsed.
    """
    if not isinstance(text, str):
        return None

    # 1. Try direct JSON parsing (VERY IMPORTANT)
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # 2./3. Fenced blocks: ```json first, then generic ```.
    fence_patterns = (
        r'```json\s*(\{[\s\S]*?\})\s*```',
        r'```\s*(\{[\s\S]*?\})\s*```',
    )
    for pattern in fence_patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                continue

    # 4. Scan for the first "{" from which a complete JSON value decodes;
    #    raw_decode ignores whatever text follows the value.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return parsed

    return None



# ----------------------------
# Parse the raw judge replies
# ----------------------------

# The SRS entry falls back to the raw reply text when no JSON can be
# recovered; the other two entries stay None in that case.
srs_evaluation = extract_json_from_string(raw_output_srs) or raw_output_srs.strip()
cg_json = extract_json_from_string(raw_output_cg)
tc_json = extract_json_from_string(raw_output_tc)

# Make parse failures visible instead of silently writing null to the file.
for section_label, parsed_section in (
    ("code generation", cg_json),
    ("test cases", tc_json),
):
    if parsed_section is None:
        print(f"Warning: no JSON could be extracted from the {section_label} evaluation output.")


# ----------------------------
# Combined output (sequence intact)
# ----------------------------

combined_evaluations = {
    "srs_evaluation": srs_evaluation,
    "code_generation_evaluation": cg_json,
    "test_cases_evaluation": tc_json
}

output_filename = "combined_evaluations.json"

with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(combined_evaluations, f, indent=2)

print(f"Combined evaluations saved to {output_filename}")

# Display the content of the combined file
with open(output_filename, "r", encoding="utf-8") as f:
    print("\nContent of combined_evaluations.json:\n")
    print(f.read())


Combined evaluations saved to combined_evaluations.json

Content of combined_evaluations.json:

{
  "srs_evaluation": {
    "Requirements Completeness": {
      "score": 6,
      "reason": "The SRS covers the main functional and non-functional requirements, but lacks details on book management, user authentication, and error handling."
    },
    "Clarity & Unambiguity": {
      "score": 8,
      "reason": "The requirements are generally clear and concise, but some terms like 'unique Book ID' could be clarified."
    },
    "Consistency": {
      "score": 9,
      "reason": "The SRS is well-organized and consistent in its structure, but there is a minor inconsistency in the use of 'shall' and 'should' in the functional and non-functional requirements."
    },
    "Verifiability & Testability": {
      "score": 7,
      "reason": "Most requirements can be objectively verified or tested, but some, like NFR1, are too vague to be tested directly."
    },
    "Structure & Standard Complianc

In [None]:
import json

def format_evaluation_to_text(eval_data, section_title):
    """Render one evaluation section as plain text.

    Args:
        eval_data: Usually a dict mapping metric names to
            {"score": ..., "reason": ...} plus an optional "Overall Score"
            entry. A non-dict value (for example the raw reply kept when JSON
            parsing failed) is rendered as-is. A falsy value produces a
            "no data" notice.
        section_title: Heading prefix; the heading is "<title> Evaluation".

    Returns:
        str: The formatted section, ending with a newline.
    """
    parts = [f"{section_title} Evaluation\n\n"]

    if not eval_data:
        parts.append("No evaluation data available for this section.\n")
    elif not isinstance(eval_data, dict):
        # Raw-text fallback: there are no metrics to iterate over.
        parts.append(f"{str(eval_data).strip()}\n")
    else:
        for metric_name, details in eval_data.items():
            if metric_name == "Overall Score":
                continue
            if isinstance(details, dict):
                score = details.get("score", "N/A")
                reason = details.get("reason", "No reason provided")
            else:
                # The model sometimes returns a bare value instead of an
                # object; treat it as the score rather than crashing on .get().
                score = details
                reason = "No reason provided"
            parts.append(f"{metric_name}\n")
            parts.append(f"  Score: {score}/10\n")
            parts.append(f"  Reason: {reason}\n\n")
        overall_score = eval_data.get("Overall Score", "N/A")
        parts.append(f"Overall Score: {overall_score}\n")

    return "".join(parts)

# NOTE: combined_evaluations is not reloaded here; it must already exist in
# the kernel from the cell that builds and saves combined_evaluations.json.

# The three sections are joined by a blank line, a 50-dash rule and two newlines.
section_separator = "\n" + "-"*50 + "\n\n"
formatted_text_output = section_separator.join([
    format_evaluation_to_text(combined_evaluations.get("srs_evaluation"), "SRS"),
    format_evaluation_to_text(combined_evaluations.get("code_generation_evaluation"), "Code Generation"),
    format_evaluation_to_text(combined_evaluations.get("test_cases_evaluation"), "Test Cases"),
])

output_text_filename = "formatted_evaluations.txt"
with open(output_text_filename, "w") as f:
    f.write(formatted_text_output)

print(f"Formatted text output saved to {output_text_filename}")

# Echo what was just written to the text file.
print("\nContent of formatted_evaluations.txt:\n")
print(formatted_text_output)


Formatted text output saved to formatted_evaluations.txt

Content of formatted_evaluations.txt:

SRS Evaluation

Requirements Completeness
  Score: 6/10
  Reason: The SRS covers the main functional and non-functional requirements, but lacks details on book management, user authentication, and error handling.

Clarity & Unambiguity
  Score: 8/10
  Reason: The requirements are generally clear and concise, but some terms like 'unique Book ID' could be clarified.

Consistency
  Score: 9/10
  Reason: The SRS is well-organized and consistent in its structure, but there is a minor inconsistency in the use of 'shall' and 'should' in the functional and non-functional requirements.

Verifiability & Testability
  Score: 7/10
  Reason: Most requirements can be objectively verified or tested, but some, like NFR1, are too vague to be tested directly.

Structure & Standard Compliance
  Score: 5/10
  Reason: The SRS does not follow the standard IEEE 830/IEEE 29148 format, and some sections (e.g., Assu

In [None]:
!pip install -q streamlit cloudflared pdfplumber


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for cloudflared (setup.py) ... [?25l[?25hdone


In [None]:
import os

# SECURITY: the key is taken from the environment (or prompted for without
# echoing) instead of being embedded in the notebook. Any key that was
# previously committed here is exposed and should be revoked.
import getpass

os.makedirs(".streamlit", exist_ok=True)

_openrouter_api_key = (
    os.environ.get("OPENROUTER_API_KEY")
    or getpass.getpass("OpenRouter API key: ")
)

# Streamlit reads this file to populate st.secrets for app.py.
with open(".streamlit/secrets.toml", "w") as f:
    f.write(f'OPENROUTER_API_KEY = "{_openrouter_api_key}"\n')


In [None]:
# Confirm the secrets file exists WITHOUT printing it: dumping the file would
# store the API key in the notebook's saved output.
print("secrets.toml present:", os.path.isfile(".streamlit/secrets.toml"))


OPENROUTER_API_KEY = "<redacted>"


In [None]:
%%writefile app.py
from openai import OpenAI
import os
import streamlit as st

# =====================================================
# ---------------- LLM Client Setup -------------------
# =====================================================

# Key lookup order:
#   1. Streamlit secrets (.streamlit/secrets.toml)
#   2. The OPENROUTER_API_KEY environment variable
try:
    OPENROUTER_API_KEY = st.secrets["OPENROUTER_API_KEY"]
except (KeyError, FileNotFoundError):
    # st.secrets raises when the key or the secrets file is missing, so the
    # emptiness check below would otherwise never be reached.
    # NOTE(review): recent Streamlit versions raise a FileNotFoundError
    # subclass for a missing secrets file - confirm for the pinned version.
    OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. "
        "Please set it using os.environ or Streamlit secrets."
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1"
)

import streamlit as st
import tempfile
import json
import re
import pdfplumber

# =====================================================
# ---------------- Utility Functions ------------------
# =====================================================

def extract_text_from_pdf(uploaded_file):
    """Extract the text of an uploaded PDF.

    The upload is spooled to a temporary ``.pdf`` file, opened with
    pdfplumber, and the temporary file is removed again afterwards.

    Args:
        uploaded_file: Object with a ``read()`` method returning the PDF bytes
            (here: the value returned by ``st.file_uploader``).

    Returns:
        str: Text of all pages that yielded any text, one page per line block,
        with leading/trailing whitespace stripped.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.read())
        pdf_path = tmp.name

    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text + "\n")
    finally:
        # delete=False keeps the file alive after the with-block above, so it
        # has to be removed explicitly or every upload leaks a temp file.
        os.remove(pdf_path)
    return "".join(page_texts).strip()


def extract_text_file(uploaded_file):
    """Return the uploaded file's content decoded as UTF-8, whitespace-trimmed."""
    raw_bytes = uploaded_file.read()
    decoded = raw_bytes.decode("utf-8")
    return decoded.strip()


def extract_json_from_string(text):
    """
    Robust JSON extractor. Strategies, in order (first success wins):
    1. Raw JSON
    2. ```json { } ```  (tag matched case-insensitively, every block tried)
    3. ``` { } ```
    4. First JSON object embedded anywhere in the text (no fence)

    Non-string input (such as None) returns None.
    Returns the parsed JSON value, or None if nothing could be parsed.
    """
    if not isinstance(text, str):
        return None

    # Case 1: raw JSON
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Case 2 and 3: fenced ```json first, then generic ```
    fence_patterns = (
        r'```json\s*(\{[\s\S]*?\})\s*```',
        r'```\s*(\{[\s\S]*?\})\s*```',
    )
    for pattern in fence_patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                continue

    # Case 4: scan for the first "{" from which a complete JSON value decodes;
    # raw_decode ignores whatever text follows the value.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return parsed

    return None


def extract_srs_evaluation(text):
    """
    Parse an SRS evaluation reply: return the extracted JSON when there is
    any (and it is truthy), otherwise the reply itself with whitespace trimmed.
    """
    parsed = extract_json_from_string(text)
    return parsed if parsed else text.strip()


def dict_to_readable_text(title, data):
    """
    Render an evaluation result as plain text under an underlined title.

    A dict is listed key by key (nested dicts get indented "sub_key: value"
    lines); any other value is printed via str() right below the title.
    """
    header = f"{title}\n" + "=" * len(title)

    if not isinstance(data, dict):
        return "\n".join([header, str(data)])

    rendered = [header]
    for key, value in data.items():
        if not isinstance(value, dict):
            rendered.append(f"\n{key}: {value}")
            continue
        rendered.append(f"\n{key}:")
        rendered.extend(
            f"  {sub_key}: {sub_val}" for sub_key, sub_val in value.items()
        )

    return "\n".join(rendered)


# =====================================================
# ----------------   SRS Evaluation  ------------------
# =====================================================

# Rubric used to judge an SRS document. Each entry pairs a metric name with the
# guiding question that is pasted into the evaluation prompt.
SRS_METRICS = [
    dict(
        name="Requirements Completeness",
        description=(
            "Does the SRS comprehensively cover all necessary functional and non-functional "
            "requirements, including system features, constraints, assumptions, and dependencies?"
        ),
    ),
    dict(
        name="Clarity & Unambiguity",
        description=(
            "Are the requirements clearly stated, precise, and free from ambiguity, vague terms, "
            "or subjective language that could lead to multiple interpretations?"
        ),
    ),
    dict(
        name="Consistency",
        description=(
            "Are there any conflicting, duplicated, or contradictory requirements within the SRS, "
            "or between functional and non-functional sections?"
        ),
    ),
    dict(
        name="Verifiability & Testability",
        description=(
            "Can each requirement be objectively verified or tested through inspection, analysis, "
            "demonstration, or test cases?"
        ),
    ),
    dict(
        name="Structure & Standard Compliance",
        description=(
            "Is the SRS well-structured and logically organized, following standard SRS formats "
            "(e.g., IEEE 830 / IEEE 29148) with proper sections and hierarchy?"
        ),
    ),
]

def build_geval_prompt_srs_pdf(srs_pdf_text, metrics):
    """Assemble the evaluation prompt for an SRS document.

    Args:
        srs_pdf_text: SRS text to be judged (inserted verbatim into the prompt).
        metrics: Iterable of dicts with "name" and "description" keys; they are
            rendered as a 1-based numbered list.

    Returns:
        str: The complete prompt, asking for per-metric scores as JSON.
    """
    numbered_metrics = (
        f"{position}. {metric['name']}: {metric['description']}"
        for position, metric in enumerate(metrics, start=1)
    )
    metric_text = "\n".join(numbered_metrics)

    # Doubled braces below are literal braces in the rendered prompt.
    prompt = f"""
You are an expert software requirements analyst and SRS reviewer.

Evaluate the following Software Requirements Specification (SRS) document
generated by a Large Language Model. The SRS content has been extracted
from a PDF file.

SRS DOCUMENT (Extracted from PDF):
{srs_pdf_text}

EVALUATION METRICS:
{metric_text}

For each metric:
- Assign a score from 1 to 10 (higher is better).
- Provide a concise and technically sound justification.
- Evaluate strictly based on SRS best practices and standards
  (e.g., IEEE 830 / IEEE 29148).

Return STRICTLY in valid JSON format.
Do NOT include markdown, explanations, or additional text.

Expected JSON format:
{{
  "Metric Name": {{
      "score": <int>,
      "reason": "<short justification>"
  }},
  ...
  "Overall Score": <float>
}}
"""
    return prompt

def run_geval_srs(prompt, model="meta-llama/llama-3.1-8b-instruct", max_tokens=900):
    """
    Executes GEVAL evaluation for an SRS document (text extracted from PDF).

    Sends ``prompt`` as a single user message through the module-level
    ``client`` with temperature 0.

    Args:
        prompt (str): Fully constructed GEVAL prompt for SRS evaluation.
        model (str): Model identifier passed to the chat-completions call.
        max_tokens (int): Upper bound on the length of the generated reply.

    Returns:
        str: The reply text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0,
        max_tokens=max_tokens
    )

    # message.content may be None; normalise to "" so callers always get a str.
    content = response.choices[0].message.content
    return (content or "").strip()

# =====================================================
# ---------------- Code Generation --------------------
# =====================================================

# Rubric used to judge generated code. Each entry pairs a metric name with the
# guiding question that is pasted into the evaluation prompt.
CG_METRICS = [
    dict(
        name="Functional Correctness",
        description="Does the code correctly solve the problem for all valid inputs and edge cases?",
    ),
    dict(
        name="Compilation / Execution Validity",
        description="Is the code free from syntax and runtime errors?",
    ),
    dict(
        name="Algorithmic Efficiency",
        description="Is the time and space complexity appropriate for the problem constraints?",
    ),
    dict(
        name="Readability & Maintainability",
        description="Is the code clean, readable, and well-structured?",
    ),
    dict(
        name="Edge Case Handling",
        description="Does the code correctly handle boundary and corner cases?",
    ),
]

def build_geval_prompt_cg(problem, code, metrics):
    """Assemble the evaluation prompt for a piece of generated code.

    Args:
        problem: Problem statement the code is supposed to solve.
        code: Source code to be judged (inserted verbatim).
        metrics: Iterable of dicts with "name" and "description" keys; they are
            rendered as a 1-based numbered list.

    Returns:
        str: The complete prompt, asking for per-metric scores as JSON.
    """
    numbered_metrics = []
    for position, metric in enumerate(metrics, start=1):
        numbered_metrics.append(
            f"{position}. {metric['name']}: {metric['description']}"
        )
    metric_text = "\n".join(numbered_metrics)

    # Doubled braces below are literal braces in the rendered prompt.
    prompt = f"""
You are an expert software engineer and code reviewer.

Evaluate the following generated code using the metrics below.

PROBLEM DESCRIPTION:
{problem}

GENERATED CODE:
{code}

EVALUATION METRICS:
{metric_text}

For each metric:
- Give a score (1-10) and short reason for each metric.
- Return ONLY valid JSON in this format:

Return STRICTLY in valid JSON format:
{{
  "Metric Name": {{
      "score": <int>,
      "reason": "<text>"
  }},
  ...
  "Overall Score": <float>
}}
"""
    return prompt

def run_geval_cg(prompt, model="meta-llama/llama-3.1-8b-instruct", max_tokens=700):
    """Run the code-generation evaluation prompt through the judge model.

    Sends ``prompt`` as a single user message through the module-level
    ``client`` with temperature 0.

    Args:
        prompt (str): Fully constructed prompt for code evaluation.
        model (str): Model identifier passed to the chat-completions call.
        max_tokens (int): Upper bound on the length of the generated reply.

    Returns:
        str: The reply text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=max_tokens
    )
    # message.content may be None; normalise to "" so callers always get a str.
    content = response.choices[0].message.content
    return (content or "").strip()


# =====================================================
# ---------------- Testcase Evaluation ----------------
# =====================================================

# Rubric used to judge generated test cases. Each entry pairs a metric name
# with the guiding question that is pasted into the evaluation prompt.
TESTCASE_METRICS = [
    dict(
        name="Test Case Coverage",
        description=(
            "Do the generated test cases adequately cover normal, boundary, and extreme input scenarios "
            "defined by the problem constraints?"
        ),
    ),
    dict(
        name="Edge Case Effectiveness",
        description=(
            "Do the test cases include meaningful edge and corner cases that are likely to expose defects "
            "in incorrect or incomplete implementations?"
        ),
    ),
    dict(
        name="Input Validity & Constraints Compliance",
        description=(
            "Are all test inputs valid, well-formed, and compliant with the stated input constraints "
            "(data types, ranges, formats, and sizes)?"
        ),
    ),
    dict(
        name="Expected Output Correctness",
        description=(
            "Are the expected outputs accurate and logically consistent with the problem specification "
            "for each generated test case?"
        ),
    ),
    dict(
        name="Redundancy & Diversity",
        description=(
            "Do the test cases avoid unnecessary duplication while maintaining sufficient diversity "
            "to test different execution paths?"
        ),
    ),
]


def build_geval_prompt_testcases(problem, code, testcases, metrics):
    """Assemble the evaluation prompt for a set of generated test cases.

    Args:
        problem: Problem statement the code is supposed to solve.
        code: Reference code the test cases are meant to exercise.
        testcases: The generated test cases (inserted verbatim).
        metrics: Iterable of dicts with "name" and "description" keys; they are
            rendered as a 1-based numbered list.

    Returns:
        str: The complete prompt, asking for per-metric scores as JSON.
    """
    numbered_metrics = (
        f"{position}. {metric['name']}: {metric['description']}"
        for position, metric in enumerate(metrics, start=1)
    )
    metric_text = "\n".join(numbered_metrics)

    # Doubled braces below are literal braces in the rendered prompt.
    prompt = f"""
You are an expert software testing engineer and quality analyst.

Your task is to evaluate the quality of test cases generated by a Large Language Model (LLM).
The goal is to assess whether these test cases are effective at validating the correctness
and robustness of the given code with respect to the problem description.

PROBLEM DESCRIPTION:
{problem}

REFERENCE CODE UNDER TEST:
{code}

GENERATED TEST CASES:
{testcases}

EVALUATION METRICS:
{metric_text}

For each metric:
- Assign a score between 1 and 10
- Provide a concise, technical justification focused on test effectiveness

Return STRICTLY valid JSON in the following format:
{{
  "Metric Name": {{
      "score": <int>,
      "reason": "<concise explanation>"
  }},
  ...
  "Overall Score": <float>
}}

Important:
- Do NOT evaluate code style or performance
- Focus ONLY on test coverage, correctness, diversity, and defect-detection capability
- Penalize redundant, invalid, or weak test cases
"""
    return prompt


def run_testcase_geval(prompt):
    """Send a test-case evaluation prompt to the chat model and return its reply.

    The prompt is sent as a single user message through the module-level
    ``client`` with a fixed model, ``temperature=0`` and a 700-token cap.

    Args:
        prompt: Full prompt text for the model.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or an empty string when the response carries no choices or
        the message has no text content.
    """
    response = client.chat.completions.create(
        model="meta-llama/llama-3.1-8b-instruct",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=700,
    )

    # The response may come back without choices, and message.content may be
    # None; return "" instead of raising so the caller can report the failure.
    choices = response.choices
    if not choices:
        return ""

    content = choices[0].message.content
    return (content or "").strip()



# =====================================================
# ---------------- Streamlit UI -----------------------
# =====================================================

# Page configuration: browser-tab title and wide layout.
st.set_page_config(page_title="GEVAL – Evaluation System", layout="wide")

# Page heading and sub-caption.
st.title("GEVAL – SRS Evaluation System")
st.caption("Streamlit app running on Google Colab")

# Debug marker showing the script rendered this far (safe to keep).
st.write("UI loaded successfully.")

# -----------------------------------------------------
# Input uploads. Each handle is checked for truthiness by
# the evaluation pipeline before it is used.
# -----------------------------------------------------
st.header("Upload Inputs")

srs_pdf = st.file_uploader("Upload SRS Document (PDF)", type=["pdf"])
problem_file = st.file_uploader("Upload Problem Description (.txt)", type=["txt"])
code_file = st.file_uploader(
    "Upload Generated Code (.java / .py / .cpp / .txt)",
    type=["java", "py", "cpp", "txt"],
)
testcase_file = st.file_uploader("Upload Test Cases (.txt / .json)", type=["txt", "json"])

# True only on the script run triggered by clicking the button.
run_eval = st.button("Run Evaluation")

# =====================================================
# ---------------- Evaluation Pipeline ----------------
# =====================================================

if run_eval:
    # Raw evaluation results keyed by section name.
    combined_evaluations = {}
    # Readable report sections, in display order, for the combined TXT export.
    report_sections = []

    # The test-case evaluation reuses the problem and code text read by the
    # code-generation step, so it can only run when both files were uploaded.
    has_code_inputs = bool(problem_file and code_file)

    # ---------------- SRS Evaluation ----------------
    if srs_pdf:
        with st.spinner("Evaluating SRS Document..."):
            srs_text = extract_text_from_pdf(srs_pdf)
            prompt_srs = build_geval_prompt_srs_pdf(srs_text, SRS_METRICS)
            raw_output_srs = run_geval_srs(prompt_srs)
            srs_eval = extract_srs_evaluation(raw_output_srs)

        st.subheader("SRS Evaluation Report")
        if isinstance(srs_eval, dict):
            srs_text_output = dict_to_readable_text("SRS Evaluation", srs_eval)
            st.text_area("SRS Evaluation Report", srs_text_output, height=400)
            report_sections.append(srs_text_output)
        else:
            # Non-dict result: show it as-is and export the raw text rather
            # than passing it to dict_to_readable_text.
            st.text_area("SRS Evaluation", srs_eval, height=350)
            if srs_eval:
                report_sections.append(f"SRS Evaluation\n\n{srs_eval}")

        combined_evaluations["srs_evaluation"] = srs_eval
    else:
        st.warning("SRS PDF not uploaded.")

    # ---------------- Code Generation Evaluation ----------------
    if has_code_inputs:
        with st.spinner("Evaluating Generated Code..."):
            problem_text = extract_text_file(problem_file)
            code_text = extract_text_file(code_file)

            prompt_code = build_geval_prompt_cg(problem_text, code_text, CG_METRICS)
            raw_output_cg = run_geval_cg(prompt_code)
            cg_eval = extract_json_from_string(raw_output_cg)

        st.subheader("Code Generation Evaluation Report")
        if cg_eval:
            cg_text_output = dict_to_readable_text("Code Generation Evaluation", cg_eval)
            st.text_area("Code Generation Evaluation Report", cg_text_output, height=400)
            report_sections.append(cg_text_output)
        else:
            st.warning("Invalid JSON returned for Code Evaluation.")
            st.text(raw_output_cg)

        combined_evaluations["code_generation_evaluation"] = cg_eval
    else:
        st.warning("Problem description or code file missing.")

    # ---------------- Test Case Evaluation ----------------
    if not testcase_file:
        st.warning("Test case file not uploaded.")
    elif not has_code_inputs:
        # Without this guard the prompt builder would reference problem_text
        # and code_text before assignment and raise NameError.
        st.warning(
            "Test case evaluation skipped: the problem description and code file are also required."
        )
    else:
        with st.spinner("Evaluating Test Cases..."):
            tc_text = extract_text_file(testcase_file)
            prompt_tc = build_geval_prompt_testcases(
                problem_text,
                code_text,
                tc_text,
                TESTCASE_METRICS,
            )
            raw_output_tc = run_testcase_geval(prompt_tc)
            tc_eval = extract_json_from_string(raw_output_tc)

        st.subheader("Test Case Evaluation Report")
        if tc_eval:
            tc_text_output = dict_to_readable_text("Test Case Evaluation", tc_eval)
            st.text_area("Test Case Evaluation Report", tc_text_output, height=400)
            report_sections.append(tc_text_output)
        else:
            st.warning("Invalid JSON returned for Test Case Evaluation.")
            st.text(raw_output_tc)

        combined_evaluations["test_cases_evaluation"] = tc_eval

    # ---------------- Save Combined Output ----------------
    # Only sections that rendered successfully are exported, so the report
    # never re-formats a result the display step already rejected.
    if report_sections:
        section_separator = "\n" + "=" * 80 + "\n\n"
        final_text = "\n\n" + section_separator.join(report_sections)

        # Explicit UTF-8 so non-ASCII model output cannot fail on write.
        with open("combined_evaluations.txt", "w", encoding="utf-8") as f:
            f.write(final_text)

        st.success("Combined evaluations saved successfully.")

        st.download_button(
            label="Download Evaluation Report (TXT)",
            data=final_text,
            file_name="combined_evaluations.txt",
            mime="text/plain",
        )



Writing app.py


In [None]:
# Stop any running process whose full command line matches "streamlit".
!pkill -f streamlit


In [None]:
# Launch app.py with Streamlit in the background on port 8501, listening on
# all interfaces; stdout and stderr are redirected to streamlit.log.
!nohup streamlit run app.py \
  --server.port 8501 \
  --server.address 0.0.0.0 \
  > streamlit.log 2>&1 &


In [None]:
# List processes mentioning "streamlit" (the grep process itself is listed too).
!ps aux | grep streamlit


root        1695  0.0  0.0  20960 12960 ?        R    13:33   0:00 /usr/bin/python3 /usr/local/bin/streamlit run app.py --server.port 8501 --server.address 0.0.0.0
root        1696  0.0  0.0   7372  3404 ?        S    13:33   0:00 /bin/bash -c ps aux | grep streamlit
root        1698  0.0  0.0   6480  2376 ?        S    13:33   0:00 grep streamlit


In [None]:
# Download the latest cloudflared Linux amd64 release binary (version is not
# pinned), mark it executable, and install it as /usr/local/bin/cloudflared.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared-linux-amd64
!mv cloudflared-linux-amd64 /usr/local/bin/cloudflared


In [None]:
# Print the installed cloudflared version.
!cloudflared --version


cloudflared version 2025.11.1 (built 2025-11-07-16:59 UTC)


In [None]:
# Open an account-less quick tunnel to http://localhost:8501 (the port passed
# to the Streamlit launch command). Runs in the foreground of this cell.
!cloudflared tunnel --url http://localhost:8501


[90m2026-01-10T13:33:11Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2026-01-10T13:33:11Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2026-01-10T13:33:14Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2026-01-10T13:33:14Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2026