# Importing Dependencies

In [12]:
import os
import json
import time

from ollama import chat

In [13]:
# Run configuration.
# prompt_technique and generator_model select the input directory the
# generated sub-features are read from (../data/<generator_model>/<prompt_technique>/).
prompt_technique = 'chain_of_thought'
generator_model = 'gemini'
# Model name passed to ollama's chat() for every evaluation request.
# NOTE(review): the output paths further down hard-code "deepseek" instead of
# deriving the directory from this value — confirm before switching evaluators.
evaluator_model = 'deepseek-v2:16b'

# Load Results

In [14]:
# Read the generated sub-features for both hierarchy levels from the
# directory selected by generator_model and prompt_technique.
first_level_path = f"../data/{generator_model}/{prompt_technique}/first_level_subfeatures.json"
with open(first_level_path, mode="r", encoding="utf-8") as first_level_file:
    first_level_subfeatures = json.load(first_level_file)

second_level_path = f"../data/{generator_model}/{prompt_technique}/second_level_subfeatures.json"
with open(second_level_path, mode="r", encoding="utf-8") as second_level_file:
    second_level_subfeatures = json.load(second_level_file)

# Evaluation Criteria

In [15]:
# Load the shared evaluation criteria; this path does not depend on the
# generator model or prompt technique.
# NOTE(review): eval_criteria is not referenced by the cells visible in this
# notebook (the criteria are spelled out inside system_prompt) — confirm it is still needed.
with open("../data/eval_criteria.json", mode="r", encoding="utf-8") as criteria_file:
    eval_criteria = json.load(criteria_file)

# Functions

In [16]:
# System message sent with every evaluation request in the Level 1 and
# Level 2 loops. It defines the four criteria the evaluator must report:
# relationship classification, relevance, clarity and feasibility (the
# three ratings are on a 1-5 scale).
# NOTE(review): the text asks for a "clear justification" of relevance
# scores, but the final instruction forbids justifications, and the replies
# are parsed as bare JSON — the two statements contradict each other.
# NOTE(review): "Use the exact format provided below" refers to a format that
# is not in this string; the JSON format is given in the user prompt built
# by get_prompt().
system_prompt = """
You are an expert in mobile app development and requirements engineering.  
Your primary responsibility is to **critically evaluate** the refinement of software features, ensuring that the breakdown of high-level features into sub-features is **logical, precise, relevant, and technically feasible**.  

Your evaluation must adhere **strictly** to the following criteria, applying **a rigorous standard** in your assessment:  

### **Evaluation Criteria**

1. **Feature Relationship Classification**:
   - Classify each sub-feature in relation to the root feature using one of the following:
     - **Sub-feature**: A feature that is directly derived from and dependent on the root feature.
     - **Sibling feature**: A feature at the same hierarchical level as the root feature.
     - **Super feature**: A feature that is broader or encompasses the root feature.
     - **Identical feature**: A feature that is essentially the same as the root feature.
     - **Other**: A feature that does not fit into the above categories.

**Be precise in classification. Misclassification should be flagged.** 

-----

2. **Relevance (Rating: 1 to 5)**:
   - Evaluate how relevant the sub-feature is to the root feature:
     - 5: Highly relevant and a natural extension.
     - 4: Mostly relevant and logically connected.
     - 3: Moderately relevant but might not serve the same purpose.
     - 2: Somewhat relevant, mainly because it belongs to the same app category.
     - 1: Not relevant at all.

**A score of 4 or 5 requires a clear justification. A score of 1 or 2 must explain why the sub-feature does not align.**  

-----

3. **Clarity (Rating: 1 to 5)**:
   - Assess the clarity and understandability of the sub-feature description:
     - 5: Very clear and easily understandable.
     - 4: Mostly clear with minor syntax issues.
     - 3: Somewhat clear but contains ambiguities or is too lengthy.
     - 2: Mostly unclear and difficult to understand.
     - 1: Very unclear or irrelevant.

**If the description lacks sufficient detail or is ambiguous, clarity should not exceed 3.**

-----

4. **Feasibility (Rating: 1 to 5)**:
   - Evaluate how practical and implementable the sub-feature is:
     - 5: Feasible and commonly implemented in existing apps.
     - 4: Feasible but lacks clear real-world examples.
     - 3: Probably feasible but has some uncertainties.
     - 2: Probably not feasible due to technical limitations.
     - 1: Not feasible at all.

**If feasibility is uncertain, assume a **cautious** stance. Avoid overestimating feasibility.**

-----

**Instructions**:
- Provide a structured evaluation for each sub-feature.
- **Use the exact format** provided below for your response. 
- **Do not provide justifications** or explanations for the ratings or classifications.
"""

In [17]:
def get_prompt(feature: str, feature_description: str, super_feature: str, super_feature_description: str,) -> str:
    """Build the user message asking the evaluator to rate one sub-feature.

    Args:
        feature: Name of the sub-feature being evaluated.
        feature_description: Description of the sub-feature.
        super_feature: Name of the parent feature the sub-feature refines.
        super_feature_description: Description of the parent feature.

    Returns:
        The prompt text: the parent feature and the sub-feature, each in a
        fenced block, followed by the JSON format the reply must use.
    """
    parent_block = f"feature: {super_feature}\ndescription: {super_feature_description}"
    child_block = f"sub_feature: {feature}\ndescription: {feature_description}"

    # The empty first and last entries reproduce the leading and trailing
    # newline of the prompt once the lines are joined.
    prompt_lines = [
        "",
        "Given the following mobile app feature and its refined sub-feature, evaluate the sub-feature based on the provided system guidelines.",
        "",
        "**Feature**",
        "```",
        parent_block,
        "```",
        "",
        "**Sub-feature to Evaluate**",
        "```",
        child_block,
        "```",
        "",
        "Provide a structured evaluation of this sub-feature, based on provided criteria.",
        "",
        "Return the evaluation as a structured JSON object in the following format:",
        "{",
        '    "relationship": "Sub-feature | Sibling feature | Super feature | Identical feature | Other",',
        '    "relevance": Rating (1-5),',
        '    "clarity": Rating (1-5),',
        '    "feasibility": Rating (1-5)',
        "}",
        "",
    ]
    return "\n".join(prompt_lines)

# Level 1 Feature Evaluation

In [18]:
# Evaluate every first-level sub-feature against its parent feature with the
# evaluator model and collect one record per sub-feature.
first_level_eval = []

# The system message is identical for every request, so build it once.
system_message = {"role": "system", "content": system_prompt}
json_decoder = json.JSONDecoder()

for super_feature in first_level_subfeatures:
    for feature in super_feature["sub_features"]:
        prompt = get_prompt(
            feature["sub_feature"],
            feature["description"],
            super_feature["feature"],
            super_feature["description"],
        )

        response = chat(
            model=evaluator_model,
            messages=[system_message, {"role": "user", "content": prompt}],
        )
        content = response["message"]["content"]

        # Decode the first JSON object in the reply. Starting at the first
        # "{" skips a Markdown fence or introductory prose, and raw_decode
        # ignores anything after the object. Stripping every "json"
        # substring instead would also corrupt values inside the payload.
        object_start = content.find("{")
        try:
            if object_start == -1:
                raise ValueError("reply contains no JSON object")
            feature_eval = json_decoder.raw_decode(content[object_start:])[0]
        except ValueError as exc:
            # json.JSONDecodeError is a ValueError; re-raise with the feature
            # name and raw reply so the failing request can be identified.
            raise ValueError(
                f"Could not parse the evaluation of sub-feature "
                f"{feature['sub_feature']!r}; raw reply: {content!r}"
            ) from exc

        first_level_eval.append({
            "super_feature": super_feature["feature"],
            "super_feature_description": super_feature["description"],
            "feature": feature["sub_feature"],
            "description": feature["description"],
            "feature_eval": feature_eval,
        })


In [19]:
# Save the first-level evaluation results to a JSON file.
# NOTE(review): the "deepseek" directory is hard-coded rather than derived
# from evaluator_model — confirm this is intended when switching evaluators.
first_level_eval_path = f"../data/deepseek/{prompt_technique}/first_level_eval.json"
# Create the output directory if it is missing so a completed evaluation run
# is not lost to a FileNotFoundError at write time.
os.makedirs(os.path.dirname(first_level_eval_path), exist_ok=True)
with open(first_level_eval_path, "w", encoding="utf-8") as f:
    json.dump(first_level_eval, f, indent=4)

# Level 2 Feature Evaluation

In [20]:
# Evaluate every second-level sub-feature against its parent (a first-level
# sub-feature) with the evaluator model and collect one record per sub-feature.
second_level_eval = []

# The system message is identical for every request, so build it once.
system_message = {"role": "system", "content": system_prompt}
json_decoder = json.JSONDecoder()

for super_feature in second_level_subfeatures:
    for feature in super_feature["sub_features"]:
        prompt = get_prompt(
            feature["sub_feature"],
            feature["description"],
            super_feature["feature"],
            super_feature["description"],
        )

        response = chat(
            model=evaluator_model,
            messages=[system_message, {"role": "user", "content": prompt}],
        )
        content = response["message"]["content"]

        # Decode the first JSON object in the reply. Starting at the first
        # "{" skips a Markdown fence or introductory prose, and raw_decode
        # ignores anything after the object. Stripping every "json"
        # substring instead would also corrupt values inside the payload.
        object_start = content.find("{")
        try:
            if object_start == -1:
                raise ValueError("reply contains no JSON object")
            feature_eval = json_decoder.raw_decode(content[object_start:])[0]
        except ValueError as exc:
            # json.JSONDecodeError is a ValueError; re-raise with the feature
            # name and raw reply so the failing request can be identified.
            raise ValueError(
                f"Could not parse the evaluation of sub-feature "
                f"{feature['sub_feature']!r}; raw reply: {content!r}"
            ) from exc

        second_level_eval.append({
            # NOTE(review): "root_feture" is a misspelling of "root_feature".
            # It is kept unchanged because the key is part of the JSON file
            # written below; rename it together with any consumers.
            "root_feture": super_feature["super_feature"],
            "root_feature_description": super_feature["super_feature_description"],
            "super_feature": super_feature["feature"],
            "super_feature_description": super_feature["description"],
            "feature": feature["sub_feature"],
            "description": feature["description"],
            "feature_eval": feature_eval,
        })


In [21]:
# Save the second-level evaluation results to a JSON file.
# NOTE(review): the "deepseek" directory is hard-coded rather than derived
# from evaluator_model — confirm this is intended when switching evaluators.
second_level_eval_path = f"../data/deepseek/{prompt_technique}/second_level_eval.json"
# Create the output directory if it is missing so a completed evaluation run
# is not lost to a FileNotFoundError at write time.
os.makedirs(os.path.dirname(second_level_eval_path), exist_ok=True)
with open(second_level_eval_path, "w", encoding="utf-8") as f:
    json.dump(second_level_eval, f, indent=4)