In [1]:
import google.generativeai as genai
from secret import GEMINI_API_KEY
genai.configure(api_key=GEMINI_API_KEY)
from deepeval.models.base_model import DeepEvalBaseLLM
import time

In [2]:
# Safety filter configuration passed to every Gemini request made below.
# Every listed harm category is set to BLOCK_NONE so that the judge model is
# not stopped from reading/scoring news articles that mention sensitive topics.
# Each category appears exactly once (the original list repeated
# HARM_CATEGORY_DANGEROUS_CONTENT).
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_DANGEROUS_CONTENT",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    )
]

In [3]:
class Gemini(DeepEvalBaseLLM):
    """DeepEval LLM adapter backed by a Google Generative AI (Gemini) model.

    Wraps ``genai.GenerativeModel`` so it can be passed as the ``model`` of a
    DeepEval metric. Requests use a single candidate at temperature 0.0 and the
    module-level ``safety_settings``.
    """

    def __init__(self, model):
        """Create the underlying ``genai.GenerativeModel``.

        Args:
            model: Gemini model name, e.g. ``"gemini-1.5-flash-latest"``.
        """
        self.model = genai.GenerativeModel(model)

    def load_model(self):
        """Return the wrapped ``genai.GenerativeModel`` instance."""
        return self.model

    def _request_kwargs(self, prompt: str) -> dict:
        """Build the keyword arguments shared by the sync and async calls."""
        return {
            'contents': prompt,
            # A fresh dict per call so no request can observe another's config.
            'generation_config': {'candidate_count': 1, 'temperature': 0.0},
            'safety_settings': safety_settings,
        }

    def generate(self, prompt: str) -> str:
        """Synchronously generate a completion for ``prompt`` and return its text."""
        summary_model = self.load_model()
        return summary_model.generate_content(**self._request_kwargs(prompt)).text

    async def a_generate(self, prompt: str) -> str:
        """Asynchronously generate a completion for ``prompt`` and return its text.

        Uses ``generate_content_async``; the synchronous ``generate_content``
        returns a plain response object that cannot be awaited.
        """
        summary_model = self.load_model()
        res = await summary_model.generate_content_async(
            **self._request_kwargs(prompt)
        )
        return res.text

    def get_model_name(self):
        """Return the display name reported to DeepEval."""
        return "Gemini AI Model"

In [4]:
# Judge model instance: Gemini 1.5 Flash wrapped for DeepEval.
custom_model_gemini = Gemini(model="gemini-1.5-flash-latest")

In [5]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
import json

In [6]:
# G-Eval metric scoring the factual consistency of a summary (ACTUAL_OUTPUT)
# against its source article (INPUT).
consistency_metric = GEval(
    name="Consistency",
    criteria="the factual alignment between the summary and the summarized source. A factually consistent summary contains only statements that are entailed by the source document. Annotators were also asked to penalize summaries that contained hallucinated facts.",
    evaluation_steps=[
        "Read the news article carefully and identify the main facts and details it presents.",
        "Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.",
        "Assign a score for consistency based on the Evaluation Criteria.",
    ],
    # Reuse the Gemini judge created in the previous cell instead of building a
    # second, identical instance; keeps the model name defined in one place.
    model=custom_model_gemini,
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    async_mode=False,
)

In [7]:
# NOTE: despite the `_dir` suffix, both values are paths to JSONL *files*.
# Input: read line by line by the evaluation loop; each line is parsed as JSON
# and its 'article' and 'abstract' fields are used.
src_dir = 'data/cnndm_sumllm/gpt4/train.jsonl'
# Output: one JSON object ('score', 'reason') is appended per evaluated record.
store_dir = 'data/cnndm_sumllm/gpt4/gemini-consistency_results.jsonl'

In [None]:
# Tunable run parameters (previously inline magic numbers).
MAX_EXAMPLES = 11          # the original loop stopped after index 10, i.e. 11 lines
REQUEST_INTERVAL = 3       # seconds to pause before every request
RETRY_DELAYS = (10, 20)    # seconds to wait before the 2nd and 3rd attempt


def measure_with_retry(metric, test_case, retry_delays=RETRY_DELAYS):
    """Run ``metric.measure(test_case=...)``, retrying after each failure.

    One initial attempt is made, followed by one further attempt per entry in
    ``retry_delays`` (sleeping that many seconds first).

    Args:
        metric: Object exposing ``measure(test_case=...)``.
        test_case: The test case to evaluate.
        retry_delays: Seconds to sleep before each retry.

    Returns:
        True if an attempt succeeded, False if every attempt raised.
    """
    # The trailing None marks the final attempt, after which we give up.
    for delay in (*retry_delays, None):
        try:
            metric.measure(test_case=test_case)
        except Exception as e:
            # Surface the error instead of discarding it, so failures that are
            # not load-related (bad key, malformed input, ...) are visible.
            print(f"measure failed: {e!r}")
            if delay is None:
                print('the load is too high, please try again later')
                return False
            print(f"wait for {delay} seconds")
            time.sleep(delay)
        else:
            return True
    return False


# Stream the source file instead of loading every line into memory; only the
# first MAX_EXAMPLES lines are ever looked at.
with open(src_dir, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= MAX_EXAMPLES:
            break
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
            continue
        data = json.loads(line)
        test_case = LLMTestCase(
            input=data['article'],
            actual_output=data['abstract']
        )
        time.sleep(REQUEST_INTERVAL)
        if not measure_with_retry(consistency_metric, test_case):
            # Stop the run without writing a result for the failed record.
            break
        # Append and close per record so finished results survive an
        # interruption later in the run.
        with open(store_dir, 'a', encoding='utf-8') as f2:
            f2.write(json.dumps({
                'score': consistency_metric.score,
                'reason': consistency_metric.reason
            }) + '\n')