In [None]:
import warnings
from loguru import logger


# Suppress all Python warnings for the rest of the session
warnings.filterwarnings(action='ignore')


def _discard_log(msg):
    """Sink that drops every log message handed to it."""
    return None


# Drop the existing loguru handlers, then register a sink that discards
# whatever reaches it (ERROR level and above)
logger.remove()
logger.add(_discard_log, level="ERROR")

In [None]:
import sys
import os

# Make the parent directory importable so the `backend` package resolves;
# the entry is appended at most once, so re-running the cell is harmless
backend_path = os.path.abspath(os.pardir)
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

from backend.causal_models.factory import CausalLMFactory
from backend.schemas import SearchResult, GenerationConfig

# Building dataset from file

In [None]:
from dataset import dataset
from PIL import Image

# Convert the raw dataset (an iterable of samples, each an iterable of
# search-result dicts with "text", "document_name" and "image" keys) into
# lists of SearchResult objects.
artificial_dataset = []

for sample in dataset:
    artificial_sample = []
    for search_result in sample:
        # Resolve the image for THIS result only. Previously `img` was assigned
        # only when an image path was present, so a result without an image
        # either raised NameError (first iteration) or silently reused the
        # image of the preceding result.
        # NOTE(review): assumes SearchResult accepts image=None - confirm
        # against backend.schemas.
        image_path = search_result["image"]
        img = Image.open(image_path) if image_path else None
        artificial_sample.append(
            SearchResult(
                text=search_result["text"],
                document_name=search_result["document_name"],
                image=img
            )
        )
    artificial_dataset.append(artificial_sample)

In [None]:
# Evaluation questions; the inference loops pair them by position with the
# samples in the dataset. Long literals are wrapped with implicit string
# concatenation, the resulting strings are unchanged.
questions = [
    (
        "What was the trend or progression of Arctic sea ice cover "
        "in the period between 1991-2020?"
    ),
    "Which AI Model did perform best in forecasting certain weather situations?",
    "How many countries build MHEWS in the year 2020?",
    (
        "What are the estimated ozone exposure levels in South Asia? "
        "(Provide specific numbers if possible.)"
    ),
    (
        "Where were the most Meteorological event recorded? "
        "(Meteorological event: storm, winter storm,severe weather, hail, "
        "tornado, local storm) Give numbers if possible."
    ),
    "Around what time of year is the ozone hole at its largest?",
    "What was the last measured sea level in mm approximately?",
    (
        "In which year was the long-term average of the global combined "
        "surface air temperature over land and sea crossed?"
    ),
]

# Inference

## Preparation

In [None]:
# Generation settings shared by the language and the multimodal run.
# NOTE(review): if these options are forwarded to Hugging Face `generate()`,
# temperature/top_k only take effect with do_sample=True and length_penalty
# only with beam search (num_beams > 1) - confirm this combination is intended.
generation_config = GenerationConfig(
    max_new_tokens=150,
    no_repeat_ngram_size=3,
    temperature=1,
    top_k=90,
    num_beams=1,
    do_sample=False,
    length_penalty=-0.7,
)

In [None]:
from jinja2 import Environment, FileSystemLoader

# Directory (relative to the notebook) that holds the Jinja2 prompt templates
template_dir = "./prompt_templates"

# Jinja2 environment that resolves template names inside template_dir
env = Environment(
    loader=FileSystemLoader(template_dir),
)

## Language Model

In [None]:
# Prompt template for the language model run
template = env.get_template('language_prompt.j2')

# Build the model through the factory and attach the template to it
language_model_name = "HuggingFaceH4/zephyr-7b-beta"
model = CausalLMFactory.get_model(language_model_name)
model.set_prompt_template(template)

In [None]:
# One record (question, answer, context) per question for the language model
language_results = []

# Questions and dataset samples are paired by position; zip stops at the
# shorter of the two sequences
for question, sample in zip(questions, artificial_dataset):
    # Context stored with the answer: the texts of all search results in the sample
    context = "\n".join(s.text for s in sample)
    answer = model.generate(question, sample, **generation_config.dict())
    language_results.append(
        dict(question=question, answer=answer, context=context)
    )

In [None]:
import pandas as pd

# Tabulate the language model records, persist them as CSV (the DataFrame
# index is written as the first column) and preview the first rows
results = pd.DataFrame(data=language_results)
results.to_csv(path_or_buf="language_results.csv")
results.head()

In [None]:
import torch

import gc

# Clean up: drop the notebook's reference to the language model before the
# next model is loaded.
del model
# `del` only removes the name; objects still held in reference cycles are not
# freed until the garbage collector runs, and empty_cache() can only return
# GPU memory that is no longer referenced by any tensor.
gc.collect()
torch.cuda.empty_cache()

## Multimodal Model

In [None]:
# Prompt template for the multimodal model run
template = env.get_template('multimodal_prompt.j2')

# Build the model through the factory and attach the template to it
multimodal_model_name = "HuggingFaceM4/idefics2-8b-chatty"
model = CausalLMFactory.get_model(multimodal_model_name)
model.set_prompt_template(template)

In [None]:
# One record (question, answer, context) per question for the multimodal model
multimodal_results = []

# Questions and dataset samples are paired by position; zip stops at the
# shorter of the two sequences
for question, sample in zip(questions, artificial_dataset):
    # Context stored with the answer: the texts of all search results in the sample
    context = "\n".join(s.text for s in sample)
    answer = model.generate(question, sample, **generation_config.dict())
    multimodal_results.append(
        dict(question=question, answer=answer, context=context)
    )

In [None]:
# Tabulate the multimodal records, persist them as CSV (the DataFrame index
# is written as the first column) and preview the first rows
results = pd.DataFrame(data=multimodal_results)
results.to_csv(path_or_buf="multimodal_results.csv")
results.head()

# Evaluation

In [None]:
from langchain_openai import ChatOpenAI 
from langchain.evaluation import load_evaluator
import os

# The API key is taken from the environment (variable OPEN_AI) and never
# written into the notebook. Previously the value read here was unused and
# OPENAI_API_KEY was overwritten with a hard-coded literal.
open_ai_auth = os.getenv("OPEN_AI")

if open_ai_auth:
    # Expose the key under the variable name the OpenAI client looks up
    os.environ["OPENAI_API_KEY"] = open_ai_auth
elif not os.getenv("OPENAI_API_KEY"):
    # Fail early with a clear message instead of an authentication error
    # in the middle of the evaluation loop
    raise RuntimeError(
        "No OpenAI API key found: set the OPEN_AI or OPENAI_API_KEY "
        "environment variable before running the evaluation."
    )

# GPT-4o acts as the judge; the "labeled_criteria" evaluator scores each
# prediction for correctness against a supplied reference text
evaluation_llm = ChatOpenAI(model="gpt-4o")
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
    llm=evaluation_llm,
    requires_reference=True,
)

## Language Model

In [None]:
# Reload the saved language model results; the index column written by
# to_csv comes back as an extra leading column, which the evaluation loop
# accounts for when it unpacks four values per row
language_results = pd.read_csv(filepath_or_buffer="language_results.csv")

In [None]:
# Evaluator verdicts, one entry per row of language_results
gpt_language_evaluation = []

for i, row in language_results.iterrows():
    # Each row holds four values: the saved index column (ignored),
    # then question, answer and context
    _, question, answer, context = row
    eval_result = evaluator.evaluate_strings(
        input=question,
        prediction=answer,
        reference=context,
    )
    gpt_language_evaluation.append(eval_result)

In [None]:
# Tabulate the evaluator verdicts, persist them as CSV and preview the first rows
res = pd.DataFrame(data=gpt_language_evaluation)
res.to_csv(path_or_buf="gpt_evaluation.csv")
res.head()