# Generar Queries para evaluar y entrenar con Mistral

Para generar las queries de evaluación y entrenamiento, utilizamos Mixtral-8x7B-Instruct (de Mistral AI) a través de la API de Together.

En base a un considerando de una consulta, se pide que se genere una consulta que debería retornar este considerando.



In [30]:
from __future__ import annotations

import logging
import os
import re
from functools import lru_cache, partial, wraps
from typing import Callable, Optional, Tuple

import openai
from dotenv import load_dotenv

# Load environment variables (expected to include TOGETHER_API_KEY) from a
# local .env file before anything reads them.
load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# https://api.together.xyz/settings/api-keys
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
if not TOGETHER_API_KEY:
    # Fail fast with an actionable message. Passing api_key=None would make
    # the OpenAI client fall back to the OPENAI_API_KEY environment variable,
    # i.e. an unrelated credential could be sent to the Together endpoint.
    raise RuntimeError(
        "TOGETHER_API_KEY is not set; export it or add it to a .env file "
        "(see https://api.together.xyz/settings/api-keys)."
    )

# The stock OpenAI client is reused against Together's endpoint.
client = openai.OpenAI(
    api_key=TOGETHER_API_KEY,
    base_url="https://api.together.xyz",
)


In [31]:
# System message sent with every request: it tells the model to answer with
# nothing but a Spanish search query for the paragraph it receives.
# The text is assembled from adjacent literals (one per prompt line) so the
# trailing spaces and leading indentation inside the prompt are explicit.
_SYSTEM_INSTRUCTIONS = (
    "You are an intelligent query generator. Your task is to create a detailed and relevant search query in Spanish based on the given paragraph. \n"
    "    The query should be more than just a keyword; it should reflect the main themes or ideas of the paragraph in a way that, when used in a semantic search engine, \n"
    "    would likely return the given paragraph as a top result. \n"
    "    -Based on the following paragraph, create a search query in Spanish that captures the essence of this text.\n"
    "    -Respond only with the query.\n"
    "    -The response should not have more than 100 words\n"
    "    -Don't give explanations\n"
    "    -Never elaborate\n"
)
BASE_PROMPT = dict(role="system", content=_SYSTEM_INSTRUCTIONS)

# User-message template; the ``{parrafo}`` placeholder receives the paragraph text.
CONTENT_PROMPT = "Paragraph: {parrafo}\nquery:"
    
# Pre-configured chat-completion call: the model and the token limit are fixed
# here, callers supply the remaining arguments (e.g. ``messages``).
_COMPLETION_DEFAULTS = {
    "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "max_tokens": 1024,
}
queries = partial(client.chat.completions.create, **_COMPLETION_DEFAULTS)

def cache_with_logging(maxsize: Optional[int] = 1024) -> Callable[[Callable], Callable]:
    """Return a decorator that memoizes a function and logs every cache hit.

    The decorated function is wrapped with ``functools.lru_cache``; each call
    that is answered from the cache is reported at INFO level on the module
    logger.

    Args:
        maxsize: Forwarded to ``lru_cache``. ``None`` makes the cache unbounded.

    Returns:
        A decorator. The function it produces keeps the wrapped function's
        metadata and additionally exposes the underlying cache's
        ``cache_info()`` and ``cache_clear()``.
    """

    def decorator(func: Callable) -> Callable:
        cached_func = lru_cache(maxsize=maxsize)(func)

        @wraps(func)
        def wrapper(*args, **kwargs):
            # A hit is detected by comparing the cache's hit counter before
            # and after the call.
            hits_before = cached_func.cache_info().hits
            result = cached_func(*args, **kwargs)
            if cached_func.cache_info().hits > hits_before:
                logger.info("Cache hit for args: %s, kwargs: %s", args, kwargs)
            return result

        # Re-export the cache controls, which the plain wrapper would hide.
        wrapper.cache_info = cached_func.cache_info
        wrapper.cache_clear = cached_func.cache_clear
        return wrapper

    return decorator

# Set maxsize to the number of (query, context) pairs, or to None for an
# unbounded cache (not in production).
@cache_with_logging(maxsize=None)
def gen_query(paragraph: str) -> str:
    """Ask the model for a search query that should retrieve ``paragraph``.

    The request consists of the system prompt (``BASE_PROMPT``) followed by a
    user message built from ``CONTENT_PROMPT``. Results are memoized per
    paragraph by the decorator, so repeated paragraphs do not trigger a new
    request.

    Args:
        paragraph: Text the generated query is meant to match.

    Returns:
        The message content of the first choice in the model's response.
    """
    user_message = {
        "role": "user",
        "content": CONTENT_PROMPT.format(parrafo=paragraph),
    }
    completion = queries(messages=[BASE_PROMPT, user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content


In [5]:
import pandas as pd

In [6]:
# Load the golden dataset (JSON Lines: one JSON record per line) into a DataFrame.
GOLDEN_DATASET_PATH = '../../../../golden_dataset.jsonl'
df = pd.read_json(GOLDEN_DATASET_PATH, encoding='utf-8', lines=True)

In [7]:
# Draw a random subset of up to 100 rows to generate queries for. Capping the
# size at len(df) avoids the ValueError pandas raises when asked to sample
# more rows than exist (sampling is without replacement).
# NOTE(review): no random_state is passed, so every run draws a different subset.
df2 = df.sample(n=min(100, len(df)))

In [None]:
%%time
import json
output_json_file = './queries_mixtral.jsonl'

for i, row in df2.iterrows():
    completition = gen_query(row.text)
    respuesta = {
        'id':row.CRR_DOCUMENTO_ID,
        'query':completition,
        'text':row.text,
    }
    print(row.CRR_DOCUMENTO_ID)
    with open(output_json_file, "a") as file:
        json.dump(respuesta, file)
        file.write("\n")

