In [None]:
#https://python.langchain.com/docs/concepts/evaluation/
#https://docs.smith.langchain.com/evaluation/how_to_guides/use_langchain_off_the_shelf_evaluators_old#use-string-or-embedding-distance-metrics
#https://docs.smith.langchain.com/reference/sdk_reference/langchain_evaluators

In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir('..')
from app.llm.run_evaluation import EvaluateModels

In [3]:
load_dotenv()

True

In [5]:
test_dataset_path = 'data/slm_summaries/test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert.csv'


In [6]:
file_name = test_dataset_path.split('/')[-1].replace('.csv', '')
model_name = file_name.removeprefix('test_slm_').split('_v')[0]
prompt_version = "v"+file_name.removeprefix('test_slm_').split('_v')[-1]
file_name,model_name,prompt_version

('test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert',
 'llama3_2_3b_instruct_fp16',
 'v3_summary_expert')

In [7]:
df_test = pd.read_csv(test_dataset_path,sep=";")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   video_id      221 non-null    object
 1   channel_name  221 non-null    object
 2   prompt        221 non-null    object
 3   text          221 non-null    object
 4   summary       221 non-null    object
 5   slm_prompt    221 non-null    object
 6   slm_summary   221 non-null    object
dtypes: object(7)
memory usage: 12.2+ KB


In [8]:
index =1

# Metodos de evaluacion de respuestas modelo

In [38]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator("qa")
evaluator.evaluate_strings(
    prediction=df_test["slm_summary"][index],
    input=df_test["slm_prompt"][index],
    reference=df_test["summary"][index],
)

{'reasoning': 'INCORRECT', 'value': 'INCORRECT', 'score': 0}

In [31]:
from langchain_openai import OpenAIEmbeddings

open_ia_embedding_model = OpenAIEmbeddings(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    model= 'text-embedding-3-large',#"text-embedding-ada-002" #text-embedding-3-large
)

In [34]:
from langchain.evaluation.embedding_distance import EmbeddingDistanceEvalChain,EmbeddingDistance
chain = EmbeddingDistanceEvalChain(embeddings=open_ia_embedding_model,distance_metric=EmbeddingDistance.COSINE)
result = chain.evaluate_strings(prediction=df_test["slm_summary"][index],
                                reference=df_test["summary"][index])
print(result)

{'score': 0.5147789587583859}


In [39]:
from langchain_openai import ChatOpenAI
from langchain.evaluation.criteria import LabeledCriteriaEvalChain
llm = ChatOpenAI(model="gpt-4o", temperature=0)
#criteria = "correctness"
criteria = {"correctness":"Is the submission correct, accurate, and factual?.If so, respond Y. If not, respond N.",
            "conciseness":"Is the submission concise and to the point?. If so, respond Y. If not, respond N.",
            "coherence":"Is the submission coherent, well-structured, and organized?. If so, respond Y. If not, respond N.",
            "detail":"Does the submission demonstrate attention to detail?. If so, respond Y. If not, respond N.",
            "hallucination":"Does the submission contain any hallucinations or inaccuracies?. If so, respond Y. If not, respond N."
            }

evaluator = LabeledCriteriaEvalChain.from_llm(
    llm=llm,
    criteria=criteria,
)
evaluator.evaluate_strings(
  prediction=df_test["slm_summary"][index],
  input=df_test["slm_prompt"][index],
  reference=df_test["summary"][index],
  )

{'reasoning': "To evaluate the submission against the criteria, let's analyze each one:\n\n1. **Correctness**: The submission incorrectly states that the provided text is not related to investments or finance, which is inaccurate. The text is indeed about financial markets, investment strategies, and specific assets. Therefore, the submission is not correct or factual.\n\n2. **Conciseness**: The submission is concise and to the point, as it quickly addresses the perceived issue with the input text. However, it fails to address the task requirements, which affects its overall relevance.\n\n3. **Coherence**: The submission is coherent in its structure and organization, as it clearly communicates its misunderstanding of the task. However, it does not follow the requested format for the analysis, which affects its coherence in the context of the task.\n\n4. **Detail**: The submission lacks attention to detail because it does not engage with the content of the input text or provide the stru

In [9]:
from langchain_openai import ChatOpenAI
from langchain.evaluation.criteria import LabeledCriteriaEvalChain
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
#criteria = "correctness"
# criteria = {"correctness":"Is the submission correct, accurate, and factual?.If so, respond Y. If not, respond N.",
#             "conciseness":"Is the submission concise and to the point?. If so, respond Y. If not, respond N.",
#             "coherence":"Is the submission coherent, well-structured, and organized?. If so, respond Y. If not, respond N.",
#             "detail":"Does the submission demonstrate attention to detail?. If so, respond Y. If not, respond N.",
#             "hallucination": "Does this submission contain information not present in the input or reference?. If so, respond Y. If not, respond N."
#             }
criteria = {
            "faithfulness": "Is the summary accurate and consistent with the source text, without hallucinations or fabricated facts? If yes, respond Y. If no, respond N.",
            "relevance": "Does the summary include the main information from the source text and omit insignificant details? If yes, respond Y. If no, respond N.",
            "conciseness": "Is the summary concise and free of unnecessary repetition or verbosity? If yes, respond Y. If no, respond N.",
            "coherence": "Is the summary well-structured, clear, and easy to follow? If yes, respond Y. If no, respond N.",
        }

evaluator = LabeledCriteriaEvalChain.from_llm(
    llm=llm,
    criteria=criteria,
)
evaluation = evaluator.evaluate_strings(
  prediction=df_test["slm_summary"][index],
  input=df_test["prompt"][index],
  reference=df_test["summary"][index],
  )

2025-04-22 17:32:24,496 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [11]:
razonamiento = evaluation.get("reasoning", "No reasoning")
valor = evaluation.get("value", "NaN")
score = evaluation.get("score", 0)

valor,score,razonamiento

('N',
 0,
 "1. **Faithfulness**: The submission states that the provided text is not a response to a specific question or topic related to investments or finance, but rather a YouTube content piece sharing thoughts and opinions. This is inaccurate because the text does indeed contain specific investment analysis and insights, particularly regarding Warren Buffett's actions and the performance of various markets. Therefore, the submission does not accurately reflect the content of the source text.\n\n2. **Relevance**: The submission fails to summarize or extract any of the main information from the source text. Instead, it dismisses the content as irrelevant, which is not the case. The source text contains valuable insights for investors, including market performance, specific stock recommendations, and analysis of investment strategies. Thus, the submission does not include the main information and omits significant details.\n\n3. **Conciseness**: The submission is concise in its respo

In [None]:
from langchain.evaluation.scoring import ScoreStringEvalChain

chain = ScoreStringEvalChain.from_llm(llm=llm)
result = chain.evaluate_strings(
    input = "What is the chemical formula for water?",
    prediction = "H2O",
    reference = "The chemical formula for water is H2O.",
)
result


This chain was only tested with GPT-4. Performance may be significantly worse with other models.
To use a reference, use the LabeledScoreStringEvalChain instead. (EvaluatorType.LABELED_SCORE_STRING) instead.


{'reasoning': 'The response provided by the AI assistant is concise and directly answers the question. The chemical formula for water is indeed "H2O," which is a well-known and universally accepted fact. The response is helpful and relevant to the question, as it provides the correct chemical formula without any unnecessary information. While the answer is brief, it is appropriate given the straightforward nature of the question. There is no need for additional depth in this context, as the question does not require further explanation or elaboration. Overall, the response is correct, accurate, and factual.\n\nRating: [[10]]',
 'score': 10}

In [None]:
result = chain.evaluate_strings(
    input=df_test["prompt"][index],
    prediction=df_test["slm_summary"][index],
    reference=df_test["summary"][index],
)
result



{'score': 0.5167421905677069}

# registrar las evaluaciones con MLflow

In [7]:
evaluate_slms = EvaluateModels(llm_evaluator_name = "gpt-4.1",  model_embedding_name = "text-embedding-3-large")

In [8]:
import warnings

warnings.filterwarnings('ignore')

In [9]:
test_dataset_path = 'data/slm_summaries/test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert.csv'
file_name = test_dataset_path.split('/')[-1].replace('.csv', '')
model_name = file_name.removeprefix('test_slm_').split('_v')[0]
prompt_version = "v"+file_name.removeprefix('test_slm_').split('_v')[-1]
file_name,model_name,prompt_version

('test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert',
 'llama3_2_3b_instruct_fp16',
 'v3_summary_expert')

In [10]:
evaluate_slms.run_evaluation(
    experiment_name= "report_summary_slms_vs_gpt4_1_reference",
    dataset_path = test_dataset_path,
    model_name = model_name,
    prompt_version = prompt_version
)

2025-04-22 17:42:07,114 - INFO - 📊 Dataset cargado: data/slm_summaries/test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert.csv
2025/04/22 17:42:07 INFO mlflow.tracking.fluent: Experiment with name 'report_summary_slms_vs_gpt4_1_reference' does not exist. Creating a new experiment.


📊 Experimento MLflow: report_summary_slms_vs_gpt4_1_reference


2025-04-22 17:42:14,939 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-22 17:42:16,130 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-22 17:42:20,365 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

Tu tarea es analizar el siguiente te...' (16521 characters) is truncated to 6000 characters to meet the length limit.
2025-04-22 17:42:20,386 - INFO - [1] OK: Labeled:{'reasoning': 'Let\'s evaluate the submission against each criterion step by step:\n\n**1. Faithfulness:**  \n- The submission does not fabricate facts or hallucinate information. It provides general advice about evaluating investments and risk, which is broadly consistent with the cautionary tone of the source text.\n- However, it does not accurately summarize the main points of the source text. The source text is a detailed, critical analysis of the myths around living from trading, with spec

In [11]:
test_dataset_path = 'data/slm_summaries/test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert.csv'
file_name = test_dataset_path.split('/')[-1].replace('.csv', '')
model_name = file_name.removeprefix('test_slm_').split('_v')[0]
prompt_version = "v"+file_name.removeprefix('test_slm_').split('_v')[-1]
file_name,model_name,prompt_version

('test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert',
 'llama3_2_3b_instruct_fp16',
 'v3_summary_expert')

In [26]:
df = pd.read_csv(test_dataset_path,sep=";")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   video_id      221 non-null    object
 1   channel_name  221 non-null    object
 2   prompt        221 non-null    object
 3   text          221 non-null    object
 4   summary       221 non-null    object
 5   slm_prompt    221 non-null    object
 6   slm_summary   221 non-null    object
dtypes: object(7)
memory usage: 12.2+ KB


In [30]:
import tiktoken

# Crear el codificador para gpt-4o
tokenizer = tiktoken.encoding_for_model("gpt-4o")

# Contar tokens en la columna 'slm_summary'
df["text_tokens"] = df["text"].apply(lambda x: len(tokenizer.encode(x)))

# Calcular el percentil 75
p95 = df["text_tokens"].quantile(0.90)

# Filtrar filas donde el número de tokens es mayor al percentil 75
df_p95 = df[df["text_tokens"] < p95]

df_p95.info()

<class 'pandas.core.frame.DataFrame'>
Index: 198 entries, 0 to 220
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   video_id      198 non-null    object
 1   channel_name  198 non-null    object
 2   prompt        198 non-null    object
 3   text          198 non-null    object
 4   summary       198 non-null    object
 5   slm_prompt    198 non-null    object
 6   slm_summary   198 non-null    object
 7   text_tokens   198 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 13.9+ KB


In [31]:
df[['text_tokens']].describe()

Unnamed: 0,text_tokens
count,221.0
mean,6432.665158
std,6669.016202
min,36.0
25%,2052.0
50%,3094.0
75%,10215.0
max,27452.0


In [32]:
df_p95[['text_tokens']].describe()

Unnamed: 0,text_tokens
count,198.0
mean,4734.818182
std,4600.600926
min,36.0
25%,1996.75
50%,2925.0
75%,4841.25
max,18342.0
