In [1]:
import mlflow
import os
import pandas as pd

In [2]:
# Move one level up so the relative paths used below (e.g. 'data/...') resolve.
# NOTE: this is not idempotent - re-running the cell climbs one more directory.
os.chdir(os.pardir)

In [3]:
# Show the contents of the current working directory to confirm where the
# notebook is running from. Pure Python instead of the bare `ls` automagic,
# which only works in single-line cells, needs automagic enabled and depends
# on a Unix `ls` binary.
sorted(os.listdir())

Readme.md         [30m[43mdata[m[m/             [30m[43mnotebooks[m[m/
[30m[43mapp[m[m/              [34mmlruns[m[m/           requirements.txt


In [4]:
# Name prefix of the MLflow experiments this notebook analyses.
EXPERIMENT_PREFIX = "report_summary"

client = mlflow.tracking.MlflowClient()

# Keep only the experiments whose name starts with EXPERIMENT_PREFIX.
experiments = [
    exp for exp in client.search_experiments() if exp.name.startswith(EXPERIMENT_PREFIX)
]

# Defined unconditionally so later cells never hit a NameError on exp_names,
# even when no experiment matches.
exp_names = [exp.name for exp in experiments]

if not exp_names:
    print("No hay experimentos disponibles.")
else:
    print("Experimentos disponibles:")
    for position, name in enumerate(exp_names, start=1):
        print(f"{position}. {name}")

Experimentos disponibles:
1. report_summary_slms_vs_gpt4_1_reference


In [38]:
# Columns of the per-run table, in display order.
RUN_COLUMNS = [
    "video_id",
    "channel_name",
    "prompt_version",
    "model",
    "criterial_score",
    "embedding_cosine_distance",
    "score",
]


def run_to_record(run):
    """Flatten one MLflow run into a row of the per-run table.

    Params or metrics that were not logged for the run come back as None.
    """
    params = run.data.params
    metrics = run.data.metrics
    return {
        "video_id": params.get("video_id"),
        "channel_name": params.get("channel_name"),
        "prompt_version": params.get("prompt_version"),
        "model": params.get("llm_model"),
        # Evaluation metrics
        "criterial_score": metrics.get("criterial_score"),
        "embedding_cosine_distance": metrics.get("embedding_cosine_distance"),
        "score": metrics.get("score"),
    }


# Only the first matching experiment is analysed.
experiment = client.get_experiment_by_name(exp_names[0])

# search_runs returns a single page of results; follow the page token so that
# experiments with more runs than one page holds are not silently truncated.
runs = []
page_token = None
while True:
    page = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=[
            "start_time DESC",
        ],
        page_token=page_token,
    )
    runs.extend(page)
    page_token = page.token
    if not page_token:
        break

if not runs:
    print("No hay ejecuciones registradas.")
else:
    print(f"Se encontraron {len(runs)} ejecuciones registradas.")

# One record per run. Passing the columns explicitly keeps df defined (empty,
# but with the expected schema) even when there are no runs.
data = [run_to_record(run) for run in runs]
df = pd.DataFrame(data, columns=RUN_COLUMNS)


Se encontraron 442 ejecuciones registradas.


In [39]:
# Print the column dtypes and non-null counts of the per-run table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   video_id                   442 non-null    object 
 1   channel_name               442 non-null    object 
 2   prompt_version             442 non-null    object 
 3   model                      442 non-null    object 
 4   criterial_score            442 non-null    float64
 5   embedding_cosine_distance  442 non-null    float64
 6   score                      442 non-null    float64
dtypes: float64(3), object(4)
memory usage: 24.3+ KB


In [40]:
# Semicolon-separated CSV read below; the path is relative to the current
# working directory.
test_dataset_path = 'data/slm_summaries/test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert.csv'

df_test = pd.read_csv(
    test_dataset_path,
    sep=";",
)

# Print the column dtypes and non-null counts of the loaded frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   video_id      221 non-null    object
 1   channel_name  221 non-null    object
 2   prompt        221 non-null    object
 3   text          221 non-null    object
 4   summary       221 non-null    object
 5   slm_prompt    221 non-null    object
 6   slm_summary   221 non-null    object
dtypes: object(7)
memory usage: 12.2+ KB


In [41]:
import tiktoken

# cl100k_base is used as an approximation of the Llama 3.2 tokenizer, so the
# counts below are estimates rather than exact model token counts.
tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Return the number of cl100k_base tokens in `text`."""
    return len(tokenizer.encode(text))


# Token length of each row's 'slm_prompt' column.
df_test["slm_tokens"] = df_test["slm_prompt"].apply(count_tokens)

# Keep only the join keys plus the token count.
sel_columns = ['video_id', 'channel_name', 'slm_tokens']
df_test_filter = df_test[sel_columns]

In [42]:
# Attach the prompt token count to every run, matching on channel and video.
# validate="many_to_one" makes pandas raise a MergeError if the token table
# holds duplicate (channel_name, video_id) keys, instead of silently
# multiplying run rows and skewing the aggregates computed later.
df_joined = pd.merge(
    df,
    df_test_filter,
    on=["channel_name", "video_id"],
    how="inner",
    suffixes=('', '_test'),
    validate="many_to_one",
)
df_joined.info()
df_joined.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   video_id                   442 non-null    object 
 1   channel_name               442 non-null    object 
 2   prompt_version             442 non-null    object 
 3   model                      442 non-null    object 
 4   criterial_score            442 non-null    float64
 5   embedding_cosine_distance  442 non-null    float64
 6   score                      442 non-null    float64
 7   slm_tokens                 442 non-null    int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 27.8+ KB


Unnamed: 0,video_id,channel_name,prompt_version,model,criterial_score,embedding_cosine_distance,score,slm_tokens
0,QEzWdecJPKM,USACRYPTONOTICIAS,v3_summary_expert,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,1.0,0.104214,7.0,816
1,oi9z9YkeUZ8,USACRYPTONOTICIAS,v3_summary_expert,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,0.0,0.208968,2.0,5117
2,F4-oXv3oB9w,USACRYPTONOTICIAS,v3_summary_expert,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,0.0,0.140089,2.0,21970
3,84zFrrHaBCw,USACRYPTONOTICIAS,v3_summary_expert,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,0.0,0.167937,5.0,2216
4,kE4PHBzjK9w,USACRYPTONOTICIAS,v3_summary_expert,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,1.0,0.112821,7.0,1604


In [58]:
# Mean of every evaluation metric per (model, prompt_version) over all runs.
# The identifier columns are dropped first so only metrics get aggregated.
metrics_only = df.drop(columns=["video_id", "channel_name"])
by_model_and_prompt = metrics_only.groupby(["model", "prompt_version"])
df_grouped_1 = by_model_and_prompt.agg(['mean']).reset_index()

df_grouped_1

Unnamed: 0_level_0,model,prompt_version,criterial_score,embedding_cosine_distance,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean
0,llama3_2_3b_instruct_fp16,v3_summary_expert,0.027149,0.4282,2.099548
1,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,v3_summary_expert,0.140271,0.224712,3.330317


In [64]:
# Token threshold used to split the runs by prompt length.
# Original note: block_size=8192 or max_seq_length: 4096
# (presumably the training/context settings of the model - confirm).
contex = 4096

# Runs whose prompt is shorter than the threshold.
is_below_threshold = df_joined["slm_tokens"] < contex

# Row count and mean of each metric per (model, prompt_version) on that subset.
df_grouped_1 = (
    df_joined.loc[is_below_threshold]
    .drop(columns=["video_id", "channel_name", "slm_tokens"])
    .groupby(["model", "prompt_version"])
    .agg(['count', 'mean'])
    .reset_index()
)

df_grouped_1

Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,count,mean,count,mean
0,llama3_2_3b_instruct_fp16,v3_summary_expert,119,0.05042,119,0.380516,119,2.756303
1,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,v3_summary_expert,119,0.201681,119,0.192671,119,4.268908


In [65]:
# Runs whose prompt has at least `contex` tokens (threshold set in an earlier cell).
is_at_or_above_threshold = df_joined["slm_tokens"] >= contex

# Row count and mean of each metric per (model, prompt_version) on that subset.
df_grouped_1 = (
    df_joined.loc[is_at_or_above_threshold]
    .drop(columns=["video_id", "channel_name", "slm_tokens"])
    .groupby(["model", "prompt_version"])
    .agg(['count', 'mean'])
    .reset_index()
)

df_grouped_1

Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,count,mean,count,mean
0,llama3_2_3b_instruct_fp16,v3_summary_expert,102,0.0,102,0.483832,102,1.333333
1,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_...,v3_summary_expert,102,0.068627,102,0.262093,102,2.235294
