In [1]:
import mlflow
import os
import pandas as pd

In [2]:
# Move the kernel to the project root so relative paths such as "data/..."
# resolve from there.
# Guarded so that re-running this cell does not climb one more directory each
# time. Assumes the project root is the directory that contains "data/" and
# that the notebook's own folder does not — TODO confirm.
if not os.path.isdir("data"):
    os.chdir('..')

In [3]:
ls

Readme.md         [30m[43mdata[m[m/             [34mmlruns[m[m/           requirements.txt
[30m[43mapp[m[m/              image.png         [30m[43mnotebooks[m[m/


In [5]:
client = mlflow.tracking.MlflowClient()

# Keep only the experiments whose name starts with "report_summary".
experiments = [
    exp for exp in client.search_experiments() if exp.name.startswith("report_summary")
]

# Always bind exp_names (possibly empty) so that later cells see an empty list
# instead of an undefined name when nothing matches.
exp_names = [exp.name for exp in experiments]

if not exp_names:
    print("No hay experimentos disponibles.")
else:
    print("Experimentos disponibles:")
    for position, exp_name in enumerate(exp_names, start=1):
        print(f"{position}. {exp_name}")

Experimentos disponibles:
1. report_summary_slms_vs_gpt4_1_reference


In [21]:
# Fetch the runs of the first listed experiment, most recently started first.
experiment = client.get_experiment_by_name(exp_names[0])
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["start_time DESC"],
    max_results=2000,  # this call returns at most this many runs
)

if not runs:
    print("No hay ejecuciones registradas.")
else:
    print(f"Se encontraron {len(runs)} ejecuciones registradas.")

# Column order of the per-run table. Passing it explicitly keeps the schema
# stable even when no runs were found (empty frame with the same columns).
run_columns = [
    "video_id",
    "channel_name",
    "prompt_version",
    "model",
    "criterial_score",
    "embedding_cosine_distance",
    "score",
]

# Build one record per run. Only params and metrics are read, so no per-run
# artifact listing is requested. A param or metric that is absent on a run
# yields None for that field.
data = []
for run in runs:
    params = run.data.params
    metrics = run.data.metrics
    data.append(
        {
            "video_id": params.get("video_id"),
            "channel_name": params.get("channel_name"),
            "prompt_version": params.get("prompt_version"),
            "model": params.get("llm_model"),
            # Evaluation metrics
            "criterial_score": metrics.get("criterial_score"),
            "embedding_cosine_distance": metrics.get("embedding_cosine_distance"),
            "score": metrics.get("score"),
        }
    )

# Single DataFrame holding every collected run.
df = pd.DataFrame(data, columns=run_columns)


Se encontraron 1108 ejecuciones registradas.


In [22]:
# Summarize the collected runs table: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108 entries, 0 to 1107
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   video_id                   1106 non-null   object 
 1   channel_name               1106 non-null   object 
 2   prompt_version             1106 non-null   object 
 3   model                      1106 non-null   object 
 4   criterial_score            1106 non-null   float64
 5   embedding_cosine_distance  1106 non-null   float64
 6   score                      1106 non-null   float64
dtypes: float64(3), object(4)
memory usage: 60.7+ KB


In [23]:
# Read the semicolon-delimited summaries CSV (path is relative to the current
# working directory) and print its column summary.
test_dataset_path = 'data/slm_summaries/test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert.csv'
df_test = pd.read_csv(
    test_dataset_path,
    delimiter=";",  # alias of `sep`
)

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   video_id      221 non-null    object
 1   channel_name  221 non-null    object
 2   prompt        221 non-null    object
 3   text          221 non-null    object
 4   summary       221 non-null    object
 5   slm_prompt    221 non-null    object
 6   slm_summary   221 non-null    object
dtypes: object(7)
memory usage: 12.2+ KB


In [24]:
import tiktoken


# cl100k_base is used as an approximation of the Llama 3.2 tokenizer, so the
# counts below are estimates rather than exact Llama token counts.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Count the tokens of the 'text' column and store them as 'slm_tokens'.
# disallowed_special=() makes substrings that look like special tokens
# (e.g. "<|endoftext|>") encode as ordinary text instead of raising ValueError.
df_test["slm_tokens"] = df_test["text"].apply(
    lambda x: len(tokenizer.encode(x, disallowed_special=()))
)

# Keep only the join keys plus the token count for the merge with the runs table.
sel_columns = ['video_id', 'channel_name', 'slm_tokens']
df_test_filter = df_test[sel_columns]

In [25]:
# Attach the per-video token count to every run row. The inner join keeps only
# the runs whose (channel_name, video_id) pair is present in df_test_filter.
join_keys = ["channel_name", "video_id"]
df_joined = df.merge(
    df_test_filter,
    how="inner",
    on=join_keys,
    suffixes=('', '_test'),
)
df_joined.info()
df_joined.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106 entries, 0 to 1105
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   video_id                   1106 non-null   object 
 1   channel_name               1106 non-null   object 
 2   prompt_version             1106 non-null   object 
 3   model                      1106 non-null   object 
 4   criterial_score            1106 non-null   float64
 5   embedding_cosine_distance  1106 non-null   float64
 6   score                      1106 non-null   float64
 7   slm_tokens                 1106 non-null   int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 69.3+ KB


Unnamed: 0,video_id,channel_name,prompt_version,model,criterial_score,embedding_cosine_distance,score,slm_tokens
0,QEzWdecJPKM,USACRYPTONOTICIAS,v3_summary_expert,gpt_4o_mini,1.0,0.092966,9.0,649
1,oi9z9YkeUZ8,USACRYPTONOTICIAS,v3_summary_expert,gpt_4o_mini,1.0,0.09171,9.0,4950
2,F4-oXv3oB9w,USACRYPTONOTICIAS,v3_summary_expert,gpt_4o_mini,1.0,0.088294,8.0,21803
3,84zFrrHaBCw,USACRYPTONOTICIAS,v3_summary_expert,gpt_4o_mini,1.0,0.096579,9.0,2049
4,kE4PHBzjK9w,USACRYPTONOTICIAS,v3_summary_expert,gpt_4o_mini,1.0,0.095885,9.0,1437


In [27]:
# Per (model, prompt_version) pair, compute the count and the mean of every
# remaining column of the runs table; the identifier columns are dropped first.
metrics_only = df.drop(columns=["video_id", "channel_name"])
by_model_and_prompt = metrics_only.groupby(["model", "prompt_version"])
df_grouped_1 = by_model_and_prompt.agg(['count', 'mean']).reset_index()

# Lift every pandas display limit for this output so nothing is truncated.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
):
    display(df_grouped_1)

Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,count,mean,count,mean
0,gpt_4o_mini,v3_summary_expert,221,0.60181,221,0.100755,221,8.479638
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,222,0.130631,222,0.232396,222,3.216216
2,llama3_2_3b_instruct_fp16,v2_summary_expert_one_shot,221,0.0181,221,0.30976,221,1.642534
3,llama3_2_3b_instruct_fp16,v3_summary_expert,221,0.027149,221,0.4282,221,2.099548
4,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,221,0.140271,221,0.224712,221,3.330317


In [33]:
# Token threshold used to split the rows (original note: block_size=8192 or
# max_seq_length: 4096). The name `contex` is read by a later cell.
contex = 4096

# Mean of every metric per (model, prompt_version), restricted to the rows
# whose slm_tokens value is at most `contex`.
within_threshold = df_joined.loc[df_joined["slm_tokens"] <= contex]
metrics_within = within_threshold.drop(columns=["video_id", "channel_name", "slm_tokens"])
df_grouped_1 = (
    metrics_within
    .groupby(["model", "prompt_version"])
    .agg(['mean'])
    .reset_index()
)

# Lift every pandas display limit for this output so nothing is truncated.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
):
    display(df_grouped_1)

Unnamed: 0_level_0,model,prompt_version,criterial_score,embedding_cosine_distance,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean
0,gpt_4o_mini,v3_summary_expert,0.685484,0.08698,8.733871
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,0.208,0.198556,4.0
2,llama3_2_3b_instruct_fp16,v2_summary_expert_one_shot,0.032258,0.28216,1.967742
3,llama3_2_3b_instruct_fp16,v3_summary_expert,0.048387,0.384256,2.701613
4,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,0.193548,0.194879,4.209677


In [None]:
# Mean of every metric per (model, prompt_version), restricted to the rows
# whose slm_tokens value is strictly greater than `contex` (set in an earlier cell).
df_grouped_1 = (
    df_joined
    .loc[lambda frame: frame["slm_tokens"] > contex]
    .drop(columns=["video_id", "channel_name", "slm_tokens"])
    .groupby(["model", "prompt_version"])
    .agg(['mean'])
    .reset_index()
)

# Lift every pandas display limit for this output so nothing is truncated.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
):
    display(df_grouped_1)

Unnamed: 0_level_0,model,prompt_version,criterial_score,embedding_cosine_distance,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean
0,gpt_4o_mini,v3_summary_expert,0.494845,0.118363,8.154639
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,0.030928,0.276004,2.206186
2,llama3_2_3b_instruct_fp16,v2_summary_expert_one_shot,0.0,0.345043,1.226804
3,llama3_2_3b_instruct_fp16,v3_summary_expert,0.0,0.484376,1.329897
4,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,0.072165,0.262849,2.206186
