In [48]:
import mlflow
import os
import pandas as pd

In [2]:
# Move up to the project root so that relative paths such as "data/..." resolve.
# The guard keeps the cell idempotent: once "data/" is visible from the current
# directory, re-running the cell no longer climbs one more level.
if not os.path.isdir("data"):
    os.chdir("..")

In [3]:
# Portable directory listing used to confirm the working directory.
# A bare `ls` relies on IPython automagic, which only applies to single-line
# cells and breaks as soon as a variable named `ls` exists.
# Directories get a trailing "/" and dot-files are skipped, like the shell listing.
sorted(
    entry.name + ("/" if entry.is_dir() else "")
    for entry in os.scandir(".")
    if not entry.name.startswith(".")
)

Readme.md         image-2.png       image-6.png       requirements.txt
app/              image-3.png       image.png
data/             image-4.png       mlruns/
image-1.png       image-5.png       notebooks/


In [68]:
client = mlflow.tracking.MlflowClient()

# Keep only the experiments whose name starts with "report_summary".
experiments = [
    exp for exp in client.search_experiments() if exp.name.startswith("report_summary")
]

# Always defined (possibly empty), so the name exists even when nothing matches.
exp_names = [exp.name for exp in experiments]

if not exp_names:
    print("No hay experimentos disponibles.")
else:
    print("Experimentos disponibles:")
    # 1-based numbering in the printed list.
    for i, exp_name in enumerate(exp_names):
        print(f"{i + 1}. {exp_name}")

Experimentos disponibles:
1. report_summary_slms_vs_gpt4_1_reference


In [69]:
# Work on the first experiment found by the discovery cell.
experiment = client.get_experiment_by_name(exp_names[0])
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["start_time DESC"],  # newest runs first
    max_results=2500,
)

if not runs:
    print("No hay ejecuciones registradas.")
else:
    print(f"Se encontraron {len(runs)} ejecuciones registradas.")

# Column order of the resulting frame: run parameters first, then evaluation metrics.
run_columns = [
    "video_id",
    "channel_name",
    "prompt_version",
    "model",
    "criterial_score",
    "embedding_cosine_distance",
    "score",
]

# One record per run. Params/metrics a run did not log come back as None.
# The per-run `client.list_artifacts` call was removed: its result was never
# used and it cost one extra tracking-store request for every run.
data = []
for run in runs:
    params = run.data.params
    metrics = run.data.metrics
    dict_metrics = {
        "video_id": params.get("video_id"),
        "channel_name": params.get("channel_name"),
        "prompt_version": params.get("prompt_version"),
        "model": params.get("llm_model"),
        # Evaluation metrics
        "criterial_score": metrics.get("criterial_score"),
        "embedding_cosine_distance": metrics.get("embedding_cosine_distance"),
        "score": metrics.get("score"),
    }
    data.append(dict_metrics)

# Built unconditionally so `df` exists (empty, with the expected columns)
# even when the experiment has no runs.
df = pd.DataFrame(data, columns=run_columns)


Se encontraron 2436 ejecuciones registradas.


In [70]:
# Print dtypes and non-null counts of the runs frame; a non-null count below the
# row count reveals runs that are missing a param or metric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2436 entries, 0 to 2435
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   video_id                   2432 non-null   object 
 1   channel_name               2432 non-null   object 
 2   prompt_version             2432 non-null   object 
 3   model                      2432 non-null   object 
 4   criterial_score            2432 non-null   float64
 5   embedding_cosine_distance  2432 non-null   float64
 6   score                      2432 non-null   float64
dtypes: float64(3), object(4)
memory usage: 133.3+ KB


In [71]:
# Load the test dataset from a semicolon-delimited CSV file, then print
# its column summary (dtypes and non-null counts).
test_dataset_path = 'data/slm_summaries/test_slm_llama3_2_3b_instruct_fp16_v3_summary_expert.csv'
df_test = pd.read_csv(
    test_dataset_path,
    delimiter=";",
)

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   video_id      221 non-null    object
 1   channel_name  221 non-null    object
 2   prompt        221 non-null    object
 3   text          221 non-null    object
 4   summary       221 non-null    object
 5   slm_prompt    221 non-null    object
 6   slm_summary   221 non-null    object
dtypes: object(7)
memory usage: 12.2+ KB


In [72]:
import tiktoken

# Encoder used as a stand-in for the llama-3.2 tokenizer
# ('cl100k_base' is only an approximation).
tokenizer = tiktoken.get_encoding("cl100k_base")


def _count_tokens(text):
    """Return the number of tokens `tokenizer` produces for `text`."""
    return len(tokenizer.encode(text))


# NOTE: despite its name, "slm_tokens" is computed from the "text" column,
# not from "slm_summary".
df_test["slm_tokens"] = df_test["text"].apply(_count_tokens)

# Keep only the join keys plus the token count.
sel_columns = ["video_id", "channel_name", "slm_tokens"]
df_test_filter = df_test[sel_columns]

In [73]:
# Attach the token count to every run by matching on (channel_name, video_id).
# The inner join drops runs without a matching row in df_test_filter.
df_joined = df.merge(
    df_test_filter,
    how="inner",
    on=["channel_name", "video_id"],
    suffixes=("", "_test"),
)
# Distribution of token counts across the joined runs.
df_joined["slm_tokens"].describe()

count     2432.000000
mean      7410.210526
std       7621.480315
min         38.000000
25%       2381.000000
50%       3661.000000
75%      11824.000000
max      31540.000000
Name: slm_tokens, dtype: float64

In [74]:
# Mean and standard deviation of each metric per (model, prompt_version),
# computed over all runs (no token-count filter).
metrics_by_run = df.drop(columns=["video_id", "channel_name"])
df_grouped_1 = (
    metrics_by_run.groupby(["model", "prompt_version"])
    .agg(["mean", "std"])
    .reset_index()
)

# Sort by mean score, highest first.
df_grouped_1 = df_grouped_1.sort_values(by=("score", "mean"), ascending=False)

# Show the whole frame without truncating rows, columns or cell contents.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
):
    display(df_grouped_1)

Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std
0,gpt_4o_mini,v3_summary_expert,0.60181,0.490636,0.100755,0.031121,8.479638,0.951454
10,unsloth_Meta_Llama_3_1_8B_Instruct_bnb_4bit_gguf_Q8_0,v3_summary_expert,0.361991,0.481667,0.166664,0.06355,5.674208,2.272907
9,phi4_latest,v3_summary_expert,0.171946,0.37819,0.197469,0.074781,4.506787,2.313242
2,llama3_1_8b_instruct_fp16,v1_summary_expert_one_shot,0.081448,0.274143,0.202654,0.073806,4.325792,2.063253
8,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,0.140271,0.348057,0.224712,0.101414,3.330317,2.057013
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,0.130631,0.337758,0.232396,0.111087,3.216216,2.08379
4,llama3_2_3b_instruct_fp16,v1_summary_expert_one_shot,0.027149,0.162887,0.225847,0.062263,3.208145,1.615866
7,llama_3_2_3B_Instruct_finetuned_bnb_nf4_dq_gguf_bf16_gguf_BF16,v3_summary_expert,0.090498,0.287545,0.246029,0.14478,3.076923,2.057903
3,llama3_1_8b_instruct_fp16,v3_summary_expert,0.049774,0.217971,0.38336,0.265185,2.873303,1.952216
6,llama3_2_3b_instruct_fp16,v3_summary_expert,0.027149,0.162887,0.4282,0.216365,2.099548,1.670343


In [75]:
# Token threshold: the 50% quantile of the text token counts.
# Original note: block_size=8192 or max_seq_length: 4096.
contex = 3661

# The filter below is inclusive (<=), so the message says "menor o igual a".
print(f"Filtrando por Chunk Size menor o igual a {contex}")

# Keep the runs whose text has at most `contex` tokens, then compute mean and
# standard deviation of each metric per (model, prompt_version),
# sorted by mean score, highest first.
df_grouped_1 = (
    df_joined[df_joined["slm_tokens"] <= contex]
    .drop(columns=["video_id", "channel_name", "slm_tokens"])
    .groupby(["model", "prompt_version"])
    .agg(["mean", "std"])
    .reset_index()
    .sort_values(("score", "mean"), ascending=False)
)

# Show the whole frame without truncating rows, columns or cell contents.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None, 'display.max_colwidth', None):
    display(df_grouped_1)

Filtrando por Chunk Size menor a 3661


Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std
0,gpt_4o_mini,v3_summary_expert,0.702703,0.459141,0.084842,0.024657,8.774775,0.59825
10,unsloth_Meta_Llama_3_1_8B_Instruct_bnb_4bit_gguf_Q8_0,v3_summary_expert,0.504505,0.502247,0.13461,0.054517,6.972973,1.904347
9,phi4_latest,v3_summary_expert,0.297297,0.459141,0.162437,0.069192,5.702703,2.399222
2,llama3_1_8b_instruct_fp16,v1_summary_expert_one_shot,0.162162,0.370271,0.176965,0.081065,5.315315,2.211466
8,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,0.216216,0.41353,0.188573,0.103758,4.378378,2.24845
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,0.225225,0.419625,0.183678,0.08969,4.243243,2.37493
4,llama3_2_3b_instruct_fp16,v1_summary_expert_one_shot,0.054054,0.22715,0.203421,0.056505,3.90991,1.871069
7,llama_3_2_3B_Instruct_finetuned_bnb_nf4_dq_gguf_bf16_gguf_BF16,v3_summary_expert,0.144144,0.352829,0.214369,0.13561,3.900901,2.426951
3,llama3_1_8b_instruct_fp16,v3_summary_expert,0.09009,0.287609,0.314948,0.250493,3.756757,2.265216
6,llama3_2_3b_instruct_fp16,v3_summary_expert,0.054054,0.22715,0.371499,0.226478,2.855856,2.021911


In [79]:
# Token threshold. Original note: block_size=8192 or max_seq_length: 4096.
contex = 11824

# The filter below is inclusive (<=), so the message says "menor o igual a".
print(f"Filtrando por Chunk Size menor o igual a {contex}")

# Keep the runs whose text has at most `contex` tokens, then compute mean and
# standard deviation of each metric per (model, prompt_version),
# sorted by mean score, highest first.
df_grouped_1 = (
    df_joined[df_joined["slm_tokens"] <= contex]
    .drop(columns=["video_id", "channel_name", "slm_tokens"])
    .groupby(["model", "prompt_version"])
    .agg(["mean", "std"])
    .reset_index()
    .sort_values(("score", "mean"), ascending=False)
)

# Show the whole frame without truncating rows, columns or cell contents.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None, 'display.max_colwidth', None):
    display(df_grouped_1)

Filtrando por Chunk Size menor a 11824


Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std
0,gpt_4o_mini,v3_summary_expert,0.644578,0.480089,0.091969,0.028167,8.668675,0.664134
10,unsloth_Meta_Llama_3_1_8B_Instruct_bnb_4bit_gguf_Q8_0,v3_summary_expert,0.421687,0.495323,0.152463,0.059531,6.295181,2.11565
9,phi4_latest,v3_summary_expert,0.222892,0.417445,0.182883,0.07219,5.084337,2.29996
2,llama3_1_8b_instruct_fp16,v1_summary_expert_one_shot,0.108434,0.311868,0.19308,0.076589,4.753012,2.118996
8,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,0.156627,0.364548,0.206393,0.099414,3.789157,2.117203
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,0.173653,0.37995,0.214896,0.105946,3.646707,2.203715
4,llama3_2_3b_instruct_fp16,v1_summary_expert_one_shot,0.036145,0.187215,0.216847,0.059995,3.518072,1.722306
7,llama_3_2_3B_Instruct_finetuned_bnb_nf4_dq_gguf_bf16_gguf_BF16,v3_summary_expert,0.114458,0.31933,0.223846,0.126848,3.5,2.173532
3,llama3_1_8b_instruct_fp16,v3_summary_expert,0.066265,0.249497,0.324389,0.240661,3.301205,2.028494
6,llama3_2_3b_instruct_fp16,v3_summary_expert,0.036145,0.187215,0.415536,0.22569,2.385542,1.821297


In [77]:
# Token threshold. Original note: maximum average value over the last few years
# excluding USACRIPTOMONEDAS (presumably a channel name; confirm).
contex = 6000

# The filter below is inclusive (<=), so the message says "menor o igual a".
print(f"Filtrando por Chunk Size menor o igual a {contex}")

# Keep the runs whose text has at most `contex` tokens, then compute mean and
# standard deviation of each metric per (model, prompt_version),
# sorted by mean score, highest first.
df_grouped_1 = (
    df_joined[df_joined["slm_tokens"] <= contex]
    .drop(columns=["video_id", "channel_name", "slm_tokens"])
    .groupby(["model", "prompt_version"])
    .agg(["mean", "std"])
    .reset_index()
    .sort_values(("score", "mean"), ascending=False)
)

# Show the whole frame without truncating rows, columns or cell contents.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None, 'display.max_colwidth', None):
    display(df_grouped_1)

Filtrando por Chunk Size menor a 6000


Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std
0,gpt_4o_mini,v3_summary_expert,0.642857,0.480721,0.088485,0.024992,8.688312,0.661939
10,unsloth_Meta_Llama_3_1_8B_Instruct_bnb_4bit_gguf_Q8_0,v3_summary_expert,0.422078,0.495502,0.148719,0.056783,6.350649,2.119108
9,phi4_latest,v3_summary_expert,0.233766,0.424606,0.177593,0.069875,5.227273,2.302379
2,llama3_1_8b_instruct_fp16,v1_summary_expert_one_shot,0.116883,0.322329,0.188707,0.075984,4.831169,2.153596
8,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,0.162338,0.369963,0.202799,0.099172,3.88961,2.113021
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,0.167742,0.374848,0.209754,0.105069,3.716129,2.232449
4,llama3_2_3b_instruct_fp16,v1_summary_expert_one_shot,0.038961,0.194133,0.214699,0.05921,3.590909,1.758689
7,llama_3_2_3B_Instruct_finetuned_bnb_nf4_dq_gguf_bf16_gguf_BF16,v3_summary_expert,0.123377,0.329942,0.223411,0.129895,3.584416,2.227872
3,llama3_1_8b_instruct_fp16,v3_summary_expert,0.064935,0.247215,0.312951,0.232394,3.396104,2.056136
6,llama3_2_3b_instruct_fp16,v3_summary_expert,0.038961,0.194133,0.404055,0.224903,2.480519,1.854938


In [78]:
# Complement of the threshold filter: runs whose text has MORE than `contex` tokens.
# `contex` is not set here; it keeps the value assigned by the threshold cell that
# ran most recently, so the active value is printed to make that dependency visible.
print(f"Filtrando por Chunk Size mayor a {contex}")

# Mean and standard deviation of each metric per (model, prompt_version),
# sorted by mean score, highest first.
df_grouped_1 = (
    df_joined[df_joined["slm_tokens"] > contex]
    .drop(columns=["video_id", "channel_name", "slm_tokens"])
    .groupby(["model", "prompt_version"])
    .agg(["mean", "std"])
    .reset_index()
    .sort_values(("score", "mean"), ascending=False)
)

# Show the whole frame without truncating rows, columns or cell contents.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None, 'display.max_colwidth', None):
    display(df_grouped_1)

Unnamed: 0_level_0,model,prompt_version,criterial_score,criterial_score,embedding_cosine_distance,embedding_cosine_distance,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std
0,gpt_4o_mini,v3_summary_expert,0.507463,0.503718,0.128956,0.024935,8.0,1.290994
10,unsloth_Meta_Llama_3_1_8B_Instruct_bnb_4bit_gguf_Q8_0,v3_summary_expert,0.223881,0.419989,0.207909,0.059236,4.119403,1.813438
2,llama3_1_8b_instruct_fp16,v1_summary_expert_one_shot,0.0,0.0,0.234713,0.057242,3.164179,1.213522
9,phi4_latest,v3_summary_expert,0.029851,0.17146,0.243156,0.065433,2.850746,1.246252
4,llama3_2_3b_instruct_fp16,v1_summary_expert_one_shot,0.0,0.0,0.25147,0.061958,2.328358,0.636945
1,hf_llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_latest,v3_summary_expert,0.044776,0.208373,0.284777,0.107578,2.059701,0.982893
8,llama_3_2_3b_finetuned_qlora_bnb_nf42_gguf_q8_0_latest,v3_summary_expert,0.089552,0.287694,0.275078,0.088247,2.044776,1.160366
7,llama_3_2_3B_Instruct_finetuned_bnb_nf4_dq_gguf_bf16_gguf_BF16,v3_summary_expert,0.014925,0.122169,0.298016,0.163767,1.910448,0.792605
3,llama3_1_8b_instruct_fp16,v3_summary_expert,0.014925,0.122169,0.545195,0.266566,1.671642,0.894225
6,llama3_2_3b_instruct_fp16,v3_summary_expert,0.0,0.0,0.483698,0.185209,1.223881,0.454636
