In [9]:
import os
import time
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import ollama

# Function to select the best tokenizer based on the chunk size
def select_best_tokenizer(chunk_size):
    if chunk_size == 128:
        return AutoTokenizer.from_pretrained("bert-base-uncased"), AutoModel.from_pretrained("bert-base-uncased")
    elif chunk_size == 512:
        return AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2"), AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")
    else:
        # Using Longformer for larger chunks (1024)
        return AutoTokenizer.from_pretrained("allenai/longformer-base-4096"), AutoModel.from_pretrained("allenai/longformer-base-4096")

def split_text_into_chunks(text, tokenizer, max_length):
    """
    Split text into chunks of max_length tokens, recursively splitting chunks 
    that are still too long.
    """
    inputs = tokenizer(text, return_tensors='np', truncation=False, add_special_tokens=False)
    input_ids = inputs['input_ids'][0]

    # If the text fits within the max_length, return it as a single chunk
    if len(input_ids) <= max_length:
        return [text]

    chunks = []
    for i in range(0, len(input_ids), max_length):
        chunk_ids = input_ids[i:i + max_length]
        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
        
        # Check if the chunk is still too long
        if len(tokenizer(chunk_text)['input_ids']) > max_length:
            # Recursively split the chunk
            sub_chunks = split_text_into_chunks(chunk_text, tokenizer, max_length)
            chunks.extend(sub_chunks)
        else:
            chunks.append(chunk_text)
    
    return chunks

def compute_embeddings(texts, tokenizer, model, max_length):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        with torch.no_grad():
            embedding = model(**inputs).last_hidden_state.mean(dim=1)
            embeddings.append(embedding)
    return torch.mean(torch.stack(embeddings), dim=0)

def tag_text(text, model_name):
    response = ollama.chat(
        model=model_name,
        messages=[{
            "role": "user",
            "content": f"""
            You are a professional tagger. Your task is to analyze a given text and return highly relevant tags to the main topics and themes of the text. 

            Guidelines:
            1. Only provide the tags, nothing else.
            2. Each tag must be a single word, not a phrase.
            3. Separate the tags with commas, without spaces or additional formatting.

            Example Input:
            "Artificial intelligence is transforming industries like healthcare, finance, and transportation."

            Example Output:
            ai,technology,automation

            Now, generate tags for the following text:
            "{text}"
            """
        }]
    )
    if isinstance(response, dict):
        tags = response.get('message', {}).get('content', "error,generating,tags").split(',')
    else:
        tags = ["error", "generating", "tags"]
    return [tag.strip() for tag in tags]

def save_tags_to_csv(output_path, tags):
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['tag_name', 'cosine_similarity'])
        for tag, similarity in tags:
            csv_writer.writerow([tag, f"{similarity:.4f}"])

def process_file(file_path, model_name, output_folder, tokenizer, model, chunk_size, iterations):
    """Process the file, generate tags, and save the best tags."""
    with open(file_path, 'r', encoding='utf-8') as f:
        original_text = f.read()

    # Split the text into chunks
    chunks = split_text_into_chunks(original_text, tokenizer, max_length=chunk_size)

    # Compute the embeddings of the entire text (average embedding across all chunks)
    text_embedding = compute_embeddings(chunks, tokenizer, model, max_length=chunk_size)

    chunk_folder = os.path.join(output_folder, f"chunk_{chunk_size}")
    os.makedirs(chunk_folder, exist_ok=True)

    all_tags_per_iteration = []

    for i in range(iterations):
        tags = []
        tag_similarities = {}

        for chunk in chunks:
            chunk_tags = tag_text(chunk, model_name)
            tags.extend(chunk_tags)

        # Remove duplicates and calculate cosine similarity
        for tag in set(tags):
            tag_embedding = compute_embeddings([tag], tokenizer, model, max_length=chunk_size)
            similarity = cosine_similarity(
                text_embedding.cpu().numpy(),
                tag_embedding.cpu().numpy()
            )[0][0]  # Scalar similarity value

            if tag not in tag_similarities or similarity > tag_similarities[tag]:
                tag_similarities[tag] = similarity

        # Save the tags for this iteration
        sorted_tags = sorted(tag_similarities.items(), key=lambda x: x[1], reverse=True)
        all_tags_per_iteration.append(sorted_tags)

        output_file = os.path.join(
            chunk_folder,
            f"{os.path.splitext(os.path.basename(file_path))[0]}_{i+1}.csv"
        )
        save_tags_to_csv(output_file, sorted_tags)

    # Save the "best tags" file as a copy of the first iteration
    best_tags_file = os.path.join(
        chunk_folder,
        f"{os.path.splitext(os.path.basename(file_path))[0]}_best_tags.csv"
    )
    os.rename(output_file, best_tags_file)

def initialize_environment(chunk_size):
    tokenizer, model = select_best_tokenizer(chunk_size)
    return tokenizer, model

def read_file_paths(input_file):
    with open(input_file, 'r') as f:
        return [line.strip().replace('.mp3', '.txt') for line in f]

def create_output_folder(model_name):
    safe_model_name = model_name.replace(":", "_").replace("<", "_").replace(">", "_")
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    folder_name = f"{safe_model_name}_{timestamp}"
    output_folder = os.path.join("voiceapp", "output", "tagging_folder", folder_name)
    os.makedirs(output_folder, exist_ok=True)
    return output_folder

def main():
    input_file = 'D:\\Ai\\Audio-Classifier\\voiceapp\\lista-1.txt'
    models_list = [        
        # 'mistral:7b',
        'mistral-small',
        # 'llama3.2:3b',
        # 'qwen2:7b',
        # 'yi',
        #'glm4:9b',
        # 'qwen2.5:7b',
        # 'qwen2.5:72b'
    ]

    file_paths = read_file_paths(input_file)
    for model_name in tqdm(models_list, desc="Processing models", unit="model"):
        output_folder = create_output_folder(model_name)
        for chunk_size in [128, 512, 1024]:
            tokenizer, model = initialize_environment(chunk_size)
            for file_path in tqdm(file_paths, desc=f"Model {model_name} | Chunk {chunk_size}", unit="file"):
                if os.path.exists(file_path):
                    process_file(file_path, model_name, output_folder, tokenizer, model, chunk_size, iterations=5)
                else:
                    print(f"File not found: {file_path}")

if __name__ == "__main__":
    main()


Processing models:   0%|                                                | 0/1 [00:00<?, ?model/s]
Model mistral-small | Chunk 128:   0%|                                  | 0/97 [00:00<?, ?file/s][AToken indices sequence length is longer than the specified maximum sequence length for this model (2888 > 512). Running this sequence through the model will result in indexing errors

Model mistral-small | Chunk 128:   1%|▏                       | 1/97 [00:46<1:14:05, 46.31s/file][A
Model mistral-small | Chunk 128:   2%|▍                       | 2/97 [01:27<1:08:17, 43.13s/file][A
Model mistral-small | Chunk 128:   3%|▋                       | 3/97 [02:14<1:10:29, 44.99s/file][A
Model mistral-small | Chunk 128:   4%|▉                       | 4/97 [02:59<1:09:53, 45.10s/file][A
Model mistral-small | Chunk 128:   5%|█▏                      | 5/97 [03:54<1:14:37, 48.67s/file][A
Model mistral-small | Chunk 128:   6%|█▍                      | 6/97 [04:47<1:16:06, 50.18s/file][A
Model mistra

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - dĹ‚ugi 1.txt



Model mistral-small | Chunk 128:  60%|██████████████▉          | 58/97 [51:33<12:36, 19.39s/file][A
Model mistral-small | Chunk 128:  61%|███████████████▏         | 59/97 [51:53<12:17, 19.40s/file][A
Model mistral-small | Chunk 128:  62%|███████████████▍         | 60/97 [52:08<11:22, 18.44s/file][A
Model mistral-small | Chunk 128:  63%|███████████████▋         | 61/97 [52:22<10:13, 17.03s/file][A
Model mistral-small | Chunk 128:  64%|███████████████▉         | 62/97 [52:40<10:11, 17.46s/file][A
Model mistral-small | Chunk 128:  65%|████████████████▏        | 63/97 [52:55<09:29, 16.74s/file][A
Model mistral-small | Chunk 128:  66%|████████████████▍        | 64/97 [53:25<11:19, 20.59s/file][A
Model mistral-small | Chunk 128:  67%|████████████████▊        | 65/97 [53:42<10:27, 19.60s/file][A
Model mistral-small | Chunk 128:  68%|█████████████████        | 66/97 [53:57<09:22, 18.14s/file][A
Model mistral-small | Chunk 128:  69%|█████████████████▎       | 67/97 [54:15<08:59, 17.99

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - los klasy Ĺ›redniej 1.txt



Model mistral-small | Chunk 128:  75%|██████████████████▊      | 73/97 [55:48<05:29, 13.74s/file][A

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - mieszkania dla mĹ‚odych 1.txt



Model mistral-small | Chunk 128:  77%|███████████████████▎     | 75/97 [56:08<04:32, 12.36s/file][A
Model mistral-small | Chunk 128:  78%|███████████████████▌     | 76/97 [56:25<04:42, 13.44s/file][A
Model mistral-small | Chunk 128:  79%|███████████████████▊     | 77/97 [56:42<04:44, 14.23s/file][A
Model mistral-small | Chunk 128:  80%|████████████████████     | 78/97 [56:59<04:44, 14.99s/file][A
Model mistral-small | Chunk 128:  81%|████████████████████▎    | 79/97 [57:18<04:49, 16.07s/file][A
Model mistral-small | Chunk 128:  82%|████████████████████▌    | 80/97 [57:33<04:25, 15.64s/file][A
Model mistral-small | Chunk 128:  84%|████████████████████▉    | 81/97 [57:52<04:28, 16.76s/file][A
Model mistral-small | Chunk 128:  85%|█████████████████████▏   | 82/97 [58:10<04:15, 17.00s/file][A
Model mistral-small | Chunk 128:  86%|█████████████████████▍   | 83/97 [58:25<03:51, 16.55s/file][A
Model mistral-small | Chunk 128:  87%|█████████████████████▋   | 84/97 [58:49<04:02, 18.63

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - upadek bankĂłw 1.txt



Model mistral-small | Chunk 128:  91%|██████████████████████▋  | 88/97 [59:45<02:11, 14.62s/file][A
Model mistral-small | Chunk 128:  92%|█████████████████████  | 89/97 [1:00:05<02:06, 15.85s/file][A
Model mistral-small | Chunk 128:  93%|█████████████████████▎ | 90/97 [1:00:16<01:42, 14.70s/file][A
Model mistral-small | Chunk 128:  94%|█████████████████████▌ | 91/97 [1:00:32<01:30, 15.14s/file][A

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel cz1.txt
File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel po zimie 1.txt



Model mistral-small | Chunk 128:  97%|██████████████████████▎| 94/97 [1:00:59<00:35, 11.75s/file][A
Model mistral-small | Chunk 128:  98%|██████████████████████▌| 95/97 [1:01:16<00:25, 12.75s/file][A
Model mistral-small | Chunk 128: 100%|███████████████████████| 97/97 [1:01:54<00:00, 38.29s/file][A


File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - Ĺ‚apĂłwki 1.txt



Model mistral-small | Chunk 512:   0%|                                  | 0/97 [00:00<?, ?file/s][AToken indices sequence length is longer than the specified maximum sequence length for this model (2888 > 512). Running this sequence through the model will result in indexing errors

Model mistral-small | Chunk 512:   1%|▎                         | 1/97 [00:25<40:06, 25.06s/file][A
Model mistral-small | Chunk 512:   2%|▌                         | 2/97 [00:45<35:05, 22.16s/file][A
Model mistral-small | Chunk 512:   3%|▊                         | 3/97 [01:08<35:51, 22.89s/file][A
Model mistral-small | Chunk 512:   4%|█                         | 4/97 [01:31<35:16, 22.76s/file][A
Model mistral-small | Chunk 512:   5%|█▎                        | 5/97 [01:59<37:54, 24.72s/file][A
Model mistral-small | Chunk 512:   6%|█▌                        | 6/97 [02:25<37:49, 24.94s/file][A
Model mistral-small | Chunk 512:   7%|█▉                        | 7/97 [02:53<39:06, 26.07s/file][A
Model mi

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - dĹ‚ugi 1.txt



Model mistral-small | Chunk 512:  60%|██████████████▉          | 58/97 [25:33<05:58,  9.20s/file][A
Model mistral-small | Chunk 512:  61%|███████████████▏         | 59/97 [25:43<06:03,  9.57s/file][A
Model mistral-small | Chunk 512:  62%|███████████████▍         | 60/97 [25:51<05:33,  9.01s/file][A
Model mistral-small | Chunk 512:  63%|███████████████▋         | 61/97 [25:57<04:59,  8.32s/file][A
Model mistral-small | Chunk 512:  64%|███████████████▉         | 62/97 [26:07<05:08,  8.81s/file][A
Model mistral-small | Chunk 512:  65%|████████████████▏        | 63/97 [26:16<04:54,  8.66s/file][A
Model mistral-small | Chunk 512:  66%|████████████████▍        | 64/97 [26:30<05:40, 10.30s/file][A
Model mistral-small | Chunk 512:  67%|████████████████▊        | 65/97 [26:38<05:03,  9.49s/file][A
Model mistral-small | Chunk 512:  68%|█████████████████        | 66/97 [26:45<04:38,  9.00s/file][A
Model mistral-small | Chunk 512:  69%|█████████████████▎       | 67/97 [26:55<04:32,  9.07

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - los klasy Ĺ›redniej 1.txt



Model mistral-small | Chunk 512:  75%|██████████████████▊      | 73/97 [27:41<02:43,  6.81s/file][A

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - mieszkania dla mĹ‚odych 1.txt



Model mistral-small | Chunk 512:  77%|███████████████████▎     | 75/97 [27:51<02:14,  6.11s/file][A
Model mistral-small | Chunk 512:  78%|███████████████████▌     | 76/97 [28:00<02:22,  6.79s/file][A
Model mistral-small | Chunk 512:  79%|███████████████████▊     | 77/97 [28:08<02:21,  7.05s/file][A
Model mistral-small | Chunk 512:  80%|████████████████████     | 78/97 [28:16<02:18,  7.31s/file][A
Model mistral-small | Chunk 512:  81%|████████████████████▎    | 79/97 [28:25<02:18,  7.72s/file][A
Model mistral-small | Chunk 512:  82%|████████████████████▌    | 80/97 [28:32<02:07,  7.48s/file][A
Model mistral-small | Chunk 512:  84%|████████████████████▉    | 81/97 [28:42<02:09,  8.07s/file][A
Model mistral-small | Chunk 512:  85%|█████████████████████▏   | 82/97 [28:51<02:08,  8.59s/file][A
Model mistral-small | Chunk 512:  86%|█████████████████████▍   | 83/97 [29:01<02:02,  8.78s/file][A
Model mistral-small | Chunk 512:  87%|█████████████████████▋   | 84/97 [29:12<02:02,  9.46

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - upadek bankĂłw 1.txt



Model mistral-small | Chunk 512:  91%|██████████████████████▋  | 88/97 [29:43<01:11,  7.90s/file][A
Model mistral-small | Chunk 512:  92%|██████████████████████▉  | 89/97 [29:53<01:07,  8.48s/file][A
Model mistral-small | Chunk 512:  93%|███████████████████████▏ | 90/97 [30:00<00:56,  8.06s/file][A
Model mistral-small | Chunk 512:  94%|███████████████████████▍ | 91/97 [30:09<00:49,  8.18s/file][A

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel cz1.txt
File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel po zimie 1.txt



Model mistral-small | Chunk 512:  97%|████████████████████████▏| 94/97 [30:23<00:19,  6.39s/file][A
Model mistral-small | Chunk 512:  98%|████████████████████████▍| 95/97 [30:32<00:13,  6.83s/file][A
Model mistral-small | Chunk 512: 100%|█████████████████████████| 97/97 [30:49<00:00, 19.07s/file][A


File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - Ĺ‚apĂłwki 1.txt



Model mistral-small | Chunk 1024:   0%|                                 | 0/97 [00:00<?, ?file/s][A
Model mistral-small | Chunk 1024:   1%|▏                      | 1/97 [01:14<1:59:23, 74.62s/file][A
Model mistral-small | Chunk 1024:   2%|▍                      | 2/97 [02:19<1:49:09, 68.94s/file][A
Model mistral-small | Chunk 1024:   3%|▋                      | 3/97 [03:42<1:57:53, 75.25s/file][A
Model mistral-small | Chunk 1024:   4%|▉                      | 4/97 [04:46<1:49:37, 70.73s/file][A
Model mistral-small | Chunk 1024:   5%|█▏                     | 5/97 [06:05<1:52:58, 73.68s/file][A
Model mistral-small | Chunk 1024:   6%|█▍                     | 6/97 [07:09<1:46:56, 70.51s/file][A
Model mistral-small | Chunk 1024:   7%|█▋                     | 7/97 [08:39<1:55:14, 76.83s/file][A
Model mistral-small | Chunk 1024:   8%|█▉                     | 8/97 [09:57<1:54:26, 77.15s/file][A
Model mistral-small | Chunk 1024:   9%|██▏                    | 9/97 [11:18<1:55:16, 78.60

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - dĹ‚ugi 1.txt



Model mistral-small | Chunk 1024:  60%|█████████████▏        | 58/97 [1:09:34<19:34, 30.13s/file][A
Model mistral-small | Chunk 1024:  61%|█████████████▍        | 59/97 [1:10:08<19:45, 31.18s/file][A
Model mistral-small | Chunk 1024:  62%|█████████████▌        | 60/97 [1:10:32<18:01, 29.22s/file][A
Model mistral-small | Chunk 1024:  63%|█████████████▊        | 61/97 [1:10:51<15:52, 26.46s/file][A
Model mistral-small | Chunk 1024:  64%|██████████████        | 62/97 [1:11:26<16:48, 28.82s/file][A
Model mistral-small | Chunk 1024:  65%|██████████████▎       | 63/97 [1:11:52<15:52, 28.01s/file][A
Model mistral-small | Chunk 1024:  66%|██████████████▌       | 64/97 [1:12:36<17:53, 32.53s/file][A
Model mistral-small | Chunk 1024:  67%|██████████████▋       | 65/97 [1:13:01<16:16, 30.51s/file][A
Model mistral-small | Chunk 1024:  68%|██████████████▉       | 66/97 [1:13:26<14:55, 28.88s/file][A
Model mistral-small | Chunk 1024:  69%|███████████████▏      | 67/97 [1:14:03<15:32, 31.09

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - los klasy Ĺ›redniej 1.txt



Model mistral-small | Chunk 1024:  75%|████████████████▌     | 73/97 [1:16:35<09:09, 22.89s/file][A

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - mieszkania dla mĹ‚odych 1.txt



Model mistral-small | Chunk 1024:  77%|█████████████████     | 75/97 [1:17:07<07:23, 20.17s/file][A
Model mistral-small | Chunk 1024:  78%|█████████████████▏    | 76/97 [1:17:35<07:38, 21.84s/file][A
Model mistral-small | Chunk 1024:  79%|█████████████████▍    | 77/97 [1:17:57<07:19, 21.95s/file][A
Model mistral-small | Chunk 1024:  80%|█████████████████▋    | 78/97 [1:18:34<08:09, 25.77s/file][A
Model mistral-small | Chunk 1024:  81%|█████████████████▉    | 79/97 [1:19:01<07:47, 25.99s/file][A
Model mistral-small | Chunk 1024:  82%|██████████████████▏   | 80/97 [1:19:17<06:34, 23.21s/file][A
Model mistral-small | Chunk 1024:  84%|██████████████████▎   | 81/97 [1:20:00<07:39, 28.74s/file][A
Model mistral-small | Chunk 1024:  85%|██████████████████▌   | 82/97 [1:20:34<07:36, 30.45s/file][A
Model mistral-small | Chunk 1024:  86%|██████████████████▊   | 83/97 [1:21:01<06:52, 29.43s/file][A
Model mistral-small | Chunk 1024:  87%|███████████████████   | 84/97 [1:21:38<06:49, 31.52

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - upadek bankĂłw 1.txt



Model mistral-small | Chunk 1024:  91%|███████████████████▉  | 88/97 [1:23:30<04:11, 27.98s/file][A
Model mistral-small | Chunk 1024:  92%|████████████████████▏ | 89/97 [1:24:09<04:05, 30.63s/file][A
Model mistral-small | Chunk 1024:  93%|████████████████████▍ | 90/97 [1:24:35<03:25, 29.41s/file][A
Model mistral-small | Chunk 1024:  94%|████████████████████▋ | 91/97 [1:25:11<03:08, 31.43s/file][A

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel cz1.txt
File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel po zimie 1.txt



Model mistral-small | Chunk 1024:  97%|█████████████████████▎| 94/97 [1:26:05<01:12, 24.13s/file][A
Model mistral-small | Chunk 1024:  98%|█████████████████████▌| 95/97 [1:26:38<00:51, 25.92s/file][A
Model mistral-small | Chunk 1024: 100%|██████████████████████| 97/97 [1:27:34<00:00, 54.17s/file][A
Processing models: 100%|███████████████████████████████████| 1/1 [3:00:20<00:00, 10820.60s/model]

File not found: C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - Ĺ‚apĂłwki 1.txt





In [4]:
def main():
    # Initialize variables
    input_file = 'D:\\Ai\\Audio-Classifier\\voiceapp\\lista-1.txt'
    models_list = [        
        # 'mistral:7b',
        # 'mistral-small',
        #  'llama3.2:3b',
        # 'qwen2:7b',
        # 'yi',
        'glm4:9b',
        # 'qwen2.5:7b',
        # 'qwen2.5:72b'
    ]
    output_folders, tokenizer, model = initialize_environment(models_list)

    # Read file paths
    file_paths = read_file_paths(input_file)

    # Initialize summary tracking
    summary_data = []

    # Process each model sequentially
    for model_name in models_list:
        print(f"\nProcessing all files for model: {model_name}")
        pbar = tqdm(total=len(file_paths), desc=f"Processing {model_name}", unit="file")
        
        for file_path in file_paths:
            output_folder = output_folders[model_name]
            best_tags_file = process_file(file_path, model_name, output_folder, tokenizer, model, iterations=5)
            summary_data.append({
                "Original File Path": file_path,
                "Model": model_name,
                "Tags File Path": best_tags_file
            })

            pbar.update(1)

        pbar.close()

    # Generate summary CSV
    summary_csv_path = 'output_summary.csv'
    generate_summary_csv(summary_csv_path, summary_data)


if __name__ == "__main__":
    main()

ValueError: not enough values to unpack (expected 3, got 2)

In [18]:
def main():
    reference_file = "reference.txt"  # Optional reference file for metrics
    
    # Initialize tokenizer and model
    tokenizer, model = initialize_environment(["sentence-transformers/all-distilroberta-v1"])
    
    # Process each file
    tag_files = []
    for file_path in input_files:
        best_tags_file, _ = process_file_with_metrics(file_path, "model_name", output_folder, tokenizer, model, reference_file=reference_file)
        tag_files.append(best_tags_file)
    
    # Generate summary CSV
    generate_summary_csv(output_folder, input_files, tag_files)

if __name__ == "__main__":
    main()


ValueError: too many values to unpack (expected 2)