In [2]:
import os
import time
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import ollama

# Global variable for iterations
iterations = 5

# Function to select the best tokenizer based on the chunk size
def select_best_tokenizer(chunk_size):
    if chunk_size == 128:
        return AutoTokenizer.from_pretrained("bert-base-uncased"), AutoModel.from_pretrained("bert-base-uncased")
    elif chunk_size == 512:
        return AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2"), AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")
    else:
        return AutoTokenizer.from_pretrained("allenai/longformer-base-4096"), AutoModel.from_pretrained("allenai/longformer-base-4096")

def generate_summary_csv(output_path, summary_data):
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["Original File Path", "Model", "Best Tags File Path"])
        csv_writer.writeheader()
        for entry in summary_data:
            csv_writer.writerow(entry)
    print(f"Summary CSV generated at {output_path}")

def compute_embeddings(texts, tokenizer, model, max_length=1024):
    """Compute embeddings for text chunks and average them."""
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        with torch.no_grad():
            embedding = model(**inputs).last_hidden_state.mean(dim=1)
            embeddings.append(embedding)
    return torch.mean(torch.stack(embeddings), dim=0)

def split_text_into_chunks(text, tokenizer, max_length=1024):
    """Split text into chunks of max_length tokens."""
    inputs = tokenizer(text, return_tensors='np', truncation=False)
    input_ids = inputs['input_ids'][0]

    chunks = [
        tokenizer.decode(input_ids[i:i + max_length], skip_special_tokens=True)
        for i in range(0, len(input_ids), max_length)
    ]
    return chunks

def tag_text(text, model_name):
    response = ollama.chat(
        model=model_name,
        messages=[{
            "role": "user",
            "content": f"""
            You are a professional tagger. Your task is to analyze a given text and return highly relevant tags to the main topics and themes of the text. 

            Guidelines:
            1. Only provide the tags, nothing else.
            2. Each tag must be a single word, not a phrase.
            3. Separate the tags with commas, without spaces or additional formatting.

            Example Input:
            "Artificial intelligence is transforming industries like healthcare, finance, and transportation."

            Example Output:
            ai,technology,automation

            Now, generate tags for the following text:
            "{text}"
            """
        }]
    )
    if isinstance(response, dict):
        tags = response.get('message', {}).get('content', "error,generating,tags").split(',')
    else:
        tags = ["error", "generating", "tags"]
    return [tag.strip() for tag in tags]

def save_tags_to_csv(output_path, tags, cosine_similarities):
    """Zapisuje tagi i ich podobieństwa do pliku CSV."""
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['tag_name', 'cosine_similarity'])
        for tag, similarity in zip(tags, cosine_similarities):
            csv_writer.writerow([tag, f"{similarity:.4f}"])

def get_file_basename(file_path):
    """Zwraca nazwę pliku bez rozszerzenia."""
    return os.path.splitext(os.path.basename(file_path))[0]

def process_file(file_path, model_name, output_folder, tokenizer, model, chunk_size=1024, iterations=5):
    """Procesuje pojedynczy plik z obsługą iteracji i generowaniem plików z poprawnymi nazwami."""
    with open(file_path, 'r', encoding='utf-8') as f:
        original_text = f.read()

    chunks = split_text_into_chunks(original_text, tokenizer, max_length=chunk_size)
    text_embedding = compute_embeddings(chunks, tokenizer, model, max_length=chunk_size)

    all_tags_per_iteration = []
    all_similarities_per_iteration = []

    chunk_folder = os.path.join(output_folder, f"chunk_{chunk_size}")
    os.makedirs(chunk_folder, exist_ok=True)

    base_filename = get_file_basename(file_path)

    with tqdm(total=iterations, desc=f"Processing {base_filename} iterations", leave=False) as iter_bar:
        for i in range(iterations):
            tags = []
            tag_similarities = []

            for chunk in chunks:
                chunk_tags = tag_text(chunk, model_name)
                tags.extend(chunk_tags)

            for tag in set(tags):
                tag_embedding = compute_embeddings([tag], tokenizer, model, max_length=chunk_size)
                similarity = cosine_similarity(
                    text_embedding.cpu().numpy(),
                    tag_embedding.cpu().numpy()
                )[0][0]
                tag_similarities.append((tag, similarity))

            all_tags_per_iteration.append([tag for tag, _ in tag_similarities])
            all_similarities_per_iteration.append([similarity for _, similarity in tag_similarities])

            # Poprawiona nazwa pliku dla iteracji
            iter_tags_file_path = os.path.join(
                chunk_folder,
                f"{base_filename}_{i+1}.csv"
            )
            save_tags_to_csv(iter_tags_file_path, [tag for tag, _ in tag_similarities], [similarity for _, similarity in tag_similarities])
            iter_bar.update(1)

    best_similarity = -1
    best_tags_file_path = None

    for i, (tags, similarities) in enumerate(zip(all_tags_per_iteration, all_similarities_per_iteration)):
        for tag, similarity in zip(tags, similarities):
            if similarity > best_similarity:
                best_similarity = similarity
                best_tags_file_path = os.path.join(chunk_folder, f"{base_filename}_best_tags.csv")
                save_tags_to_csv(best_tags_file_path, tags, similarities)

    return best_tags_file_path

def initialize_environment(chunk_size):
    """Inicjalizuje środowisko: wybiera tokenizer i model w zależności od chunk_size."""
    tokenizer, model = select_best_tokenizer(chunk_size)
    return tokenizer, model

def read_file_paths(input_file):
    with open(input_file, 'r') as f:
        return [line.strip().replace('.mp3', '.txt') for line in f]

def create_output_folder(model_name):
    safe_model_name = model_name.replace(":", "_").replace("<", "_").replace(">", "_")
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    folder_name = f"{safe_model_name}_{timestamp}"
    output_folder = os.path.join("voiceapp", "output", "tagging_folder", folder_name)
    os.makedirs(output_folder, exist_ok=True)
    return output_folder

def main():
    input_file = 'D:\\Ai\\Audio-Classifier\\voiceapp\\lista-1.txt'
    models_list = [        
        # 'mistral:7b',
        'mistral-small',
        # 'llama3.2:3b',
        # 'qwen2:7b',
        # 'yi',
        'glm4:9b',
        # 'qwen2.5:7b',
        # 'qwen2.5:72b'
    ]

    # Read file paths from the input file
    file_paths = read_file_paths(input_file)

    summary_data = []

    for model_name in models_list:
        print(f"\nProcessing files for model: {model_name}")
        output_folder = create_output_folder(model_name)

        for chunk_size in [128, 512, 1024]:
            print(f"\nProcessing with chunk size: {chunk_size}")

            # Initialize tokenizer and model based on chunk size
            tokenizer, model = initialize_environment(chunk_size)

            with tqdm(total=len(file_paths), desc=f"Processing {model_name}", unit="file") as progress_bar:
                for file_path in file_paths:
                    if os.path.exists(file_path):
                        print(f"Processing file: {file_path}")
                        best_tags_file = process_file(file_path, model_name, output_folder, tokenizer, model, chunk_size)
                        summary_data.append({
                            "Original File Path": file_path,
                            "Model": model_name,
                            "Best Tags File Path": best_tags_file
                        })
                    else:
                        print(f"File not found: {file_path}")
                    progress_bar.update(1)

    # Generate summary CSV
    summary_csv_path = os.path.join("voiceapp", "output", "summary.csv")
    generate_summary_csv(summary_csv_path, summary_data)



In [4]:
def main():
    # Initialize variables
    input_file = 'D:\\Ai\\Audio-Classifier\\voiceapp\\lista-1.txt'
    models_list = [        
        # 'mistral:7b',
        # 'mistral-small',
        #  'llama3.2:3b',
        # 'qwen2:7b',
        # 'yi',
        'glm4:9b',
        # 'qwen2.5:7b',
        # 'qwen2.5:72b'
    ]
    output_folders, tokenizer, model = initialize_environment(models_list)

    # Read file paths
    file_paths = read_file_paths(input_file)

    # Initialize summary tracking
    summary_data = []

    # Process each model sequentially
    for model_name in models_list:
        print(f"\nProcessing all files for model: {model_name}")
        pbar = tqdm(total=len(file_paths), desc=f"Processing {model_name}", unit="file")
        
        for file_path in file_paths:
            output_folder = output_folders[model_name]
            best_tags_file = process_file(file_path, model_name, output_folder, tokenizer, model, iterations=5)
            summary_data.append({
                "Original File Path": file_path,
                "Model": model_name,
                "Tags File Path": best_tags_file
            })

            pbar.update(1)

        pbar.close()

    # Generate summary CSV
    summary_csv_path = 'output_summary.csv'
    generate_summary_csv(summary_csv_path, summary_data)


if __name__ == "__main__":
    main()

ValueError: not enough values to unpack (expected 3, got 2)

In [18]:
def main():
    reference_file = "reference.txt"  # Optional reference file for metrics
    
    # Initialize tokenizer and model
    tokenizer, model = initialize_environment(["sentence-transformers/all-distilroberta-v1"])
    
    # Process each file
    tag_files = []
    for file_path in input_files:
        best_tags_file, _ = process_file_with_metrics(file_path, "model_name", output_folder, tokenizer, model, reference_file=reference_file)
        tag_files.append(best_tags_file)
    
    # Generate summary CSV
    generate_summary_csv(output_folder, input_files, tag_files)

if __name__ == "__main__":
    main()


ValueError: too many values to unpack (expected 2)