In [5]:
import os
import csv
import time
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import ollama
from difflib import SequenceMatcher

# Load a pre-trained model for semantic similarity
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-distilroberta-v1")
model = AutoModel.from_pretrained("sentence-transformers/all-distilroberta-v1")

# Function to compute embeddings
def compute_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings

# Function to compute cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
    similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2)
    return similarity.item()

# Function to split text into chunks of approximately 100 words
def split_into_chunks(text, chunk_size=100):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Function to chunk the tags list to avoid exceeding the token limit
def chunk_tags(tag_list, max_tags_per_chunk=100):
    return [tag_list[i:i + max_tags_per_chunk] for i in range(0, len(tag_list), max_tags_per_chunk)]

# Helper to get the current timestamp
def get_timestamp():
    return time.strftime("%Y%m%d_%H%M%S")

# Function to create a folder based on the model name and timestamp
def create_output_folder(model_name):
    """
    Creates a timestamped output folder for a given model name.
    Replaces forbidden characters in the folder name.
    """
    # Create the base output and tagging folders
    base_output_path = os.path.join("voiceapp", "output")
    combined_tags_folder_path = os.path.join(base_output_path, "combined_tags_folder")
    
    os.makedirs(base_output_path, exist_ok=True)
    os.makedirs(combined_tags_folder_path, exist_ok=True)

    # Sanitize the model name by replacing forbidden characters with '_'
    safe_model_name = re.sub(r'[<>:"/\\|?*]', '_', model_name)

    # Generate a timestamp and create the final folder name
    timestamp = get_timestamp()
    folder_name = f"{safe_model_name}_{timestamp}"

    # Create the model-specific folder
    model_folder_path = os.path.join(combined_tags_folder_path, folder_name)
    os.makedirs(model_folder_path, exist_ok=True)

    return model_folder_path

# Step 1: Combine all tags into combine_tags.csv
def combine_tags_from_files(lista_path, output_folder):
    combined_tags = set()
    with open(lista_path, 'r') as f:
        file_paths = [line.strip() for line in f.readlines()]

    total_files = len(file_paths)
    pbar = tqdm(total=total_files, desc="Combining tags from files")

    for file_path in file_paths:
        file_path = file_path.replace('.mp3', '_best_tags.csv')
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    if 'tag_name' in row:
                        combined_tags.add(row['tag_name'].strip())
        pbar.update(1)
    pbar.close()

    # Write to combine_tags.csv
    output_path = os.path.join(output_folder, 'combine_tags.csv')
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['tag_name'])
        for tag in sorted(combined_tags):  # Sort for consistency
            writer.writerow([tag])
    return output_path

# Step 2: Clean tags
def clean_tags(input_path, output_folder):
    def is_similar(word1, word2, threshold=0.8):
        return SequenceMatcher(None, word1, word2).ratio() > threshold

    cleaned_tags = []
    with open(input_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        tags = [row['tag_name'].strip() for row in reader if 'tag_name' in row]

    # Remove duplicates and similar tags
    for tag in tags:
        cleaned_tag = tag.strip()
        if cleaned_tag not in cleaned_tags and len(cleaned_tag) <= 50 and re.match("^[a-zA-Z0-9_-]+$", cleaned_tag):
            if all(not is_similar(cleaned_tag, existing_tag) for existing_tag in cleaned_tags):
                cleaned_tags.append(cleaned_tag)

    # Write cleaned tags to a new file
    output_path = os.path.join(output_folder, 'cleaned_tags.csv')
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['tag_name'])
        for tag in sorted(cleaned_tags):  # Sort for consistency
            writer.writerow([tag])
    return output_path

# Step 3: Generate relevant tags using cleaned tags for each file
def generate_tags_with_llm(lista_path, cleaned_tags_path, output_folder, model_name):
    # Load predefined tags
    with open(cleaned_tags_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        predefined_tags = [row['tag_name'].strip() for row in reader if 'tag_name' in row]

    max_tags = 50
    predefined_tags = predefined_tags[:max_tags]

    with open(lista_path, 'r') as f:
        file_paths = [line.strip().replace('.mp3', '.txt') for line in f.readlines()]

    total_files = len(file_paths)
    pbar = tqdm(total=total_files, desc=f"Generating tags with LLM ({model_name})")

    for file_path in file_paths:
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                original_text = f.read()

            chunks = split_into_chunks(original_text, chunk_size=100)
            all_tags_with_similarity = []
            used_tags = set()  # Track tags that have already been used for this file

            for chunk in chunks:
                # Filter predefined_tags to exclude used tags
                available_tags = [tag for tag in predefined_tags if tag not in used_tags]

                # If no tags are left, skip further processing for this chunk
                if not available_tags:
                    break

                response = ollama.chat(
                    model=model_name,
                    messages=[{
                        "role": "user",
                        "content": f"""
                        You are a professional tagger. Your task is to analyze the given text and select exactly 3 relevant tags from the provided list of tags.

                        Instructions:
                        1. Do not summarize the text.
                        2. Only select 3 tags from the provided list.
                        3. Do not provide any extra text or explanations, just return the 3 tags.
                        4. Separate the tags with commas (without spaces or other punctuation).

                        Select 3 tags from the following list:
                        {', '.join(available_tags)}

                        Text:
                        "{chunk}"
                        """
                    }]
                )

                tags = []
                if isinstance(response, list):
                    for message in response:
                        if 'content' in message:
                            tags = [tag.strip() for tag in message['content'].strip().split(',')]
                            break
                elif isinstance(response, dict):
                    tags = [tag.strip() for tag in response.get('message', {}).get('content', "No content available").strip().split(',')]
                else:
                    tags = ["error", "generating", "tags"]

                # Update used tags to ensure no duplicates are generated for this file
                for tag in tags:
                    if tag in available_tags:
                        used_tags.add(tag)

                # Compute cosine similarity for each tag
                chunk_embedding = compute_embeddings([chunk])
                for tag in tags:
                    tag_embedding = compute_embeddings([tag])
                    similarity = compute_cosine_similarity(tag_embedding, chunk_embedding)
                    all_tags_with_similarity.append((tag, similarity))

            unique_tags_with_similarity = list(set(all_tags_with_similarity))

            output_csv_path = os.path.join(output_folder, os.path.basename(file_path).replace('.txt', '_tag_final.csv'))
            with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['tag_name', 'cosine_similarity'])
                for tag, similarity in sorted(unique_tags_with_similarity, key=lambda x: x[1], reverse=True):
                    writer.writerow([tag.strip(), f"{similarity:.4f}"])

        pbar.update(1)

    pbar.close()




In [6]:
def main():
    # Define base folder paths
    base_folder = "voiceapp/output"
    model_name = "mistral-small"

    # Step 1: Create output folder for the model
    output_folder = create_output_folder(model_name)

    # Step 2: Combine tags from all *_best_tags.csv files
    print("Combining tags from all *_best_tags.csv files...")
    combine_tags_path = combine_tags_from_files(base_folder, output_folder)
    print(f"Combined tags saved at: {combine_tags_path}")

    # Step 3: Clean combined tags
    print("Cleaning combined tags...")
    cleaned_tags_path = clean_tags(combine_tags_path, output_folder)
    print(f"Cleaned tags saved at: {cleaned_tags_path}")

    # Step 4: Generate tags for input files
    lista_path = os.path.join(base_folder, "lista.txt")  # Ensure this file exists
    if os.path.exists(lista_path):
        print("Generating relevant tags for each file in lista.txt...")
        generate_tags_with_llm(lista_path, cleaned_tags_path, output_folder, model_name)
        print(f"Tag generation completed. Output saved in: {output_folder}")
    else:
        print(f"File 'lista.txt' not found at: {lista_path}. Please create this file with paths to input files.")

if __name__ == "__main__":
    main()


Processing model: mistral:7b


Combining tags from files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 97/97 [00:00<00:00, 308.07it/s]
Generating tags with LLM (mistral:7b):   2%|███                                                                                                                                                   | 2/97 [00:07<05:54,  3.74s/it]

KeyboardInterrupt: 

In [1]:
import os
import csv
import time
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import ollama
from difflib import SequenceMatcher

# Helper to get the current timestamp
def get_timestamp():
    return time.strftime("%Y%m%d_%H%M%S")

# Function to create a folder based on the model name and timestamp
def create_output_folder(model_name):
    """
    Creates a timestamped output folder for a given model name.
    Replaces forbidden characters in the folder name.
    """
    # Create the base output and tagging folders
    base_output_path = os.path.join("voiceapp", "output")
    combined_tags_folder_path = os.path.join(base_output_path, "combined_tags_folder")
    
    os.makedirs(base_output_path, exist_ok=True)
    os.makedirs(combined_tags_folder_path, exist_ok=True)

    # Sanitize the model name by replacing forbidden characters with '_'
    safe_model_name = re.sub(r'[<>:"/\\|?*]', '_', model_name)

    # Generate a timestamp and create the final folder name
    timestamp = get_timestamp()
    folder_name = f"{safe_model_name}_{timestamp}"

    # Create the model-specific folder
    model_folder_path = os.path.join(combined_tags_folder_path, folder_name)
    os.makedirs(model_folder_path, exist_ok=True)

    return model_folder_path


# Load a pre-trained model for semantic similarity
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-distilroberta-v1")
model = AutoModel.from_pretrained("sentence-transformers/all-distilroberta-v1")

# Function to compute embeddings
def compute_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings

# Function to compute cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
    similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2)
    return similarity.item()

# Function to split text into chunks of approximately 100 words
def split_into_chunks(text, chunk_size=100):
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Function to recursively search for CSV files containing '_best_tags.csv'
def collect_tags_from_files(root_folder):
    combined_tags = set()
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('_best_tags.csv'):
                file_path = os.path.join(dirpath, filename)
                with open(file_path, 'r', encoding='utf-8') as csvfile:
                    reader = csv.DictReader(csvfile)
                    for row in reader:
                        if 'tag_name' in row:
                            combined_tags.add(row['tag_name'].strip())
    return combined_tags

import re
from difflib import SequenceMatcher

# Funkcja czyszcząca tagi
def clean_tags(tags):
    def is_similar(word1, word2, threshold=0.8):
        """Sprawdza, czy dwa tagi są podobne na podstawie współczynnika podobieństwa."""
        return SequenceMatcher(None, word1, word2).ratio() > threshold

    cleaned_tags = []
    
    for tag in tags:
        # Konwertujemy tag na małe litery
        cleaned_tag = tag.strip().lower()
        
        # Zastępujemy wszystkie znaki specjalne (z wyjątkiem ostatniego znaku) na "_"
        if len(cleaned_tag) > 1:
            cleaned_tag = re.sub(r'[^a-zA-Z0-9_-]', '_', cleaned_tag[:-1]) + cleaned_tag[-1]
        else:
            cleaned_tag = re.sub(r'[^a-zA-Z0-9_-]', '_', cleaned_tag)

        # Sprawdzamy, czy tag jest unikalny i czy spełnia warunki długości
        if cleaned_tag not in cleaned_tags and len(cleaned_tag) <= 50 and re.match("^[a-zA-Z0-9_-]+$", cleaned_tag):
            if all(not is_similar(cleaned_tag, existing_tag) for existing_tag in cleaned_tags):
                cleaned_tags.append(cleaned_tag)
    
    return cleaned_tags

# Function to save cleaned tags to a CSV file
def save_cleaned_tags(cleaned_tags, output_folder):
    output_path = os.path.join(output_folder, 'cleaned_tags.csv')
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['tag_name'])
        for tag in sorted(cleaned_tags):  # Sort for consistency
            writer.writerow([tag])
    return output_path

# Function to generate tags with LLM
def generate_tags_with_llm(lista_path, cleaned_tags_path, output_folder, model_name):
    # Load predefined tags
    with open(cleaned_tags_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        predefined_tags = [row['tag_name'].strip() for row in reader if 'tag_name' in row]

    max_tags = 50
    predefined_tags = predefined_tags[:max_tags]

    # Read file paths from lista.txt and change .mp3 to .txt
    with open(lista_path, 'r') as f:
        file_paths = [line.strip().replace('.mp3', '.txt') for line in f.readlines()]

    total_files = len(file_paths)
    pbar = tqdm(total=total_files, desc=f"Generating tags with LLM ({model_name})")
    
    for file_path in file_paths:
        if os.path.exists(file_path):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    original_text = f.read()

                # Split the text into chunks
                chunks = split_into_chunks(original_text, chunk_size=100)
                all_tags_with_similarity = []
                used_tags = set()

                for chunk in chunks:
                    available_tags = [tag for tag in predefined_tags if tag not in used_tags]
                    if not available_tags:
                        break

                    response = ollama.chat(
                        model=model_name,
                        messages=[{
                            "role": "user",
                            "content": f"""
                        You are a professional tagger. Your task is to analyze the given text and select best relevant tags from the provided list of tags.

                        Instructions:
                        1. Do not summarize the text.
                        2. Only select tags from the provided list.
                        3. Do not provide any extra text or explanations, just return the best tags.
                        4. Separate the tags with commas (without spaces or other punctuation).

                        Select 3 tags from the following list:
                        {', '.join(available_tags)}

                        Text:
                        "{chunk}"
                        """
                        }]
                    )

                    tags = response.get('message', {}).get('content', "No content available").strip().split(',')
                    for tag in tags:
                        if tag in available_tags:
                            used_tags.add(tag)

                    chunk_embedding = compute_embeddings([chunk])
                    for tag in tags:
                        tag_embedding = compute_embeddings([tag])
                        similarity = compute_cosine_similarity(tag_embedding, chunk_embedding)
                        all_tags_with_similarity.append((tag, similarity))

                unique_tags_with_similarity = list(set(all_tags_with_similarity))
                output_csv_path = os.path.join(output_folder, os.path.basename(file_path).replace('.txt', '_tags_final.csv'))
                with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(['tag_name', 'cosine_similarity'])
                    for tag, similarity in sorted(unique_tags_with_similarity, key=lambda x: x[1], reverse=True):
                        writer.writerow([tag.strip(), f"{similarity:.4f}"])
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
        else:
            print(f"File {file_path} not found. Skipping.")
        
        pbar.update(1)
    pbar.close()

# Main function
def main():
    model_name = "mistral-small"  # Model name (Mistral)

    # Step 1: Create output folder
    print("Creating output folder...")
    output_folder = create_output_folder(model_name)

    root_folder = "voiceapp/output/tagging_folder"  # Folder containing '_best_tags.csv' files
    lista_path = "voiceapp/lista-1.txt"  # Path to lista.txt

    # Step 2: Collect tags from files
    print("Collecting tags from files...")
    combined_tags = collect_tags_from_files(root_folder)
    print(f"Collected {len(combined_tags)} tags.")

    # Step 3: Clean the tags
    print("Cleaning tags...")
    cleaned_tags = clean_tags(combined_tags)
    print(f"Cleaned {len(cleaned_tags)} tags.")

    # Step 4: Save cleaned tags to CSV
    cleaned_tags_path = save_cleaned_tags(cleaned_tags, output_folder)
    print(f"Cleaned tags saved to {cleaned_tags_path}")

    # Step 5: Generate tags for each transcription
    print("Generating tags with LLM...")
    generate_tags_with_llm(lista_path, cleaned_tags_path, output_folder, model_name)
    print("Tag generation completed.")

if __name__ == "__main__":
    main()


Creating output folder...
Collecting tags from files...
Collected 851 tags.
Cleaning tags...
Cleaned 716 tags.
Cleaned tags saved to voiceapp\output\combined_tags_folder\mistral-small_20241221_005553\cleaned_tags.csv
Generating tags with LLM...


Generating tags with LLM (mistral-small):  58%|██████████▍       | 56/97 [08:36<02:55,  4.28s/it]

File C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - dĹ‚ugi 1.txt not found. Skipping.


Generating tags with LLM (mistral-small):  73%|█████████████▏    | 71/97 [09:19<01:23,  3.23s/it]

File C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - los klasy Ĺ›redniej 1.txt not found. Skipping.


Generating tags with LLM (mistral-small):  75%|█████████████▌    | 73/97 [09:21<00:54,  2.26s/it]

File C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - mieszkania dla mĹ‚odych 1.txt not found. Skipping.


Generating tags with LLM (mistral-small):  89%|███████████████▉  | 86/97 [09:55<00:31,  2.91s/it]

File C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - upadek bankĂłw 1.txt not found. Skipping.


Generating tags with LLM (mistral-small):  94%|████████████████▉ | 91/97 [10:06<00:14,  2.46s/it]

File C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel cz1.txt not found. Skipping.
File C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - wÄ™giel po zimie 1.txt not found. Skipping.


Generating tags with LLM (mistral-small): 100%|██████████████████| 97/97 [10:18<00:00,  6.38s/it]

File C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\shorts\CJG - Ĺ‚apĂłwki 1.txt not found. Skipping.
Tag generation completed.



