In [10]:
import ollama
import os
import time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import re
from datetime import datetime
import csv
import os


# Utility Functions
def get_timestamp():
    """Get the current timestamp."""
    return datetime.now().strftime('%Y%m%d_%H%M%S')


def create_output_folder(model_name):
    """
    Creates a timestamped output folder for a given model name.
    Replaces forbidden characters in the folder name.
    """
    base_output_path = os.path.join("voiceapp", "output")
    summary_folder_path = os.path.join(base_output_path, "summaries_folder")
    
    os.makedirs(base_output_path, exist_ok=True)
    os.makedirs(summary_folder_path, exist_ok=True)

    # Sanitize the model name by replacing forbidden characters
    safe_model_name = re.sub(r'[<>:"/\\|?*]', '_', model_name)

    timestamp = get_timestamp()
    folder_name = f"{safe_model_name}_{timestamp}"

    model_folder_path = os.path.join(summary_folder_path, folder_name)
    os.makedirs(model_folder_path, exist_ok=True)

    return summary_folder_path, model_folder_path


def compute_similarity(text1, text2, tokenizer, model):
    """Compute cosine similarity between two text embeddings."""
    inputs1 = tokenizer(text1, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs2 = tokenizer(text2, return_tensors='pt', padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        emb1 = model(**inputs1).last_hidden_state.mean(dim=1)
        emb2 = model(**inputs2).last_hidden_state.mean(dim=1)

    cosine_sim = torch.nn.functional.cosine_similarity(emb1, emb2)
    return cosine_sim.item()


def load_file_paths(file_path):
    """Load and clean file paths from a given list."""
    with open(file_path, 'r') as f:
        return [line.strip() for line in f.readlines()]


def chunk_text(text, chunk_size=1000):
    """Split text into smaller chunks to fit within LLM token limits."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

def generate_summaries_for_chunk(chunk, model_name, tokenizer, embed_model, output_folder):
    """Generate summary for a single chunk of text."""
    response = ollama.chat(
        model=model_name,
        messages=[{
            "role": "user",
            "content": f"""
                ---
                **"You are a copywriter. Your task is to summarize the provided text according to the following framework:**
                1. **Key Themes**: Identify and explain the main themes or topics discussed in the text.
                2. **Impacts**: Assess the broader impacts, highlighting economic, technological, political, and social dimensions.
                3. **Examples and Evidence**: Draw connections to real-world examples or supporting evidence that underline the key points.
                4. **Opportunities and Risks**: Explore potential opportunities and risks suggested by the text.
                5. **Conclusion**: Summarize the implications and suggest future considerations or actions that align with the insights presented in the text.
                ---
            "{chunk}" """
        }]
    )
    return extract_summary(response)

def create_csv_file(csv_path):
    """Creates a CSV file if it does not exist and writes headers."""
    if not os.path.exists(csv_path):
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['summary_path', 'reference_path']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()


def is_record_in_csv(csv_path, summary_path, reference_path):
    """Checks if the record (summary_path, reference_path) already exists in the CSV."""
    if not os.path.exists(csv_path):
        return False
    
    with open(csv_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['summary_path'] == summary_path and row['reference_path'] == reference_path:
                return True
    return False


def save_to_csv(csv_path, summary_path, reference_path):
    """Appends a record to the CSV file if it does not exist."""
    if not is_record_in_csv(csv_path, summary_path, reference_path):
        with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['summary_path', 'reference_path']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow({'summary_path': summary_path, 'reference_path': reference_path})
    else:
        print(f"Record already exists: {summary_path} -> {reference_path}")

def extract_summary(response):
    """Extract summary content from the model response."""
    if isinstance(response, list):
        for message in response:
            if 'content' in message:
                return message['content']
    elif isinstance(response, dict):
        return response.get('message', {}).get('content', "No content available")
    else:
        return "Error summarizing file"

def generate_summaries(file_path, model_name, tokenizer, embed_model, summary_folder, iterations=5, csv_path=None):
    """Generates summaries for a single file, saves each iteration and appends paths to CSV if not exist."""
    summaries = []
    similarity_scores = []

    with open(file_path, 'r', encoding='utf-8-sig') as f:
        text = f.read()

    # Chunk the text
    chunks = chunk_text(text)

    base_filename = os.path.basename(file_path).split('.txt')[0]

    for chunk in chunks:
        for i in range(1, iterations + 1):
            summary = generate_summaries_for_chunk(chunk, model_name, tokenizer, embed_model, summary_folder)
            summaries.append(summary)

            # Compute similarity
            similarity = compute_similarity(text, summary, tokenizer, embed_model)
            similarity_scores.append(similarity)

            summary_file_path = os.path.join(summary_folder, f"{base_filename}-summary_{i}.txt")
            try:
                with open(summary_file_path, 'w', encoding='utf-8') as f:
                    f.write(summary)

                # Save to CSV
                if csv_path:
                    save_to_csv(csv_path, summary_file_path, file_path)

            except UnicodeEncodeError as e:
                print(f"Encoding error when writing summary {i} for file {file_path}: {e}")

    return summaries


def process_files_for_model(model_name, file_paths, tokenizer, embed_model):
    """Processes all files for a given model and saves results to CSV."""
    # Create both the summaries folder and model folder with timestamp
    summary_folder, model_folder = create_output_folder(model_name)

    # Define the CSV path in the summaries folder (one level back)
    csv_path = os.path.join(summary_folder, 'summaries_list.csv')

    create_csv_file(csv_path)

    pbar = tqdm(total=len(file_paths), desc=f"Summarizing files for {model_name}")

    for file_path in file_paths:
        file_path = file_path.replace('.mp3', '.txt').lstrip()

        if os.path.exists(file_path):
            start_time = time.time()
            summaries = generate_summaries(file_path, model_name, tokenizer, embed_model, model_folder, csv_path=csv_path)

            pbar.update(1)
            pbar.set_postfix({
                'file': os.path.basename(file_path),
                'summaries_count': len(summaries),
                'time': f"{time.time() - start_time:.2f} seconds"
            })

    pbar.close()



In [12]:

def main():
    """Main function to run the program."""
    models = [        
        'mistral:7b',
        'mistral-small',
        'llama3.2:3b',
        'qwen2:7b',
        'yi',
        'glm4:9b',
        'qwen2.5:7b',
    ]
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-distilroberta-v1")
    embed_model = AutoModel.from_pretrained("sentence-transformers/all-distilroberta-v1")
    file_paths = load_file_paths('voiceapp\\lista.txt')

    for model_name in models:
        process_files_for_model(model_name, file_paths, tokenizer, embed_model)


if __name__ == "__main__":
    main()


Summarizing files for mistral:7b:   0%|                                                                                                                                                                    | 0/6 [00:00<?, ?it/s][A

Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_01_2023_01_14-summary_1.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_01_2023_01_14-summary_2.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_01_2023_01_14-summary_3.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_01_2023_01_14-summary_4.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_01_2023_01_14-summary_5.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Record already 


Summarizing files for mistral:7b:  17%|██████████████████████████                                                                                                                                  | 1/6 [00:54<04:32, 54.41s/it][A
Summarizing files for mistral:7b:  17%|██████████████▋                                                                         | 1/6 [00:54<04:32, 54.41s/it, file=CJG_01_2023_01_14.txt, summaries_count=15, time=54.41 seconds][A

Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_01_2023_01_14-summary_5.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_02_2023_01_21-summary_1.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_02_2023_01_21.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_02_2023_01_21-summary_2.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_02_2023_01_21.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_02_2023_01_21-summary_3.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_02_2023_01_21.txt
Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_02_2023_01_21-summary_4.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_02_2023_01_21.txt



Summarizing files for mistral:7b:  33%|█████████████████████████████▎                                                          | 2/6 [01:35<03:06, 46.72s/it, file=CJG_01_2023_01_14.txt, summaries_count=15, time=54.41 seconds][A
Summarizing files for mistral:7b:  33%|█████████████████████████████▎                                                          | 2/6 [01:35<03:06, 46.72s/it, file=CJG_02_2023_01_21.txt, summaries_count=10, time=41.33 seconds][A

Record already exists: voiceapp\output\summaries_folder\mistral_7b_20241208_213239\CJG_02_2023_01_21-summary_5.txt -> C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_02_2023_01_21.txt


KeyboardInterrupt: 

In [None]:
#1 Prompt version
def generate_summaries_for_chunk(chunk, model_name, tokenizer, embed_model, output_folder):
    """Generate summary for a single chunk of text."""
    response = ollama.chat(
        model=model_name,
        messages=[{
            "role": "assistant",
            "content": f"""
                ---
                **"You are a copywriter. Your task is to summarize the provided text according to the following framework:**
                1. **Key Themes**: Identify and explain the main themes or topics discussed in the text.
                2. **Impacts**: Assess the broader impacts, highlighting economic, technological, political, and social dimensions.
                3. **Examples and Evidence**: Draw connections to real-world examples or supporting evidence that underline the key points.
                4. **Opportunities and Risks**: Explore potential opportunities and risks suggested by the text.
                5. **Conclusion**: Summarize the implications and suggest future considerations or actions that align with the insights presented in the text.
                ---
            """,
            "role": "user",
            "content": chunk,
        }]
    )
    return extract_summary(response)

In [None]:
def generate_summaries_for_chunk(chunk, model_name, tokenizer, embed_model, output_folder):
    """Generate summary for a single chunk of text."""
    response = ollama.chat(
        model=model_name,
        messages=[{
            "role": "user",
            "content": f"""
                ---
                **"You are a copywriter. Your task is to summarize the provided text according to the following framework:**
                1. **Key Themes**: Identify and explain the main themes or topics discussed in the text.
                2. **Impacts**: Assess the broader impacts, highlighting economic, technological, political, and social dimensions.
                3. **Examples and Evidence**: Draw connections to real-world examples or supporting evidence that underline the key points.
                4. **Opportunities and Risks**: Explore potential opportunities and risks suggested by the text.
                5. **Conclusion**: Summarize the implications and suggest future considerations or actions that align with the insights presented in the text.
                ---
            "{chunk}" """
        }]
    )
    return extract_summary(response)

In [None]:
def generate_summaries_for_chunk(chunk, model_name, tokenizer, embed_model, output_folder):
    """Generate summary for a single chunk of text."""
    response = ollama.chat(
        model=model_name,
        messages=[{
            "role": "user",
            "content": f"""
                Imagine you are a university professor specializing in social sciences, particularly in the field of technology and its societal impacts. You are tasked with summarizing an academic article about the influence of emerging technologies, such as artificial intelligence (AI) and automation, on global economies and workforces. The article includes case studies, statistical data, and theoretical frameworks.

                Please provide a detailed and well-structured summary of the article, focusing on the following elements:
                
                1. **Introduction**: Explain the topic of the article, including the emerging technologies being discussed and their broader societal and economic implications. Provide context on why this topic is significant.
                2. **Main arguments**: Describe the key discussions or theories presented in the article. Include insights into the challenges and opportunities these technologies create, supported by relevant examples or case studies discussed in the text.
                3. **Findings and conclusions**: Outline the main conclusions of the article, including any recommendations, policy suggestions, or future considerations highlighted by the authors.
                
                Avoid including specific data points, figures, or direct quotes, but ensure that the summary captures the depth and breadth of the article's content. Maintain a formal and academic tone, ensuring the summary is clear, coherent, and suitable for professional use.
                
                ---
                
                **Example of the expected summary:**
                
                1. **Introduction**: This article examines the societal and economic impacts of emerging technologies, particularly artificial intelligence (AI) and automation, focusing on their transformative effects on industries and labor markets worldwide. It highlights the growing significance of these technologies in reshaping global economies and emphasizes the urgency of understanding their implications for workforce adaptation.  
                2. **Main arguments**: The article explores how automation and AI have the potential to displace traditional jobs while simultaneously creating new roles in tech-related fields. It provides case studies from countries like the United States, China, and Germany, showcasing varying strategies for adaptation. Challenges such as skill gaps and economic inequality are discussed alongside opportunities for innovation and efficiency.  
                3. **Findings and conclusions**: The authors conclude that governments and industries must collaborate to address the challenges posed by technological advancements. Policy recommendations include investing in workforce retraining programs, emphasizing STEM education, and creating safety nets for displaced workers. The article underscores the need for proactive measures to ensure that technological progress leads to inclusive economic growth.  

            "{chunk}" """
        }]
    )
    return extract_summary(response)