In [2]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.0-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   --------------------------------- ------ 9.2/11.1 MB 57.1 MB/s eta 0:00:01
   ---------------------------------------- 11.1/11.1 MB 46.3 MB/s eta 0:00:00
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.6.0 threadpoolctl-3.5.0


DEPRECATION: Loading egg at d:\anaconda3\envs\od_zera_do_ai\lib\site-packages\docopt_ng-0.9.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330
DEPRECATION: Loading egg at d:\anaconda3\envs\od_zera_do_ai\lib\site-packages\sumy-0.11.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [8]:
from tqdm import tqdm
import os
import pandas as pd
from transformers import BertTokenizer, BertModel
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import re
import torch

# Inicjalizacja modelu i tokenizatora
bert_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertModel.from_pretrained(bert_model_name)

def preprocess_text(text):
    """
    Usuwa niepotrzebne formatowanie, takie jak nowe linie i znaki specjalne.
    """
    text = re.sub(r'\*\*|\n', '', text)  # Usuń podwójne gwiazdki i nowe linie
    text = re.sub(r'[^\w\s]', ' ', text)  # Usuń interpunkcję
    return text.strip()

def tokenize_text(text):
    """
    Tokenizuje i normalizuje tekst.
    """
    return word_tokenize(preprocess_text(text))

def calculate_meteor(summary_tokens, reference_tokens):
    """
    Oblicza METEOR na podstawie tokenów.
    """
    try:
        reference_sentence = " ".join(reference_tokens)
        summary_sentence = " ".join(summary_tokens)
        return meteor_score([reference_sentence], summary_sentence)
    except Exception as e:
        print(f"Błąd w METEOR: {e}")
        return None

def calculate_bertscore(summary_text, reference_text, tokenizer, model):
    """
    Oblicza BERTScore przy użyciu tokenizatora i modelu z transformers.
    """
    try:
        # Tokenizacja i przekształcanie na tensory
        summary_tokens = tokenizer(summary_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        reference_tokens = tokenizer(reference_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Wyciąganie embeddings z modelu
        with torch.no_grad():
            summary_embeddings = model(**summary_tokens).last_hidden_state.mean(dim=1)
            reference_embeddings = model(**reference_tokens).last_hidden_state.mean(dim=1)

        # Obliczanie podobieństwa kosinusowego
        similarity = torch.nn.functional.cosine_similarity(summary_embeddings, reference_embeddings)
        return similarity.item()
    except Exception as e:
        print(f"Błąd w BERTScore: {e}")
        return None

def calculate_metrics(summary_path, reference_path):
    """
    Oblicza METEOR i BERTScore dla danego podsumowania i referencji.
    """
    try:
        with open(summary_path, 'r', encoding='utf-8') as f:
            summary_text = preprocess_text(f.read())
        with open(reference_path, 'r', encoding='utf-8') as f:
            reference_text = preprocess_text(f.read())

        # Tokenizacja
        summary_tokens = tokenize_text(summary_text)
        reference_tokens = tokenize_text(reference_text)

        if not summary_tokens or not reference_tokens:
            print(f"Błąd: Puste tokeny w plikach {summary_path} lub {reference_path}.")
            return {"METEOR": None, "BERTScore": None}

        # Obliczanie metryk
        meteor = calculate_meteor(summary_tokens, reference_tokens)
        bert_score = calculate_bertscore(summary_text, reference_text, tokenizer, model)

        return {"METEOR": meteor, "BERTScore": bert_score}

    except Exception as e:
        print(f"Błąd podczas obliczania metryk dla {summary_path}: {e}")
        return {"METEOR": None, "BERTScore": None}

def process_files(csv_path):
    """
    Przetwarza podsumowania i referencje wymienione w pliku CSV.
    """
    data = pd.read_csv(csv_path)
    results = []

    # Pasek postępu z tqdm
    for _, row in tqdm(data.iterrows(), total=len(data), desc="Przetwarzanie plików"):
        summary_path = row['summary_path']
        reference_path = row['reference_path']

        # Sprawdź, czy pliki istnieją
        if not os.path.exists(summary_path) or not os.path.exists(reference_path):
            print(f"Błąd: Nie znaleziono pliku {summary_path} lub {reference_path}.")
            results.append({"summary_path": summary_path, "reference_path": reference_path, 
                            "METEOR": None, "BERTScore": None})
            continue

        # Oblicz metryki
        metrics = calculate_metrics(summary_path, reference_path)
        metrics["summary_path"] = summary_path
        metrics["reference_path"] = reference_path
        results.append(metrics)

    return pd.DataFrame(results)

def save_metrics_to_csv(metrics_df, output_file="wyniki_metryk.csv"):
    """
    Zapisuje wyniki metryk do pliku CSV.
    """
    metrics_df.to_csv(output_file, index=False)
    print(f"Wyniki zapisano w {output_file}")

def get_best_candidates(metrics_df):
    """
    Wybiera najlepsze podsumowanie dla każdej referencji na podstawie średnich wyników.
    """
    # Usuwanie wierszy z brakującymi wartościami
    metrics_df = metrics_df.dropna(subset=["METEOR", "BERTScore"])
    metrics_df["Average_Score"] = metrics_df[["METEOR", "BERTScore"]].mean(axis=1)

    # Znajdź najlepsze podsumowanie dla każdej referencji
    best_candidates = metrics_df.loc[
        metrics_df.groupby("reference_path")["Average_Score"].idxmax()
    ]
    return best_candidates

def save_best_candidates_to_csv(best_candidates, output_file="best_candidates.csv"):
    """
    Zapisuje najlepsze podsumowania do pliku CSV.
    """
    best_candidates.to_csv(output_file, index=False)
    print(f"Najlepsze podsumowania zapisano w {output_file}")

if __name__ == "__main__":
    # Konfiguracja wejścia i wyjścia
    input_csv =  "voiceapp/output/summaries_folder/summaries_list.csv" # CSV z kolumnami 'summary_path', 'reference_path'
    metrics_output_csv = "wyniki_metryk.csv"
    best_candidates_output_csv = "best_candidates.csv"

    # Przetwarzanie plików
    print("Rozpoczynam przetwarzanie plików...")
    metrics_df = process_files(input_csv)

    # Zapis wyników metryk
    save_metrics_to_csv(metrics_df, metrics_output_csv)

    # Wybór najlepszych podsumowań
    best_candidates = get_best_candidates(metrics_df)
    save_best_candidates_to_csv(best_candidates, best_candidates_output_csv)

    print("Przetwarzanie zakończone!")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\G\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Processing files and calculating metrics...
Processing: voiceapp\output\summaries_folder\mistral_7b_20241213_063132\CJG_01_2023_01_14-summary_1.txt, C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Error calculating metrics for voiceapp\output\summaries_folder\mistral_7b_20241213_063132\CJG_01_2023_01_14-summary_1.txt: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): 1 Key Themes The key themes discussed in this text revolve around economic knowledge political views and hidden matters that influence global perspectives 2 Impacts This text has significant impacts on the economic technological political and social dimensions Economically it encourages understanding the impact of money on societal structures and decision making processes Technologically it does not explicitly discuss technology but implies the role of digital currencies like Bitcoin Politically it challenges the listener s preconceived notions about politics by highlighting the 

In [3]:

# Main script to process and calculate BERTScore for each summary-reference pair
if __name__ == "__main__":
    # Path to the CSV file containing pairs of summary and reference paths
    csv_path = "voiceapp/output/summaries_folder/summaries_list.csv"

    # Process the files and calculate BERTScores
    metrics_df = process_files(csv_path)

    # Save the calculated metrics to CSV
    save_metrics_to_csv(metrics_df)

    # Get the best candidates for each reference and save them to CSV
    best_candidates = get_best_candidates(metrics_df)
    save_best_candidates_to_csv(best_candidates)
   

Processing: voiceapp\output\summaries_folder\mistral_7b_20241213_063132\CJG_01_2023_01_14-summary_1.txt, C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\CJG_01_2023_01_14.txt
Error calculating metrics for voiceapp\output\summaries_folder\mistral_7b_20241213_063132\CJG_01_2023_01_14-summary_1.txt: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): 1 Key Themes The key themes discussed in this text revolve around economic knowledge political views and hidden matters that influence global perspectives 2 Impacts This text has significant impacts on the economic technological political and social dimensions Economically it encourages understanding the impact of money on societal structures and decision making processes Technologically it does not explicitly discuss technology but implies the role of digital currencies like Bitcoin Politically it challenges the listener s preconceived notions about politics by highlighting the close relationship between politics and busi

In [19]:
metrics_df.sample()

Unnamed: 0,METEOR,BERTScore,summary_path,reference_path
41,,,voiceapp\output\summaries_folder\mistral-small...,C:/Users/G/Documents/GitHub/audycje.com.pl/con...


In [28]:
with open("voiceapp/lista.txt", "r") as f:
    references = f.readlines()
    print(references)

['C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\\CJG_01_2023_01_14.mp3\n', 'C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\\CJG_02_2023_01_21.mp3\n', 'C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\\CJG_03_2023_01_28.mp3\n', 'C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\\CJG_04_2023_02_04.mp3\n', 'C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\\CJG_05_2023_02_11.mp3\n', 'C:/Users/G/Documents/GitHub/audycje.com.pl/content/audio\\CJG_06_2023_02_18.mp3\n']
