In [None]:
import pandas as pd
from transformers import pipeline
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from math import exp
from tqdm import tqdm 

In [None]:
# Read the full training CSV, then keep only its first rows as the evaluation subset.
TRAIN_CSV_PATH = '/kaggle/input/text-classification/train.csv'
EVAL_ROW_LIMIT = 150

train_df1 = pd.read_csv(TRAIN_CSV_PATH)
train_df = train_df1.head(EVAL_ROW_LIMIT)

In [None]:
# Checkpoint identifiers passed to the summarization pipeline, in evaluation order.
models = [
    "facebook/bart-large-cnn",
    "t5-large",
    "sshleifer/distilbart-cnn-12-6",
    "google/pegasus-large",
    "allenai/led-large-16384-arxiv",
    "sshleifer/bart-tiny-random",
]


In [None]:
def semantic_coherence(generated_summary, dialogue):
    """Return the lexical overlap between a generated summary and its dialogue.

    Both texts are lowercased and tokenized with ``word_tokenize``. The score
    is the number of distinct tokens shared by the two texts divided by the
    total number of tokens in the summary (repeated tokens included in the
    denominator).

    Args:
        generated_summary: Summary text produced by the model.
        dialogue: Source dialogue text the summary was generated from.

    Returns:
        The overlap ratio, or 0 when the summary yields no tokens.
    """
    summary_tokens = word_tokenize(generated_summary.lower())
    dialogue_tokens = word_tokenize(dialogue.lower())
    # An empty summary would make the denominator zero; score it as 0 instead
    # of raising ZeroDivisionError, matching the guards in factual_accuracy
    # and content_coverage.
    if not summary_tokens:
        return 0
    common_tokens = set(summary_tokens) & set(dialogue_tokens)
    return len(common_tokens) / len(summary_tokens)


In [None]:
def factual_accuracy(generated_summary, reference_summary):
    """Share of the reference summary's distinct tokens present in the generated one.

    Both texts are lowercased and tokenized with ``word_tokenize``; only the
    sets of distinct tokens are compared.

    Args:
        generated_summary: Summary text produced by the model.
        reference_summary: Reference summary text to compare against.

    Returns:
        ``|generated ∩ reference| / |reference|``, or 0 when the reference
        yields no tokens.
    """
    generated_vocab = set(word_tokenize(generated_summary.lower()))
    reference_vocab = set(word_tokenize(reference_summary.lower()))
    # Guard clause: nothing to match against when the reference is empty.
    if not reference_vocab:
        return 0
    return len(generated_vocab & reference_vocab) / len(reference_vocab)


In [None]:
def content_coverage(generated_summary, dialogue):
    """Share of the dialogue's distinct tokens that also appear in the summary.

    Both texts are lowercased and tokenized with ``word_tokenize``; only the
    sets of distinct tokens are compared.

    Args:
        generated_summary: Summary text produced by the model.
        dialogue: Source dialogue text the summary was generated from.

    Returns:
        ``|summary ∩ dialogue| / |dialogue|``, or 0 when the dialogue yields
        no tokens.
    """
    summary_vocab = set(word_tokenize(generated_summary.lower()))
    dialogue_vocab = set(word_tokenize(dialogue.lower()))
    shared_vocab = summary_vocab.intersection(dialogue_vocab)
    dialogue_vocab_size = len(dialogue_vocab)
    return len(shared_vocab) / dialogue_vocab_size if dialogue_vocab_size else 0


In [None]:
from math import log
def fluency(generated_summary):
    """Return a perplexity-style score computed from the summary's own tokens.

    The summary is lowercased and tokenized with ``word_tokenize``. For each
    distinct token, ``-log(count / N)`` is computed (``N`` is the total token
    count); these values are summed, divided by ``N`` and exponentiated.

    NOTE(review): each distinct token contributes once, regardless of how often
    it occurs. A conventional unigram perplexity would weight each term by the
    token's count; confirm which definition is intended before comparing these
    numbers with externally reported perplexities.

    Args:
        generated_summary: Summary text produced by the model.

    Returns:
        The score as a float, or 0.0 when the summary yields no tokens.
    """
    tokens = word_tokenize(generated_summary.lower())
    # With no tokens the original expression evaluated 0 / 0 and raised
    # ZeroDivisionError; report 0.0 for an empty summary instead.
    if not tokens:
        return 0.0
    token_count = len(tokens)
    token_freq = Counter(tokens)
    total_neg_log_prob = sum(
        -log(count / token_count) for count in token_freq.values()
    )
    return exp(total_neg_log_prob / token_count)

In [None]:
# Accumulator for the evaluation loop: it appends one dict of averaged metrics per model.
evaluation_results_list = []

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download the tokenizer data that `word_tokenize` relies on. Newer NLTK
# releases (3.9 and later) load the 'punkt_tab' resource rather than the older
# pickled 'punkt' models, so fetch both to stay compatible across versions.
nltk.download('punkt')
nltk.download('punkt_tab')



In [None]:
from collections import Counter


In [None]:
def _mean(scores):
    """Return the arithmetic mean of ``scores``, or 0.0 when the list is empty."""
    return sum(scores) / len(scores) if scores else 0.0


# For every checkpoint in `models`: summarize each dialogue in `train_df`,
# score the output with the four metric functions, and append one dict of
# per-model averages to `evaluation_results_list`.
for model_name in models:
    print(f"Evaluating model: {model_name}")
    summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)
    semantic_coherence_scores = []
    factual_accuracy_scores = []
    content_coverage_scores = []
    fluency_scores = []
    generated_summaries = []
    # Iterate the two needed columns directly instead of building a Series per
    # row with iterrows(); zip has no length, so tqdm gets an explicit total.
    dialogue_summary_pairs = zip(train_df['dialogue'], train_df['summary'])
    for dialogue, summary in tqdm(dialogue_summary_pairs, total=len(train_df)):
        # truncation=True clips dialogues that exceed the model's maximum input
        # length instead of letting the model fail on an over-long sequence.
        generated_summary = summarizer(
            dialogue,
            max_length=150,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        generated_summaries.append(generated_summary)
        semantic_coherence_scores.append(semantic_coherence(generated_summary, dialogue))
        factual_accuracy_scores.append(factual_accuracy(generated_summary, summary))
        content_coverage_scores.append(content_coverage(generated_summary, dialogue))
        fluency_scores.append(fluency(generated_summary))
    # _mean guards against an empty evaluation frame (no rows -> no scores).
    evaluation_results_list.append({
        "Model": model_name,
        "Semantic Coherence": _mean(semantic_coherence_scores),
        "Factual Accuracy": _mean(factual_accuracy_scores),
        "Content Coverage": _mean(content_coverage_scores),
        "Fluency": _mean(fluency_scores),
    })
    print("="*50)

In [None]:
# Show how many result entries have been collected (the evaluation loop appends one per model).
print(len(evaluation_results_list))


In [None]:
# Build the results table directly from the list of per-model metric dicts:
# one row per dict, columns in key order, default 0..n-1 index. Unlike
# pd.concat over one-row frames, this also works when the list is empty
# (pd.concat raises ValueError when given no objects).
evaluation_results = pd.DataFrame(evaluation_results_list)


In [None]:
# Write the metric table to a CSV in the working directory, omitting the index column.
output_csv = "evaluation_result.csv"
evaluation_results.to_csv(output_csv, index=False)

In [None]:
# Display the first five rows of the per-model metric table.
evaluation_results.head(5)