In [10]:
import pandas as pd
from transformers import pipeline
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from math import exp
from tqdm import tqdm 

In [11]:
train_df1 = pd.read_csv("C:/Users/Abhinandan Sharma/Desktop/topsis_asg/train.csv")
train_df=train_df1.head(150)

In [22]:
models = [
    "facebook/bart-large-cnn",
    "t5-large",
    "sshleifer/distilbart-cnn-12-6",
    "google/pegasus-large",
    "allenai/led-large-16384-arxiv",
    "sshleifer/bart-tiny-random",
  
]

In [13]:
def semantic_coherence(generated_summary, dialogue):
    summary_tokens = word_tokenize(generated_summary.lower())
    dialogue_tokens = word_tokenize(dialogue.lower())
    common_tokens = set(summary_tokens) & set(dialogue_tokens)
    coherence_score = len(common_tokens) / len(summary_tokens)
    return coherence_score


In [14]:
def factual_accuracy(generated_summary, reference_summary):
    gen_tokens = set(word_tokenize(generated_summary.lower()))
    ref_tokens = set(word_tokenize(reference_summary.lower()))
    common_tokens = gen_tokens & ref_tokens
    accuracy_score = len(common_tokens) / len(ref_tokens) if len(ref_tokens) != 0 else 0
    return accuracy_score


In [15]:
def content_coverage(generated_summary, dialogue):
    summary_tokens = set(word_tokenize(generated_summary.lower()))
    dialogue_tokens = set(word_tokenize(dialogue.lower()))
    common_tokens = summary_tokens & dialogue_tokens
    coverage_score = len(common_tokens) / len(dialogue_tokens) if len(dialogue_tokens) != 0 else 0
    return coverage_score


In [16]:
from math import log
def fluency(generated_summary):
    tokens = word_tokenize(generated_summary.lower())
    token_freq = Counter(tokens)
    total_log_prob = sum(-log(token_freq[token] / len(tokens)) for token in token_freq)
    perplexity = exp(total_log_prob / len(tokens))
    return perplexity

In [17]:
evaluation_results_list = []

In [19]:
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt' resource
nltk.download('punkt')



[nltk_data] Downloading package punkt to C:\Users\Abhinandan
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [20]:
from collections import Counter


In [23]:
for model_name in models:
    print(f"Evaluating model: {model_name}")
    summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)
    semantic_coherence_scores = []
    factual_accuracy_scores = []
    content_coverage_scores = []
    fluency_scores = []
    generated_summaries = []
    for index, row in tqdm(train_df.iterrows(), total=len(train_df)):  # Use tqdm to show progress
        dialogue = row['dialogue']
        summary = row['summary']
        generated_summary = summarizer(dialogue, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
        generated_summaries.append(generated_summary)
        coherence_score = semantic_coherence(generated_summary, dialogue)
        semantic_coherence_scores.append(coherence_score)
        accuracy_score = factual_accuracy(generated_summary, summary)
        factual_accuracy_scores.append(accuracy_score)
        coverage_score = content_coverage(generated_summary, dialogue)
        content_coverage_scores.append(coverage_score)
        fluency_score = fluency(generated_summary)
        fluency_scores.append(fluency_score)
    evaluation_results_list.append({
        "Model": model_name,
        "Semantic Coherence": sum(semantic_coherence_scores) / len(semantic_coherence_scores),
        "Factual Accuracy": sum(factual_accuracy_scores) / len(factual_accuracy_scores),
        "Content Coverage": sum(content_coverage_scores) / len(content_coverage_scores),
        "Fluency": sum(fluency_scores) / len(fluency_scores)
    })
    print("="*50)

Evaluating model: facebook/bart-large-cnn


  3%|██▋                                                                               | 5/150 [00:36<18:50,  7.80s/it]Your max_length is set to 150, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  4%|███▎                                                                              | 6/150 [00:41<16:25,  6.84s/it]Your max_length is set to 150, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
  8%|██████▍                                                                          | 12/150 [01:18<14:23,  6.26s/it]Your max_length is set to 150, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are ty

 57%|██████████████████████████████████████████████▍                                  | 86/150 [10:37<09:02,  8.47s/it]Your max_length is set to 150, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
 59%|███████████████████████████████████████████████▌                                 | 88/150 [10:52<08:18,  8.03s/it]Your max_length is set to 150, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
 66%|█████████████████████████████████████████████████████▍                           | 99/150 [12:10<06:02,  7.10s/it]Your max_length is set to 150, but your input_length is only 132. Since this is a summarization task, where outputs shorter than the input are t

Evaluating model: t5-large


Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
  3%|██▋                                                                               | 5/150 [01:40<49:12, 20.36s/it]Your max_length is set to 150, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
  4%|███▎                                                                              | 6/150 [01:59<48:18, 20.13s/it]Your max_length is set to 150, but your input_length is only 91. Since this is a summarization task, where ou

 69%|███████████████████████████████████████████████████████▍                        | 104/150 [31:08<15:07, 19.73s/it]Your max_length is set to 150, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
 73%|██████████████████████████████████████████████████████████▋                     | 110/150 [32:47<11:38, 17.45s/it]Your max_length is set to 150, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
 75%|████████████████████████████████████████████████████████████▎                   | 113/150 [33:32<10:09, 16.48s/it]Your max_length is set to 150, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are t

Evaluating model: sshleifer/distilbart-cnn-12-6


Downloading config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

  3%|██▋                                                                               | 5/150 [00:42<21:38,  8.96s/it]Your max_length is set to 150, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  4%|███▎                                                                              | 6/150 [00:48<19:08,  7.98s/it]Your max_length is set to 150, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
  8%|██████▍                                                                          | 12/150 [01:29<16:54,  7.35s/it]Your max_length is set to 150, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are ty

 57%|██████████████████████████████████████████████▍                                  | 86/150 [11:27<09:05,  8.52s/it]Your max_length is set to 150, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
 59%|███████████████████████████████████████████████▌                                 | 88/150 [11:45<09:05,  8.79s/it]Your max_length is set to 150, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
 66%|█████████████████████████████████████████████████████▍                           | 99/150 [13:12<06:00,  7.07s/it]Your max_length is set to 150, but your input_length is only 132. Since this is a summarization task, where outputs shorter than the input are t

Evaluating model: google/pegasus-large


Downloading config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

  3%|██▋                                                                               | 5/150 [01:15<33:56, 14.05s/it]Your max_length is set to 150, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
  4%|███▎                                                                              | 6/150 [01:27<31:59, 13.33s/it]Your max_length is set to 150, but your input_length is only 82. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
  5%|████▎                                                                             | 8/150 [02:11<44:35, 18.84s/it]Your max_length is set to 150, but your input_length is only 146. Since this is a summarization task, where outputs shorter than the input are ty

 52%|██████████████████████████████████████████                                       | 78/150 [23:11<36:06, 30.09s/it]Your max_length is set to 150, but your input_length is only 74. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
 55%|████████████████████████████████████████████▎                                    | 82/150 [24:35<27:39, 24.41s/it]Your max_length is set to 150, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
 57%|██████████████████████████████████████████████▍                                  | 86/150 [25:43<20:33, 19.28s/it]Your max_length is set to 150, but your input_length is only 128. Since this is a summarization task, where outputs shorter than the input are ty

100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [42:32<00:00, 17.01s/it]


Evaluating model: allenai/led-large-16384-arxiv


Downloading config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFLEDForConditionalGeneration.

Some layers of TFLEDForConditionalGeneration were not initialized from the model checkpoint at allenai/led-large-16384-arxiv and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading generation_config.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

  2%|█▌                                                                              | 3/150 [04:19<3:29:16, 85.42s/it]



  3%|██▏                                                                             | 4/150 [05:42<3:25:55, 84.63s/it]



  3%|██▋                                                                             | 5/150 [07:05<3:22:32, 83.81s/it]Your max_length is set to 150, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  4%|███▏                                                                            | 6/150 [08:46<3:35:17, 89.71s/it]Your max_length is set to 150, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
  8%|██████▎                                                                        | 12/150 [17:54<3:33:55, 93.01s/it]Your max_length is set to 150, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are ty

 57%|████████████████████████████████████████████▏                                | 86/150 [2:11:43<1:28:57, 83.40s/it]Your max_length is set to 150, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
 59%|█████████████████████████████████████████████▏                               | 88/150 [2:14:30<1:26:32, 83.75s/it]Your max_length is set to 150, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
 66%|██████████████████████████████████████████████████▊                          | 99/150 [2:29:49<1:11:29, 84.12s/it]Your max_length is set to 150, but your input_length is only 132. Since this is a summarization task, where outputs shorter than the input are t

Evaluating model: sshleifer/bart-tiny-random


Downloading config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.11M [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

  3%|██▋                                                                               | 5/150 [00:03<01:39,  1.46it/s]Your max_length is set to 150, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  4%|███▎                                                                              | 6/150 [00:04<01:35,  1.50it/s]Your max_length is set to 150, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
  8%|██████▍                                                                          | 12/150 [00:07<01:23,  1.64it/s]Your max_length is set to 150, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are ty

 52%|██████████████████████████████████████████                                       | 78/150 [00:46<00:47,  1.53it/s]Your max_length is set to 150, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
 55%|████████████████████████████████████████████▎                                    | 82/150 [00:49<00:41,  1.65it/s]Your max_length is set to 150, but your input_length is only 85. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
 57%|██████████████████████████████████████████████▍                                  | 86/150 [00:51<00:40,  1.57it/s]Your max_length is set to 150, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are ty






In [24]:
print(len(evaluation_results_list))

7


In [25]:
result = pd.concat([pd.DataFrame(item, index=[0]) for item in evaluation_results_list], ignore_index=True)

In [26]:
result.to_csv("result.csv", index=False)

In [28]:
result.head(6)

Unnamed: 0,Model,Semantic Coherence,Factual Accuracy,Content Coverage,Fluency
0,facebook/bart-large-cnn,0.655644,0.458349,0.371606,13.81577
1,facebook/bart-large-cnn,0.655644,0.458349,0.371606,13.81577
2,t5-large,0.754954,0.457265,0.409083,15.751138
3,sshleifer/distilbart-cnn-12-6,0.707827,0.521906,0.484371,15.687915
4,google/pegasus-large,0.706821,0.420956,0.364438,11.779998
5,allenai/led-large-16384-arxiv,0.170011,0.29244,0.190508,3.732631
