In [1]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider

In [2]:
df1 = pd.read_csv('output_Qwen.csv')
df2 = pd.read_csv('test_df.csv')

In [3]:
df1 = df1['content'].tolist()
df2 = df2['content'].tolist()

In [4]:
df1 = df1[:500]
len(df1)

500

In [5]:
df2 = df2[:500]
len(df2)

500

In [6]:
def calculate_bleu(reference, candidate):
    smooth = SmoothingFunction().method4
    bleu_1 = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0), smoothing_function=smooth)
    bleu_2 = sentence_bleu([reference], candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
    bleu_3 = sentence_bleu([reference], candidate, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth)
    bleu_4 = sentence_bleu([reference], candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
    return bleu_1, bleu_2, bleu_3, bleu_4

rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
cider_scorer = Cider()

In [7]:
bleu_scores = []
rouge_scores = []
cider_scores = []

In [8]:
for ref, cand in zip(df1, df2):
    # Tokenize reference and candidate texts
    ref_tokens = ref.split()
    cand_tokens = cand.split()

    # Calculate BLEU scores
    bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu(ref_tokens, cand_tokens)
    bleu_scores.append((bleu_1, bleu_2, bleu_3, bleu_4))

    # Calculate ROUGE scores
    rouge_scores_sample = rouge_scorer_obj.score(ref, cand)
    rouge_scores.append(rouge_scores_sample)

    # Calculate CIDEr scores (CIDEr works with the format {idx: [text]})
    cider_score, _ = cider_scorer.compute_score({0: [ref]}, {0: [cand]})
    cider_scores.append(cider_score)

In [9]:
for i in range(len(df1)):
    print(f"Sample {i+1}:")
    print(f"BLEU-1: {bleu_scores[i][0]:.4f}, BLEU-2: {bleu_scores[i][1]:.4f}, BLEU-3: {bleu_scores[i][2]:.4f}, BLEU-4: {bleu_scores[i][3]:.4f}")
    print(f"ROUGE-1: {rouge_scores[i]['rouge1'].fmeasure:.4f}, ROUGE-2: {rouge_scores[i]['rouge2'].fmeasure:.4f}, ROUGE-L: {rouge_scores[i]['rougeL'].fmeasure:.4f}")
    print(f"CIDEr: {cider_scores[i]:.4f}\n")

Sample 1:
BLEU-1: 0.3951, BLEU-2: 0.3951, BLEU-3: 0.3951, BLEU-4: 0.3951
ROUGE-1: 0.6977, ROUGE-2: 0.6829, ROUGE-L: 0.6977
CIDEr: 0.0000

Sample 2:
BLEU-1: 0.6575, BLEU-2: 0.6575, BLEU-3: 0.6575, BLEU-4: 0.6575
ROUGE-1: 0.8169, ROUGE-2: 0.8116, ROUGE-L: 0.8169
CIDEr: 0.0000

Sample 3:
BLEU-1: 0.4204, BLEU-2: 0.4204, BLEU-3: 0.4204, BLEU-4: 0.4204
ROUGE-1: 0.6977, ROUGE-2: 0.6829, ROUGE-L: 0.6977
CIDEr: 0.0000

Sample 4:
BLEU-1: 0.5220, BLEU-2: 0.5220, BLEU-3: 0.5220, BLEU-4: 0.5220
ROUGE-1: 0.7547, ROUGE-2: 0.7451, ROUGE-L: 0.7547
CIDEr: 0.0000

Sample 5:
BLEU-1: 0.6286, BLEU-2: 0.6286, BLEU-3: 0.6286, BLEU-4: 0.6286
ROUGE-1: 0.8000, ROUGE-2: 0.7937, ROUGE-L: 0.8000
CIDEr: 0.0000

Sample 6:
BLEU-1: 0.6179, BLEU-2: 0.6179, BLEU-3: 0.6179, BLEU-4: 0.6179
ROUGE-1: 0.8000, ROUGE-2: 0.7937, ROUGE-L: 0.8000
CIDEr: 0.0000

Sample 7:
BLEU-1: 0.3385, BLEU-2: 0.3385, BLEU-3: 0.3385, BLEU-4: 0.3385
ROUGE-1: 0.6486, ROUGE-2: 0.6286, ROUGE-L: 0.6486
CIDEr: 0.0000

Sample 8:
BLEU-1: 0.3067, BLEU-2: 

In [10]:
total_bleu = [0, 0, 0, 0]  # BLEU-1 to BLEU-4
total_rouge = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
total_cider = 0

In [11]:
for ref, cand in zip(df1, df2):
    
    ref_tokens = ref.split()
    cand_tokens = cand.split()

    # Calculate BLEU scores
    bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu(ref_tokens, cand_tokens)
    total_bleu[0] += bleu_1
    total_bleu[1] += bleu_2
    total_bleu[2] += bleu_3
    total_bleu[3] += bleu_4

    # Calculate ROUGE scores
    rouge_scores_sample = rouge_scorer_obj.score(ref, cand)
    total_rouge['rouge1'] += rouge_scores_sample['rouge1'].fmeasure
    total_rouge['rouge2'] += rouge_scores_sample['rouge2'].fmeasure
    total_rouge['rougeL'] += rouge_scores_sample['rougeL'].fmeasure

    # Calculate CIDEr scores
    cider_score, _ = cider_scorer.compute_score({0: [ref]}, {0: [cand]})
    total_cider += cider_score

In [12]:
num_samples = len(df1)

# Compute average BLEU, ROUGE, and CIDEr scores
average_bleu = [score / num_samples for score in total_bleu]
average_rouge = {key: score / num_samples for key, score in total_rouge.items()}
average_cider = total_cider / num_samples

print(f"Average BLEU-1: {average_bleu[0]:.4f}, BLEU-2: {average_bleu[1]:.4f}, BLEU-3: {average_bleu[2]:.4f}, BLEU-4: {average_bleu[3]:.4f}")
print(f"Average ROUGE-1: {average_rouge['rouge1']:.4f}, ROUGE-2: {average_rouge['rouge2']:.4f}, ROUGE-L: {average_rouge['rougeL']:.4f}")
print(f"Average CIDEr: {average_cider:.4f}")

Average BLEU-1: 0.1364, BLEU-2: 0.1123, BLEU-3: 0.1033, BLEU-4: 0.0984
Average ROUGE-1: 0.2192, ROUGE-2: 0.1494, ROUGE-L: 0.2106
Average CIDEr: 0.0000
