In [2]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from typing import Union
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer


class SemanticSimilarity:
    """Cosine similarity between embeddings of two texts.

    Embeddings come from the ``Salesforce/codet5p-110m-embedding`` checkpoint,
    loaded once in ``__init__`` and reused for every ``evaluate`` call.
    """

    def __init__(self, device: str = 'cpu'):
        """Load the tokenizer and the model and move the model to ``device``.

        Args:
            device: torch device string used for the model and for the
                tokenized inputs, e.g. ``'cpu'`` (default) or ``'cuda'``.
        """
        self.device = device
        self.checkpoint_110 = 'Salesforce/codet5p-110m-embedding'
        # trust_remote_code=True lets transformers run the model code shipped
        # with the checkpoint repository; only use it with trusted checkpoints.
        self.tokenizer_110 = AutoTokenizer.from_pretrained(
            self.checkpoint_110, trust_remote_code=True
        )
        self.model_110 = AutoModel.from_pretrained(
            self.checkpoint_110, trust_remote_code=True
        ).to(self.device)

    def _codet5_110_encode(self, comment):
        """Embed the lower-cased ``comment`` and return it as a NumPy array.

        Args:
            comment: text to embed; it is lower-cased before tokenization.

        Returns:
            The first element of the model output, moved to the CPU and
            converted to a NumPy array.
        """
        # Inference only: no autograd graph is needed for the embedding.
        with torch.no_grad():
            inputs = self.tokenizer_110.encode(
                comment.lower(), return_tensors='pt'
            ).to(self.device)
            # NOTE(review): assumes the model returns one embedding per input
            # and [0] selects the vector of the single encoded text - confirm.
            embed = self.model_110(inputs)[0]
            return embed.cpu().detach().numpy()

    def evaluate(self, orig: str, pred: str) -> float:
        """Return the cosine similarity of the embeddings of ``orig`` and ``pred``.

        Args:
            orig: reference text.
            pred: predicted text compared against ``orig``.

        Returns:
            The similarity as a built-in ``float``.
        """
        similarity = cosine_similarity(
            [self._codet5_110_encode(orig)], [self._codet5_110_encode(pred)]
        )[0][0]
        # cosine_similarity yields a NumPy scalar; convert it so the return
        # value matches the annotated type.
        return float(similarity)
    
def calc_test_score(
    train_sample: Union[str, list[str]],
    test_sample: list[Union[str, list[str]]],
) -> float:
    """Score ``test_sample`` against ``train_sample`` with smoothed sentence BLEU.

    ``train_sample`` is handed to ``sentence_bleu`` as the only reference and
    ``test_sample`` as the hypothesis.

    Args:
        train_sample: reference sample.
        test_sample: sample that is scored against the reference.

    Returns:
        ``1.0`` when the two samples compare equal, ``0.0`` when either of
        them is empty, otherwise the BLEU value returned by ``sentence_bleu``.
    """
    if train_sample == test_sample:
        return 1.0
    if not (train_sample and test_sample):
        return 0.0

    # BLEU-4 is the default; when both samples are shorter than four items the
    # n-gram order drops to the length of the longer one (BLEU-1, 2 or 3).
    longest = max(len(train_sample), len(test_sample))
    order = 4 if longest >= 4 else longest
    uniform_weights = [1 / order for _ in range(order)]

    return sentence_bleu(
        [train_sample],
        test_sample,
        weights=uniform_weights,
        smoothing_function=SmoothingFunction().method1,
    )

# Column order of both result tables; score_answer() returns values in this order.
METRIC_COLUMNS = ['BLEU1', 'BLEU4', 'METEOR', 'ROUGE', 'CodeT5']

# Input files hold one answer per line and are read in lockstep, so line i of
# every file is treated as belonging to the same test question.
QWEN_ANSWERS_PATH = 'qwen.txt'
QWEN_RAG_ANSWERS_PATH = 'qwen_rag.txt'
REFERENCE_ANSWERS_PATH = '/home/alina/QA_rag/db/test/test.answer'

# Unigram-only weights: all BLEU weight is put on 1-gram precision (BLEU-1).
weights_1 = (1.0, 0.0, 0.0, 0.0)


chencherry1 = SmoothingFunction().method2
ss = SemanticSimilarity()
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


def score_answer(reference: str, candidate: str) -> list:
    """Compute all metrics for one candidate answer against its reference.

    Args:
        reference: raw reference answer line.
        candidate: raw model answer line.

    Returns:
        Scores ordered as in ``METRIC_COLUMNS``: BLEU-1, the BLEU value from
        ``calc_test_score``, METEOR, ROUGE-L F-measure and the CodeT5
        embedding cosine similarity.
    """
    # Token-based metrics work on whitespace tokens; ROUGE and the embedding
    # similarity receive the raw lines.
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return [
        sentence_bleu(
            [reference_tokens],
            candidate_tokens,
            weights=weights_1,
            smoothing_function=chencherry1,
        ),
        calc_test_score(reference_tokens, candidate_tokens),
        meteor_score([reference_tokens], candidate_tokens),
        scorer.score(reference, candidate)['rougeL'].fmeasure,
        # Cast so the column does not depend on a NumPy scalar type.
        float(ss.evaluate(reference, candidate)),
    ]


# Collect rows in plain lists and build each DataFrame once, instead of
# growing the frames one row at a time inside the loop.
qwen_rows = []
qwen_rag_rows = []

# Explicit UTF-8 keeps decoding independent of the platform's default locale.
with open(QWEN_ANSWERS_PATH, 'r', encoding='utf-8') as f1, \
        open(QWEN_RAG_ANSWERS_PATH, 'r', encoding='utf-8') as f2, \
        open(REFERENCE_ANSWERS_PATH, 'r', encoding='utf-8') as f3:
    # zip stops at the shortest file, so only lines present in all three
    # files are scored.
    for q, rag, orig in zip(f1, f2, f3):
        qwen_rows.append(score_answer(orig, q))
        qwen_rag_rows.append(score_answer(orig, rag))

qwen = pd.DataFrame(qwen_rows, columns=METRIC_COLUMNS)
qwen_rag = pd.DataFrame(qwen_rag_rows, columns=METRIC_COLUMNS)
      

2025-04-01 15:14:37.775309: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743509677.829839   31306 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743509677.846143   31306 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-01 15:14:37.968632: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Header (Russian): "Evaluation of the model's answers with historical documents:".
# The ANSI escape codes around it switch the terminal style to bold blue and
# reset it afterwards.
print(
    "\033[1;34m"
    "Оценка ответов модели с историческими документами:"
    "\033[0m"
)
qwen_rag

[1;34mОценка ответов модели с историческими документами:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
0,0.333333,0.118563,0.166667,0.333333,0.514133
1,1.000000,1.000000,0.500000,1.000000,1.000000
2,0.214879,0.053872,0.300654,0.461538,0.612273
3,0.333333,0.118563,0.416667,0.500000,0.691915
4,0.000000,0.000000,0.000000,0.000000,0.443645
...,...,...,...,...,...
6995,0.000000,0.000000,0.000000,0.000000,0.474705
6996,0.071626,0.023019,0.117647,0.166667,0.500418
6997,0.454898,0.114046,0.440613,0.600000,0.719742
6998,0.000000,0.000000,0.000000,0.000000,0.356952


In [4]:
# Header (Russian): "Evaluation of the model's answers without historical documents:".
# The ANSI escape codes around it switch the terminal style to bold blue and
# reset it afterwards.
print(
    "\033[1;34m"
    "Оценка ответов модели без исторических документов:"
    "\033[0m"
)
qwen


[1;34mОценка ответов модели без исторических документов:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
0,0.222222,0.028518,0.277778,0.307692,0.337235
1,0.250000,0.080343,0.384615,0.400000,0.631930
2,0.330936,0.057368,0.287141,0.352941,0.435940
3,0.000000,0.000000,0.000000,0.000000,0.540546
4,0.000000,0.000000,0.000000,0.000000,0.340235
...,...,...,...,...,...
6995,0.000000,0.000000,0.000000,0.000000,0.347228
6996,0.214708,0.052493,0.213068,0.266667,0.394732
6997,0.285714,0.069853,0.307377,0.461538,0.679218
6998,0.000000,0.000000,0.000000,0.133333,0.425517


In [5]:
# Header (Russian): "Statistics of the model's scores with historical data:".
# The ANSI escape codes around it switch the terminal style to bold blue and
# reset it afterwards.
print(
    "\033[1;34m"
    "Статистика оценок модели с историческими данными:"
    "\033[0m"
)
qwen_rag.describe()

[1;34mСтатистика оценок модели с историческими данными:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
count,7000.0,7000.0,7000.0,7000.0,7000.0
mean,0.206527,0.096941,0.217019,0.29976,0.587109
std,0.260934,0.215348,0.258277,0.280018,0.208894
min,0.0,0.0,0.0,0.0,-0.035789
25%,0.0,0.0,0.0,0.0,0.425908
50%,0.125,0.029556,0.12987,0.25,0.582511
75%,0.327492,0.080343,0.3125,0.444444,0.740054
max,1.0,1.0,0.999878,1.0,1.0


In [6]:
# Header (Russian): "Statistics of the model's scores without historical data:".
# The ANSI escape codes around it switch the terminal style to bold blue and
# reset it afterwards.
print(
    "\033[1;34m"
    "Статистика оценок модели без исторических данных:"
    "\033[0m"
)
qwen.describe()

[1;34mСтатистика оценок модели без исторических данных:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
count,7000.0,7000.0,7000.0,7000.0,7000.0
mean,0.087958,0.020788,0.118932,0.174506,0.459098
std,0.116527,0.041311,0.15491,0.165689,0.170113
min,0.0,0.0,0.0,0.0,-0.012936
25%,0.0,0.0,0.0,0.0,0.331959
50%,0.0,0.0,0.080645,0.166667,0.456088
75%,0.161348,0.033032,0.16129,0.266667,0.586323
max,1.0,1.0,0.920139,1.0,1.0


In [14]:
# Per-sample difference in CodeT5 similarity (answers with historical
# documents minus answers without), largest improvement first.
(qwen_rag['CodeT5'] - qwen['CodeT5']).sort_values(ascending=False)

5638    0.929009
1708    0.891151
4436    0.872721
873     0.855174
2597    0.851788
          ...   
1863   -0.459093
4189   -0.480042
5947   -0.481264
2548   -0.562442
4759   -0.588028
Name: CodeT5, Length: 7000, dtype: float64

In [15]:
# Save the per-sample scores of the answers generated without historical documents.
qwen.to_csv(path_or_buf='qwen.csv')

In [16]:
# Save the per-sample scores of the answers generated with historical documents.
qwen_rag.to_csv(path_or_buf='qwen_rag.csv')