In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from typing import Union
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer


class SemanticSimilarity:
    """Cosine similarity between CodeT5+ embeddings of two texts.

    The tokenizer and model for the ``Salesforce/codet5p-110m-embedding``
    checkpoint are loaded once in the constructor and reused for every
    :meth:`evaluate` call.
    """

    def __init__(self, device: str = 'cpu'):
        """Load the tokenizer and the embedding model.

        Args:
            device: Torch device string the model and its inputs are moved
                to. Defaults to ``'cpu'``; pass e.g. ``'cuda'`` to run on GPU.
        """
        self.device = device
        self.checkpoint_110 = 'Salesforce/codet5p-110m-embedding'
        # NOTE(review): trust_remote_code=True lets the checkpoint repository
        # supply Python code that is executed locally; keep the checkpoint
        # name fixed to a trusted source.
        self.tokenizer_110 = AutoTokenizer.from_pretrained(
            self.checkpoint_110, trust_remote_code=True
        )
        self.model_110 = AutoModel.from_pretrained(
            self.checkpoint_110, trust_remote_code=True
        ).to(self.device)

    def _codet5_110_encode(self, comment):
        """Return the embedding of ``comment`` as a NumPy array.

        The text is lower-cased before tokenization, and the first element
        of the model output is taken as the embedding.
        """
        with torch.no_grad():  # inference only, no autograd bookkeeping
            inputs = self.tokenizer_110.encode(
                comment.lower(), return_tensors='pt'
            ).to(self.device)
            embed = self.model_110(inputs)[0]
        return embed.cpu().detach().numpy()

    def evaluate(self, orig: str, pred: str) -> float:
        """Return the cosine similarity between the embeddings of two texts.

        Args:
            orig: Reference text.
            pred: Predicted text to compare against ``orig``.

        Returns:
            The similarity as a plain Python ``float``.
        """
        similarity = cosine_similarity(
            [self._codet5_110_encode(orig)], [self._codet5_110_encode(pred)]
        )
        # cosine_similarity returns a 1x1 matrix here; unwrap it and convert
        # the NumPy scalar so the declared return type holds.
        return float(similarity[0][0])
    
def calc_test_score(
    train_sample: Union[str, list[str]],
    test_sample: list[Union[str, list[str]]],
) -> float:
    """Smoothed sentence-level BLEU of ``test_sample`` against ``train_sample``.

    Args:
        train_sample: Reference sequence (used as the single BLEU reference).
        test_sample: Hypothesis sequence to score.

    Returns:
        ``1.0`` when both samples compare equal, ``0.0`` when either one is
        empty, otherwise the BLEU score with uniform n-gram weights.
    """
    # Equality is checked first, so two empty samples score 1.0.
    if train_sample == test_sample:
        return 1.0
    if not (train_sample and test_sample):
        return 0.0

    # BLEU-4 by default; when even the longer sample has fewer than four
    # elements the n-gram order drops to that length (BLEU-1/2/3).
    longest = max(len(train_sample), len(test_sample))
    order = min(4, longest)
    uniform_weights = [1 / order for _ in range(order)]

    return sentence_bleu(
        [train_sample],
        test_sample,
        weights=uniform_weights,
        smoothing_function=SmoothingFunction().method1,
    )

# Column order of every score row produced below.
METRIC_COLUMNS = ['BLEU1', 'BLEU4', 'METEOR', 'ROUGE', 'CodeT5']

# Unigram-only weights: turns sentence_bleu into BLEU-1.
weights_1 = (1.0, 0.0, 0.0, 0.0)

chencherry1 = SmoothingFunction().method2
ss = SemanticSimilarity()
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


def _score_answer(reference: str, answer: str) -> list:
    """Score one generated answer against its reference line.

    Args:
        reference: Raw reference line as read from the answers file.
        answer: Raw generated line as read from a model output file.

    Returns:
        ``[BLEU-1, BLEU-4, METEOR, ROUGE-L F-measure, CodeT5 cosine]`` in the
        order of ``METRIC_COLUMNS``.
    """
    reference_tokens = reference.split()
    answer_tokens = answer.split()
    # NOTE(review): `reference` and `answer` keep their line terminator when
    # passed to ROUGE and the embedding model; confirm whether they should be
    # stripped first.
    return [
        sentence_bleu(
            [reference_tokens],
            answer_tokens,
            weights=weights_1,
            smoothing_function=chencherry1,
        ),
        calc_test_score(reference_tokens, answer_tokens),
        meteor_score([reference_tokens], answer_tokens),
        scorer.score(reference, answer)['rougeL'].fmeasure,
        ss.evaluate(reference, answer),
    ]


# Collect rows in plain lists and build each frame once; enlarging a
# DataFrame with `.loc[len(df)]` re-allocates it on every row.
qwen_rows, qwen_rag_rows = [], []
with open('qwen.txt', 'r') as f1, open('qwen_rag.txt', 'r') as f2, open('../db/test/test.answer', 'r') as f3:
    # zip stops at the shortest file, so the three files are expected to be
    # line-aligned.
    for q, rag, orig in zip(f1, f2, f3):
        qwen_rows.append(_score_answer(orig, q))
        qwen_rag_rows.append(_score_answer(orig, rag))

qwen = pd.DataFrame(qwen_rows, columns=METRIC_COLUMNS)
qwen_rag = pd.DataFrame(qwen_rag_rows, columns=METRIC_COLUMNS)
      

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
# Print a header wrapped in ANSI escape codes (Russian for "Evaluation of
# model answers with historical documents:"), then show the `qwen_rag` frame
# as the cell output.
print("\033[1;34mОценка ответов модели с историческими документами:\033[0m")
qwen_rag

[1;34mОценка ответов модели с историческими документами:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
0,0.333333,0.118563,0.166667,0.333333,0.514133
1,1.000000,1.000000,0.500000,1.000000,1.000000
2,0.214879,0.053872,0.300654,0.461538,0.612273
3,0.333333,0.118563,0.416667,0.500000,0.691915
4,0.000000,0.000000,0.000000,0.000000,0.443645
...,...,...,...,...,...
6995,0.000000,0.000000,0.000000,0.000000,0.474705
6996,0.071626,0.023019,0.117647,0.166667,0.500418
6997,0.454898,0.114046,0.440613,0.600000,0.719742
6998,0.000000,0.000000,0.000000,0.000000,0.356952


In [28]:
# Print a header wrapped in ANSI escape codes (Russian for "Evaluation of
# model answers without historical documents:"), then show the `qwen` frame
# as the cell output.
print("\033[1;34mОценка ответов модели без исторических документов:\033[0m")
qwen


[1;34mОценка ответов модели без исторических документов:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
0,0.285714,0.039281,0.294118,0.400000,0.393607
1,0.000000,0.000000,0.000000,0.000000,0.240918
2,0.214708,0.029519,0.113636,0.250000,0.343694
3,0.000000,0.000000,0.000000,0.000000,0.514488
4,0.000000,0.000000,0.000000,0.000000,0.476445
...,...,...,...,...,...
6995,0.000000,0.000000,0.000000,0.000000,0.331294
6996,0.101088,0.024762,0.057471,0.142857,0.388581
6997,0.333333,0.086334,0.312500,0.500000,0.713088
6998,0.000000,0.000000,0.000000,0.222222,0.424041


In [29]:
# Print a header wrapped in ANSI escape codes (Russian for "Statistics of
# model scores with historical data:"), then show summary statistics of the
# `qwen_rag` frame.
print("\033[1;34mСтатистика оценок модели с историческими данными:\033[0m")
qwen_rag.describe()

[1;34mСтатистика оценок модели с историческими данными:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
count,7000.0,7000.0,7000.0,7000.0,7000.0
mean,0.206527,0.096941,0.217019,0.29976,0.587109
std,0.260934,0.215348,0.258277,0.280018,0.208894
min,0.0,0.0,0.0,0.0,-0.035789
25%,0.0,0.0,0.0,0.0,0.425908
50%,0.125,0.029556,0.12987,0.25,0.582511
75%,0.327492,0.080343,0.3125,0.444444,0.740054
max,1.0,1.0,0.999878,1.0,1.0


In [30]:
# Print a header wrapped in ANSI escape codes (Russian for "Statistics of
# model scores without historical data:"), then show summary statistics of
# the `qwen` frame.
print("\033[1;34mСтатистика оценок модели без исторических данных:\033[0m")
qwen.describe()

[1;34mСтатистика оценок модели без исторических данных:[0m


Unnamed: 0,BLEU1,BLEU4,METEOR,ROUGE,CodeT5
count,7000.0,7000.0,7000.0,7000.0,7000.0
mean,0.077103,0.021449,0.097656,0.180907,0.457995
std,0.117983,0.037559,0.142474,0.18414,0.172687
min,0.0,0.0,0.0,0.0,-0.077598
25%,0.0,0.0,0.0,0.0,0.326716
50%,0.0,0.0,0.048077,0.181818,0.451516
75%,0.163746,0.040825,0.151515,0.285714,0.590148
max,1.0,1.0,0.949821,1.0,1.0


In [11]:
import pandas as pd

# Denominator used to average the per-sample scores.
# NOTE(review): presumably the number of rated samples per file; confirm it
# matches the line count of the score files.
N_HUMAN_SAMPLES = 108

# Running totals, one per score file (same order as the `open` calls below).
sum1, sum2, sum3, sum4 = 0, 0, 0, 0
with open("qwen_coder/human_new.txt") as f1, open("qwen_coder/human_rag_new.txt") as f2, open("qwen/human_new.txt") as f3, open("qwen/human_rag_new.txt") as f4:
    # Each line holds one numeric score; zip stops at the shortest file.
    for s1, s2, s3, s4 in zip(f1, f2, f3, f4):
        sum1 += float(s1.strip())
        sum2 += float(s2.strip())
        sum3 += float(s3.strip())
        sum4 += float(s4.strip())

# One-row frame of mean scores. The fourth column comes from
# qwen/human_rag_new.txt, so it is labelled 'qwen_rag' (it previously
# duplicated the 'qwen_coder' label).
results_qwen_coder = pd.DataFrame(
    [[
        sum1 / N_HUMAN_SAMPLES,
        sum2 / N_HUMAN_SAMPLES,
        sum3 / N_HUMAN_SAMPLES,
        sum4 / N_HUMAN_SAMPLES,
    ]],
    columns=['qwen_coder', 'qwen_coder_rag', 'qwen', 'qwen_rag'],
)
results_qwen_coder

Unnamed: 0,qwen_coder,qwen_coder_rag,qwen,qwen_coder.1
0,0.314815,0.49537,0.337963,0.527778


In [9]:
# Average number of whitespace-separated tokens per line of the training
# code file.
with open('db/train/train.code') as f:
    # One token count per line; split() already ignores surrounding whitespace.
    words_per_line = [len(line.split()) for line in f]

total_lines = len(words_per_line)
total_words = sum(words_per_line)

total_words / total_lines


49.85065525541589