In [1]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import json
import warnings
warnings.filterwarnings('ignore')
import openai
from dotenv import load_dotenv
import re

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Instantiate the cross-encoder identified by 'cross-encoder/stsb-roberta-large';
# later cells reuse it through the global `model`.
model = CrossEncoder('cross-encoder/stsb-roberta-large')

# Smoke test: score two placeholder sentence pairs with the freshly built model.
scores = model.predict(
    [
        ('Sentence 1', 'Sentence 2'),
        ('Sentence 3', 'Sentence 4'),
    ]
)

In [31]:
# Average of the current `scores` array, shown as a plain Python float.
float(np.mean(scores))

0.2989317774772644

In [32]:
# Load the evaluation table; the path is relative to the notebook's working directory.
df = pd.read_csv('../cleaning_before_eval/ready_to_eval.csv')
# Print column names, non-null counts and dtypes as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   No                            45 non-null     float64
 1   Question                      45 non-null     object 
 2   Dr Answer                     45 non-null     object 
 3   Claude Answer                 45 non-null     object 
 4   Qwen Answer                   45 non-null     object 
 5   GPT Answer                    45 non-null     object 
 6   Deepseek RAG Answer           45 non-null     object 
 7   Deepseek non RAG Answer       45 non-null     object 
 8   Claude Full Answer            45 non-null     object 
 9   Qwen Full Answer              45 non-null     object 
 10  GPT Full Answer               45 non-null     object 
 11  Deepseek RAG Full Answer      45 non-null     object 
 12  Deepseek non RAG Full Answer  45 non-null     object 
dtypes: floa

In [33]:
# Whole-answer baseline: pair each "Dr Answer" with the "Claude Answer" of the
# same row (as plain tuples) and score every pair with the cross-encoder.
pairs = list(df[["Dr Answer", "Claude Answer"]].itertuples(index=False, name=None))
scores = model.predict(pairs)
scores

array([0.40268138, 0.6366263 , 0.13310033, 0.5421202 , 0.57738245,
       0.51936626, 0.45530125, 0.5033681 , 0.61890996, 0.5909227 ,
       0.5692893 , 0.6702422 , 0.69856787, 0.5470521 , 0.5225401 ,
       0.4835015 , 0.5657898 , 0.54025793, 0.52245015, 0.34361047,
       0.5702888 , 0.38537365, 0.52368283, 0.6299555 , 0.65865874,
       0.51854783, 0.69758904, 0.7292401 , 0.7960828 , 0.7089312 ,
       0.6012783 , 0.5151739 , 0.5331848 , 0.48950094, 0.53025293,
       0.57143956, 0.7225585 , 0.6597639 , 0.5142269 , 0.55042654,
       0.6901827 , 0.6509341 , 0.58385205, 0.6790853 , 0.5700344 ],
      dtype=float32)

In [34]:
# Load the abbreviation -> canonical-term dictionary consumed by normalize_text.
# The encoding is pinned to UTF-8 so that decoding the JSON file does not depend
# on the platform's locale encoding (the default when `encoding` is omitted).
with open('kamus.json', 'r', encoding='utf-8') as file:
    SYNONYM_MAP = json.load(file)
    
def normalize_text(text, synonym_map=None):
    """Normalize one answer item and expand at most one known abbreviation.

    Steps:
      1. Lower-case the text.
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Collapse whitespace runs to a single space and strip the ends.
      4. Walk the synonym map in insertion order and replace the first
         abbreviation that occurs as a whole word with its canonical form.
         The walk stops immediately if the text already equals a canonical
         form, and after the first successful replacement.

    Parameters
    ----------
    text : str
        Raw answer item.
    synonym_map : dict[str, str] | None
        Mapping of abbreviation -> canonical term. When None (the default)
        the module-level SYNONYM_MAP is used.

    Returns
    -------
    str
        The normalized text.

    Notes
    -----
    Step 2 runs before the abbreviation lookup, so an abbreviation key that
    contains digits or punctuation can never match the cleaned text.
    NOTE(review): confirm whether such keys exist in the dictionary and
    whether expanding only the first matching abbreviation is intended.
    """
    mapping = SYNONYM_MAP if synonym_map is None else synonym_map

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    for abbr, canonical in mapping.items():
        abb = abbr.lower()
        canon = canonical.lower()

        # Already in canonical form: nothing left to expand.
        if text == canon:
            break

        # An empty key would build the pattern r'\b\b', which matches at every
        # word boundary and would splice the canonical text into each word.
        if not abb:
            continue

        pattern = rf'\b{re.escape(abb)}\b'
        # A callable replacement inserts `canon` literally; passing it as a
        # string would make re interpret backslash escapes / group references.
        replaced, hits = re.subn(pattern, lambda _match: canon, text)
        if hits:
            text = replaced
            break

    return text

In [37]:
def _split_answer(raw, clean):
    """Split a comma-separated answer into normalized, non-empty items."""
    # Missing / non-text cells (e.g. NaN) contribute no items instead of crashing.
    if not isinstance(raw, str):
        return []
    cleaned = (clean(part) for part in raw.split(','))
    return [item for item in cleaned if item]


def process_row(row, scorer=None, normalize=None):
    """Score each model's answer against the doctor's answer for one row.

    Both answers are split on commas and every item is normalized. For each
    ground-truth item the best cross-encoder score over all of the model's
    items is taken, and those best scores are averaged. The metric is
    one-directional: extra items in the model answer are not penalized.

    Parameters
    ----------
    row : pandas.Series
        One row holding 'Dr Answer' and the five model answer columns.
    scorer : callable | None
        Function taking a list of (truth, answer) string pairs and returning
        one score per pair, in order. Defaults to the global `model.predict`.
    normalize : callable | None
        Function normalizing a single item. Defaults to `normalize_text`.

    Returns
    -------
    pandas.Series
        A copy of `row` with one '<result>_cross_score' entry per model:
        the averaged best-match score, 0.0 when the model answer has no
        usable items, or NaN when the ground truth has no usable items.
    """
    predict = model.predict if scorer is None else scorer
    clean = normalize_text if normalize is None else normalize

    print(f"Processing row {row.name}...")

    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so all new entries are written to a copy.
    row = row.copy()

    truth_list = _split_answer(row['Dr Answer'], clean)
    model_cols = {
        'Claude Answer': 'SAS_Claude',
        'Qwen Answer': 'SAS_Qwen',
        'GPT Answer': 'SAS_GPT',
        'Deepseek RAG Answer': 'SAS_Deepseek_RAG',
        'Deepseek non RAG Answer': 'SAS_Deepseek_nonRAG'
    }

    for col_name, result_col in model_cols.items():
        score_key = f'{result_col}_cross_score'
        model_list = _split_answer(row[col_name], clean)

        if not truth_list:
            # Nothing to compare against: the score is undefined.
            row[score_key] = float('nan')
            continue
        if not model_list:
            # The model produced no usable item: no ground-truth item is matched.
            row[score_key] = 0.0
            continue

        pairs = [(gt, stu) for gt in truth_list for stu in model_list]
        scores = list(predict(pairs))

        # `pairs` is ordered truth-major, so the scores of the i-th truth item
        # are the i-th consecutive chunk of len(model_list) entries.
        width = len(model_list)
        best_scores = [
            max(scores[start:start + width])
            for start in range(0, len(scores), width)
        ]

        row[score_key] = float(sum(best_scores) / len(best_scores))

    return row

In [38]:
# Run process_row on every row; the returned rows carry the extra
# *_cross_score entries, which become new columns of `df`.
df = df.apply(process_row, axis='columns')

Processing row 0...
Processing row 1...
Processing row 2...
Processing row 3...
Processing row 4...
Processing row 5...
Processing row 6...
Processing row 7...
Processing row 8...
Processing row 9...
Processing row 10...
Processing row 11...
Processing row 12...
Processing row 13...
Processing row 14...
Processing row 15...
Processing row 16...
Processing row 17...
Processing row 18...
Processing row 19...
Processing row 20...
Processing row 21...
Processing row 22...
Processing row 23...
Processing row 24...
Processing row 25...
Processing row 26...
Processing row 27...
Processing row 28...
Processing row 29...
Processing row 30...
Processing row 31...
Processing row 32...
Processing row 33...
Processing row 34...
Processing row 35...
Processing row 36...
Processing row 37...
Processing row 38...
Processing row 39...
Processing row 40...
Processing row 41...
Processing row 42...
Processing row 43...
Processing row 44...


In [40]:
# Summary statistics of the numeric columns, including the per-model
# *_cross_score columns written by process_row.
df.describe()

Unnamed: 0,No,SAS_Claude_cross_score,SAS_Qwen_cross_score,SAS_GPT_cross_score,SAS_Deepseek_RAG_cross_score,SAS_Deepseek_nonRAG_cross_score
count,45.0,45.0,45.0,45.0,45.0,45.0
mean,23.0,0.962427,0.493652,0.688204,0.653058,0.413882
std,13.133926,0.019857,0.296511,0.286107,0.255227,0.249453
min,1.0,0.888037,0.01024,0.010054,0.009949,0.009609
25%,12.0,0.965232,0.271071,0.490747,0.527456,0.222673
50%,23.0,0.96901,0.490011,0.766854,0.648549,0.386009
75%,34.0,0.970355,0.770407,0.964862,0.836693,0.60074
max,45.0,0.971509,0.970979,0.970979,0.969833,0.970073
