## Deepseek R1 qwen3-8b

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import pandas as pd

the model used

In [None]:
model_name = "unsloth/DeepSeek-R1-0528-Qwen3-8B-bnb-4bit"
device = "cuda" 
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-0528-Qwen3-8B")
model.to(device)

an example of one translation

In [None]:
prompt = "Marco Cornelio ch'era de' dieci compagni, studiosamente  si riservò di parlare all'ultimo."

messages = [
        {"role": "system", "content": "you are a translator from old italian to model italian. you take a sentence in old italian and you answer only with: La traduzione è:<translation> . Don't add anything else. Translate only in italian"},
        {"role": "system", "content": "only use italian and no other language in the translation"},
        {"role": "user", "content": prompt} 
    ]

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=5000,
    do_sample=True, 
    temperature=0.7,
    top_p=0.9,
)


new_tokens = generated_ids[0][len(model_inputs[0]):]
decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
decoded = decoded.split("La traduzione è:")[-1].strip()
print(decoded)

the code to translate all the database

In [None]:
# Read the CSV file
df = pd.read_csv('dataset_cleaned.csv')

# Create new column for translations
df['Deepseek R1 qwen-8b'] = ''

# Process each row
for idx, row in df.iterrows():
    # Create messages with current prompt
    messages = [
        {"role": "system", "content": "you are a translator from old italian to model italian. you take a sentence in old italian and you answer only with: La traduzione è:<translation> . Don't add anything else. Translate only in italian"},
        {"role": "system", "content": "only use italian and no other language in the translation"},
        {"role": "user", "content": row['Sentence']} 
    ]
    
    # Prepare input
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate translation
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=10000,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # Decode and clean up response
    new_tokens = generated_ids[0][len(model_inputs[0]):]
    decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
    translation = decoded.split("La traduzione è:")[-1].strip()    
    print(translation)
    
    # Store translation
    df.at[idx, 'Deepseek R1 qwen-8b'] = translation

# Save updated dataframe to dataset_deepseek.csv
df.to_csv('./dataset/dataset_deepseek.csv', index=False)


cleaning

In [3]:
#clean the vram memory
torch.cuda.empty_cache()
del model
del tokenizer

## Mistral Mistral-7B-Instruct-v0.3

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import pandas as pd

the model used

In [None]:

device = "cuda" 
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
model.to(device)

one translation

In [None]:
prompt = "la moltitudine de' quali tu ài potuto vedere e riguardare lo studio e poco dinanzi udire le voci, e lle cui mani e lance apena posso ritenere."

messages = [
        {"role": "system", "content": "Sei un traduttore dall'italiano antico all'italiano moderno. Traduci una frase in italiano moderno e rispondi solo con: La traduzione è:<traduzione>. Non aggiungere altro pena la morte. Usa solo l'italiano e nessun'altra lingua nella traduzione."},
        {"role": "user", "content": prompt} 
    ]

tokens = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
attention_mask = torch.ones_like(tokens)

# Move to device
model_inputs1 = tokens.to(device)
attention_mask = attention_mask.to(device)

# Generate with attention mask
generated_ids1 = model.generate(
    model_inputs1, 
    attention_mask=attention_mask,
    max_new_tokens=1000, 
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    temperature=0.7,
    top_p=0.9
)

# Decode only the new tokens (exclude the input)
new_tokens = generated_ids1[0][len(tokens[0]):]
decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
print(decoded)


code for all database

In [None]:
df = pd.read_csv('dataset_cleaned.csv')
df['mistral'] = ''

# Process each row
for idx, row in df.iterrows():
    # Create messages with current prompt
    messages = [
        {"role": "system", "content": "Sei un traduttore dall'italiano antico all'italiano moderno. Traduci una frase in italiano moderno e rispondi solo con: La traduzione è:<traduzione>. Non aggiungere altro pena la morte. Usa solo l'italiano e nessun'altra lingua nella traduzione."},
        {"role": "user", "content": row['Sentence']} 
    ]

    tokens = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
    attention_mask = torch.ones_like(tokens)

    # Move to device
    model_inputs1 = tokens.to(device)
    attention_mask = attention_mask.to(device)

    # Generate with attention mask
    generated_ids1 = model.generate(
        model_inputs1, 
        attention_mask=attention_mask,
        max_new_tokens=1000, 
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        top_p=0.9
    )
    new_tokens = generated_ids1[0][len(tokens[0]):]
    decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
    translation = decoded.split("La traduzione è:")[-1].strip()
    #remove any part between ( and  )
    translation = translation.split('(')[0].strip()
    #remove anything after the \n character
    translation = translation.split('\n')[0].strip() 
    print(translation)
    
    # Store translation
    df.at[idx, 'Mistral 7b-instruction'] = translation

# Save updated dataframe to dataset_deepseek.csv
df.to_csv('./dataset/dataset_mistral.csv', index=False)


In [23]:
#clean the vram memory
torch.cuda.empty_cache()
del model
del tokenizer

## Prometeus

In [2]:
from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import pandas
from tqdm import tqdm
import torch

In [None]:
device = 'cuda'
model_name = "prometheus-eval/prometheus-7b-v2.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16,
                                            offload_folder="offload_prometheus", offload_buffers=True)

In [None]:
dataset_model_mistral = pandas.read_csv('dataset/dataset_mistral.csv')
dataset_model_deepseek = pandas.read_csv('dataset/dataset_deepseek.csv')
dataset_model_qwen = pandas.read_csv('dataset/dataset_qwen.csv')
dataset_golden = pandas.read_csv('dataset/dataset_goldenLabel.csv')

In [13]:
def create_instruction(sentence_to_translate, response, golden_label):
    instruction = f"Translate the following archaic Italian sentence into modern Italian: {sentence_to_translate}",
    response = f"{response}"
    reference_answer = f"{golden_label}",
    
    rubric_data = {
      "criteria": "Archaic to Modern Italian Translation Quality",
      "score1_description": "The translation is not provided, is unintelligible, or has completely lost the essential meaning of the original sentence.",
      "score2_description": "The translation is difficult to understand. It contains major errors (in grammar, logic, or word choice) that significantly obscure or partially change the original meaning.",
      "score3_description": "The translation preserves the core meaning and is understandable, but contains noticeable flaws. The sentence may sound unnatural or awkward due to literal translations of archaic structures, or have minor grammatical errors that don't obscure the meaning.",
      "score4_description": "The translation is grammatically correct, accurately preserves the original meaning, and reads as natural, fluent modern Italian.",
      "score5_description": "The translation meets all criteria for a Score 4 (it is accurate, correct, and fluent) AND it also successfully captures the original author's subtle style, tone, and nuance."
    }
    
    score_rubric = SCORE_RUBRIC_TEMPLATE.format(**rubric_data)
    
    ABS_SYSTEM_PROMPT = "You are a helpful assistant that grades the quality of responses to user instructions. You will be given an instruction, a response, and a rubric. Your task is to assign a score based on the rubric. Your should STRICTLY give ALWAYS the score in this style --> [Score: ]."
    ABSOLUTE_PROMPT = f"Instruction: {{instruction}}\n\nResponse: {{response}}\n\nRubric: {{rubric}}\n\nReference Answer: {{reference_answer}}\n\nFeedback:"
    user_content = ABSOLUTE_PROMPT.format(
        instruction=instruction,
        response=response,
        rubric=score_rubric,
        reference_answer=reference_answer[0]
    )
    
    messages = [
        {"role": "system", "content": ABS_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]

    return messages

def extract_score(text):
    """
    Extract score from feedback text using regex
    """
    pattern = r'\[Score:\s*(\d+)\]'
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return int(match.group(1))
    
    return None

In [11]:
length = len(dataset_golden)
list_of_messages_m = []
list_of_messages_d = []
list_of_messages_q = []
for i in range(0, length):
    sentence_to_translate = dataset_golden["Sentence"][i]
    response_mistral = dataset_model_mistral["Mistral 7b-instruction"][i]
    response_deepseek = dataset_model_deepseek["Deepseek R1 qwen-8b"][i]
    response_qwen = dataset_model_qwen["Qwen3-32b"][i]
    golden_label = dataset_golden["goldenLabel"][i]
    message_mistral = create_instruction(sentence_to_translate, response_mistral, golden_label)
    message_deepseek = create_instruction(sentence_to_translate, response_deepseek, golden_label)
    message_qwen = create_instruction(sentence_to_translate, response_qwen, golden_label)
    list_of_messages_m.append(message_mistral)
    list_of_messages_d.append(message_deepseek)
    list_of_messages_q.append(message_qwen)

### Evaluating Mistral

In [None]:
all_outputs_m = []
all_scores_m = []

print("Starting evaluation for Mistral 7B-Instruct")
model.to(device)
pbar = tqdm(list_of_messages_m, desc="Processing data")

for msg in pbar:
    encodeds = tokenizer.apply_chat_template(msg, return_tensors="pt", return_dict = True)
    model_inputs = encodeds['input_ids'].to(device)
    attention_mask = encodeds['attention_mask'].to(device)
    generated_ids = model.generate(model_inputs, max_new_tokens=500, attention_mask=attention_mask, do_sample=False, pad_token_id=tokenizer.pad_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    all_outputs_m.append(decoded[0])
    output = extract_score(decoded[0])
    all_scores_m.append(output)
    if all_scores_m:
        avg_score = sum(all_scores_m) / len(all_scores_m)
        pbar.set_postfix(last_score=output, avg_score=f'{avg_score:.2f}')
print(all_scores_m)

### Evaluating Deepseek

In [None]:
all_outputs_d = []
all_scores_d = []
model.to(device)

print("Starting evaluation for Deepseek R1 Qwen-8B")
pbar = tqdm(list_of_messages_d, desc="Processing data")
for msg in pbar:
    encodeds = tokenizer.apply_chat_template(msg, return_tensors="pt", return_dict = True)
    model_inputs = encodeds['input_ids'].to(device)
    attention_mask = encodeds['attention_mask'].to(device)
    generated_ids = model.generate(model_inputs, max_new_tokens=500, attention_mask=attention_mask, do_sample=False, pad_token_id=tokenizer.pad_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    all_outputs_d.append(decoded[0])
    output = extract_score(decoded[0])
    all_scores_d.append(output)
    if all_scores_d:
        avg_score = sum(all_scores_d) / len(all_scores_d)
        pbar.set_postfix(last_score=output, avg_score=f'{avg_score:.2f}')
print(all_scores_d)

### Evaluating Qwen

In [None]:
all_outputs_q = []
all_scores_q = []
model.to(device)

print("Starting evaluation for Qwen3")
pbar = tqdm(list_of_messages_q, desc="Processing data")
for msg in pbar:
    encodeds = tokenizer.apply_chat_template(msg, return_tensors="pt", return_dict = True)
    model_inputs = encodeds['input_ids'].to(device)
    attention_mask = encodeds['attention_mask'].to(device)
    generated_ids = model.generate(model_inputs, max_new_tokens=500, attention_mask=attention_mask, do_sample=False, pad_token_id=tokenizer.pad_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    all_outputs_q.append(decoded[0])
    output = extract_score(decoded[0])
    all_scores_q.append(output)
    if all_scores_q:
        avg_score = sum(all_scores_q) / len(all_scores_q)
        pbar.set_postfix(last_score=output, avg_score=f'{avg_score:.2f}')
print(all_scores_q)

### Cleaning cache

In [15]:
import torch
del tokenizer
del model
torch.cuda.empty_cache()

### Save results

#### Mistral

In [None]:
dataset_evaluated_mistral = dataset_model_mistral.copy()
dataset_evaluated_mistral['p_mistral_vote'] = all_scores_m
dataset_evaluated_mistral['golden_label'] = dataset_golden["goldenLabel"]
dataset_evaluated_mistral.info()
# Save the evaluated dataset
dataset_evaluated_mistral.to_csv('./dataset_evaluated_mistral.csv', index=False)

#### Deepseek

In [None]:
dataset_evaluated_deepseek = dataset_model_deepseek.copy()
dataset_evaluated_deepseek['p_deepseek_vote'] = all_scores_d
dataset_evaluated_deepseek['golden_label'] = dataset_golden["goldenLabel"]
dataset_evaluated_deepseek.info()
# Save the evaluated dataset
dataset_evaluated_deepseek.to_csv('./dataset_evaluated_deepseek.csv', index=False)

#### Qwen

In [None]:
dataset_evaluated_qwen = dataset_model_qwen.copy()
dataset_evaluated_qwen['p_qwen_vote'] = all_scores_q
dataset_evaluated_qwen['golden_label'] = dataset_golden["goldenLabel"]
dataset_evaluated_qwen.info()
# Save the evaluated dataset
dataset_evaluated_qwen.to_csv('./dataset_evaluated_qwen.csv', index=False)

## Metrics

In [18]:
import numpy as np
from sklearn.metrics import cohen_kappa_score
import pandas as pd
from scipy.stats import spearmanr

manual = pd.read_csv('dataset/dataset_sub_20_m_eval.csv')
mistral = pd.read_csv('dataset/dataset_evaluated_mistral.csv')
deepseek = pd.read_csv('dataset/dataset_evaluated_deepseek.csv')
qwen = pd.read_csv('dataset/dataset_evaluated_qwen.csv')

# select only the same sentence present in both mistral/deepseek and manual
mistral_sub = mistral[mistral['golden_label'].isin(manual['goldenLabel'])]
deepseek_sub = deepseek[deepseek['golden_label'].isin(manual['goldenLabel'])]
qwen_sub = qwen[qwen['golden_label'].isin(manual['goldenLabel'])]

# order the mistral_sub, deepseek_sub and manual by golden_label
mistral_sub = mistral_sub.sort_values(by='golden_label').reset_index(drop=True)
deepseek_sub = deepseek_sub.sort_values(by='golden_label').reset_index(drop=True)
qwen_sub = qwen_sub.sort_values(by='golden_label').reset_index(drop=True)
manual = manual.sort_values(by='goldenLabel').reset_index(drop=True)

# get the scores
mistral_scores = mistral_sub['p_mistral_vote'].values
deepseek_scores = deepseek_sub['p_deepseek_vote'].values
qwen_scores = qwen_sub['p_qwen_vote'].values
manual_scores_d = manual['deepseek_vote'].values
manual_scores_m = manual['mistral_vote'].values
manual_scores_q = manual['qwen_vote'].values

In [19]:
# Calculate Cohen's Kappa for Mistral
kappa_mistral = cohen_kappa_score(manual_scores_m, mistral_scores)
# Calculate Cohen's Kappa for Deepseek
kappa_deepseek = cohen_kappa_score(manual_scores_d, deepseek_scores)
# Calculate Cohen's Kappa for Qwen
kappa_qwen = cohen_kappa_score(manual_scores_q, qwen_scores)

# Calculate Spearman's correlation for Mistral
spearman_mistral = spearmanr(manual_scores_m, mistral_scores).correlation
# Calculate Spearman's correlation for Deepseek
spearman_deepseek = spearmanr(manual_scores_d, deepseek_scores).correlation
# Calculate Spearman's correlation for Qwen
spearman_qwen = spearmanr(manual_scores_q, qwen_scores).correlation

# Print the results
print(f"Cohen's Kappa for Mistral: {kappa_mistral:.4f}")
print(f"Cohen's Kappa for Deepseek: {kappa_deepseek:.4f}")
print(f"Cohen's Kappa for Qwen: {kappa_qwen:.4f}")
print(f"Spearman's correlation for Mistral: {spearman_mistral:.4f}")
print(f"Spearman's correlation for Deepseek: {spearman_deepseek:.4f}")
print(f"Spearman's correlation for Qwen: {spearman_qwen:.4f}")

Cohen's Kappa for Mistral: 0.0058
Cohen's Kappa for Deepseek: 0.2691
Cohen's Kappa for Qwen: -0.0332
Spearman's correlation for Mistral: 0.2134
Spearman's correlation for Deepseek: 0.1781
Spearman's correlation for Qwen: -0.1307


In [20]:
# calculate the MSE for mistral and deepseek
from sklearn.metrics import mean_squared_error
mse_mistral = mean_squared_error(manual_scores_m, mistral_scores)
mse_deepseek = mean_squared_error(manual_scores_d, deepseek_scores)
mse_qwen = mean_squared_error(manual_scores_q, qwen_scores)
print(f"Mean Squared Error for Mistral: {mse_mistral:.4f}")
print(f"Mean Squared Error for Deepseek: {mse_deepseek:.4f}")
print(f"Mean Squared Error for Qwen: {mse_qwen:.4f}")

Mean Squared Error for Mistral: 2.5500
Mean Squared Error for Deepseek: 1.5500
Mean Squared Error for Qwen: 1.6000


In [4]:
# print all the scores for all the models with the manual scores
print("Manual Scores vs Mistral Scores:")
sum_judge = 0
sum_manual = 0
for i in range(len(manual_scores_m)):
    sum_judge += mistral_scores[i]
    sum_manual += manual_scores_m[i]
    print(f"Manual Score: {manual_scores_m[i]}, Mistral Score: {mistral_scores[i]}")
# print the average score for mistral
print(f"Average Mistral Score: {sum_judge/len(manual_scores_m):.4f}")
print(f"Average manual Score: {sum_manual/len(manual_scores_m):.4f}")

Manual Scores vs Mistral Scores:
Manual Score: 2, Mistral Score: 4
Manual Score: 3, Mistral Score: 3
Manual Score: 3, Mistral Score: 3
Manual Score: 1, Mistral Score: 4
Manual Score: 3, Mistral Score: 3
Manual Score: 2, Mistral Score: 3
Manual Score: 2, Mistral Score: 2
Manual Score: 3, Mistral Score: 3
Manual Score: 2, Mistral Score: 2
Manual Score: 2, Mistral Score: 3
Manual Score: 2, Mistral Score: 3
Manual Score: 2, Mistral Score: 2
Manual Score: 4, Mistral Score: 3
Manual Score: 1, Mistral Score: 3
Manual Score: 2, Mistral Score: 3
Manual Score: 1, Mistral Score: 4
Manual Score: 3, Mistral Score: 3
Manual Score: 2, Mistral Score: 3
Manual Score: 2, Mistral Score: 3
Manual Score: 2, Mistral Score: 4
Average Mistral Score: 3.0500
Average manual Score: 2.2000


In [5]:
print("\n -------- \nManual Scores vs Deepseek Scores:")
sum_judge = 0
sum_manual = 0
for i in range(len(manual_scores_d)):
    sum_judge += deepseek_scores[i]
    sum_manual += manual_scores_d[i]
    print(f"Manual Score: {manual_scores_d[i]}, Deepseek Score: {deepseek_scores[i]}")
print(f"Average Deepseek Score: {sum_judge/len(manual_scores_d):.4f}")
print(f"Average manual Score: {sum_manual/len(manual_scores_d):.4f}")


 -------- 
Manual Scores vs Deepseek Scores:
Manual Score: 4, Deepseek Score: 3
Manual Score: 3, Deepseek Score: 3
Manual Score: 2, Deepseek Score: 3
Manual Score: 2, Deepseek Score: 2
Manual Score: 2, Deepseek Score: 3
Manual Score: 3, Deepseek Score: 3
Manual Score: 1, Deepseek Score: 1
Manual Score: 4, Deepseek Score: 3
Manual Score: 2, Deepseek Score: 4
Manual Score: 3, Deepseek Score: 3
Manual Score: 3, Deepseek Score: 3
Manual Score: 3, Deepseek Score: 4
Manual Score: 3, Deepseek Score: 3
Manual Score: 2, Deepseek Score: 3
Manual Score: 1, Deepseek Score: 1
Manual Score: 3, Deepseek Score: 4
Manual Score: 3, Deepseek Score: 3
Manual Score: 3, Deepseek Score: 3
Manual Score: 3, Deepseek Score: 3
Manual Score: 4, Deepseek Score: 3
Average Deepseek Score: 2.9000
Average manual Score: 2.7000


In [46]:
print(" \n ------- \nManual Scores vs Qwen Scores:")
sum_judge = 0
sum_manual = 0
for i in range(len(manual_scores_q)):
    sum_judge += qwen_scores[i]
    sum_manual += manual_scores_q[i]
    print(f"Manual Score: {manual_scores_q[i]}, Qwen Score: {qwen_scores[i]}")
print(f"Average Qwen Score: {sum_judge/len(manual_scores_q):.4f}")
print(f"Average manual Score: {sum_manual/len(manual_scores_q):.4f}")

 
 ------- 
Manual Scores vs Qwen Scores:
Manual Score: 4, Qwen Score: 4
Manual Score: 3, Qwen Score: 4
Manual Score: 4, Qwen Score: 4
Manual Score: 3, Qwen Score: 3
Manual Score: 4, Qwen Score: 3
Manual Score: 4, Qwen Score: 4
Manual Score: 4, Qwen Score: 2
Manual Score: 5, Qwen Score: 3
Manual Score: 2, Qwen Score: 4
Manual Score: 3, Qwen Score: 3
Manual Score: 4, Qwen Score: 3
Manual Score: 3, Qwen Score: 3
Manual Score: 5, Qwen Score: 3
Manual Score: 3, Qwen Score: 3
Manual Score: 3, Qwen Score: 3
Manual Score: 2, Qwen Score: 4
Manual Score: 4, Qwen Score: 4
Manual Score: 3, Qwen Score: 5
Manual Score: 5, Qwen Score: 3
Manual Score: 3, Qwen Score: 4
Average Qwen Score: 3.4500
Average manual Score: 3.5500


In [47]:
mistral.describe()

Unnamed: 0,mistral,p_mistral_vote
count,0.0,97.0
mean,,3.175258
std,,0.645664
min,,2.0
25%,,3.0
50%,,3.0
75%,,4.0
max,,4.0


In [48]:
deepseek.describe()

Unnamed: 0,p_deepseek_vote
count,97.0
mean,3.226804
std,0.71451
min,1.0
25%,3.0
50%,3.0
75%,4.0
max,4.0


In [49]:
qwen.describe()

Unnamed: 0,p_qwen_vote
count,97.0
mean,3.391753
std,0.884551
min,1.0
25%,3.0
50%,3.0
75%,4.0
max,5.0
