## Import Libraries

In [28]:
import json
from bert_score import score
from sentence_transformers import SentenceTransformer, util

## Fine-Tuning Model Evaluation

### With Accuracy

In [26]:
def compute_bertscore(system_response, reference_responses):
    """Return the mean BERTScore precision, recall and F1 for one response.

    The system response is paired with each reference, the pairs are scored
    with ``score(..., lang="en")``, and every metric is averaged over the
    pairs.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: A single reference string, or a list of
            references (list elements are converted with ``str``).

    Returns:
        A ``(precision, recall, f1)`` tuple of scalars.
    """
    # Normalise the references: a bare string becomes a one-element list,
    # and list elements are forced to strings.
    if isinstance(reference_responses, str):
        references = [reference_responses]
    elif isinstance(reference_responses, list):
        references = list(map(str, reference_responses))
    else:
        references = reference_responses

    # Repeat the candidate so it lines up one-to-one with the references.
    candidates = [system_response] * len(references)
    precision, recall, f1 = score(candidates, references, lang="en", verbose=False)

    # Collapse each per-pair metric to its mean.
    return tuple(metric.mean().item() for metric in (precision, recall, f1))


# Compute cosine similarity with Sentence-BERT embeddings.
def compute_cosine_similarity(system_response, reference_responses, model):
    """Return the cosine similarity between a response and its references.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: A single reference string, or a list of
            references (list elements are converted with ``str``).
        model: Object whose ``encode(..., convert_to_tensor=True)`` method
            produces the embeddings.

    Returns:
        The value of ``util.cos_sim`` applied to the response embedding and
        the reference embeddings.
    """
    # Normalise the references into a list of strings.
    if isinstance(reference_responses, str):
        references = [reference_responses]
    elif isinstance(reference_responses, list):
        references = [str(entry) for entry in reference_responses]
    else:
        references = reference_responses

    response_vector = model.encode(system_response, convert_to_tensor=True)
    reference_vectors = model.encode(references, convert_to_tensor=True)
    return util.cos_sim(response_vector, reference_vectors)

# Compute a threshold-based "BERT accuracy".
def compute_bert_accuracy(system_response, reference_responses, threshold=0.7):
    """Return the share of response/reference pairs whose F1 meets a threshold.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: A single reference string, or a list of
            references (list elements are converted with ``str``).
        threshold: Minimum BERTScore F1 (inclusive) for a pair to count as
            a match.

    Returns:
        The mean of the 0/1 match indicators, as a scalar.
    """
    # Normalise the references into a list of strings.
    if isinstance(reference_responses, str):
        references = [reference_responses]
    elif isinstance(reference_responses, list):
        references = [str(entry) for entry in reference_responses]
    else:
        references = reference_responses

    # Only the F1 component is needed for the threshold test.
    candidates = [system_response] * len(references)
    _, _, f1 = score(candidates, references, lang="en", verbose=False)

    # 1.0 where the pair reaches the threshold, 0.0 otherwise; then average.
    hits = (f1 >= threshold).float()
    return hits.mean().item()

# Load the evaluation records from the JSON file.
with open('fine_tuning_evals.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Load the Sentence-BERT model used for the cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')


def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    return sum(values) / len(values) if values else float("nan")


# Per-item results, index-aligned with ``data``.
bertscore_results = []
cosine_results = []
accuracy_results = []

# Evaluate every record in the dataset.
for item in data:
    system_response = str(item["system_response"])  # make sure the candidate is a string
    reference_responses = item["reference_responses"]  # normalised inside the helpers

    # BERTScore (precision, recall, F1).
    bert_precision, bert_recall, bert_f1 = compute_bertscore(system_response, reference_responses)
    bertscore_results.append((bert_precision, bert_recall, bert_f1))

    # Cosine similarity against every reference.
    cosine_scores = compute_cosine_similarity(system_response, reference_responses, model)
    cosine_results.append(cosine_scores)

    # Threshold-based BERT accuracy.
    accuracy = compute_bert_accuracy(system_response, reference_responses)
    accuracy_results.append(accuracy)

# Print BERTScore, BERT accuracy and cosine similarity for every item.
for i, item in enumerate(data):
    print(f"Item {i+1} - User Query: {item['user_query']}")
    print(f"System Response: {item['system_response']}")
    print(f"Reference Responses: {item['reference_responses']}")
    print(f"BERTScore - Precision: {bertscore_results[i][0]:.4f}, Recall: {bertscore_results[i][1]:.4f}, F1: {bertscore_results[i][2]:.4f}")
    print(f"BERT Accuracy: {accuracy_results[i]:.4f}")

    # One line per reference.
    print("Cosine Similarity with reference responses:")
    # The loop variable must not be called `score`: at module level that
    # would rebind the `score` function imported from bert_score, and any
    # later call to the BERTScore helpers would then fail.
    for j, cosine_value in enumerate(cosine_results[i][0]):
        print(f"  Reference {j+1}: {cosine_value.item():.4f}")

    print("\n" + "-"*80 + "\n")

# Dataset-level averages of the BERT metrics.
avg_bert_precision = _mean([result[0] for result in bertscore_results])
avg_bert_recall = _mean([result[1] for result in bertscore_results])
avg_bert_f1 = _mean([result[2] for result in bertscore_results])
avg_bert_accuracy = _mean(accuracy_results)

print(f"Average BERT Precision: {avg_bert_precision:.4f}")
print(f"Average BERT Recall: {avg_bert_recall:.4f}")
print(f"Average BERT F1: {avg_bert_f1:.4f}")
print(f"Average BERT Accuracy: {avg_bert_accuracy:.4f}")

# Dataset-level average cosine similarity (mean of the per-item means).
avg_cosine_similarity = _mean([scores.mean().item() for scores in cosine_results])
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Item 1 - User Query: Looking for a Toyota car under 150 million with automatic transmission
System Response: 🚗 Passo Model 2013 (2013)
📝 Car Detail:
• Brands: Toyota
• KMs Driven: 90000 km
• Fuel: Petrol
• Transmission: Automatic
🌟 Car Features:
• ABS, Air Bags, Air Conditioning, AM/FM Radio, CD Player, Front Speakers, Immobilizer Key, Keyless Entry, Power Locks, Power Mirrors, Power Steering, Power Windows, Rear Camera, USB and Auxillary Cable
💰 Price (IDR): IDR 144,151,500.
Reference Responses: Toyota Passo Model 2013 with automatic transmission price below 150 million rupiah. A 2013 car with a mileage of 90,000 km using gasoline. Equipped with ABS and air bag safety features, AC comfort, audio, rear camera, and keyless entry. Car price IDR 144,151,500.
BERTScore - Precision: 0.8187, Recall: 0.8828, F1: 0.8496
BERT Accuracy: 1.0000
Cosine Similarity with reference responses:
  Reference 1: 0.8181

--------------------------------------------------------------------------------

Item 

### Without Accuracy

In [None]:
# Compute BERTScore for one system response.
def compute_bertscore(system_response, reference_responses):
    """Return the mean BERTScore precision, recall and F1 for one response.

    The system response is paired with each reference, the pairs are scored
    with ``score(..., lang="en")``, and every metric is averaged over the
    pairs.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: A single reference string, or a list of
            references (list elements are converted with ``str``).

    Returns:
        A ``(precision, recall, f1)`` tuple of scalars.
    """
    # Make sure the references are a list of strings.
    if isinstance(reference_responses, str):
        reference_responses = [reference_responses]
    elif isinstance(reference_responses, list):
        # Coerce each element so non-string values loaded from JSON
        # (numbers, None, ...) are handled as text.
        reference_responses = [str(ref) for ref in reference_responses]

    # Score the candidate once against each reference.
    P, R, F1 = score([system_response] * len(reference_responses), reference_responses, lang="en", verbose=False)

    # Return the per-metric means.
    return P.mean().item(), R.mean().item(), F1.mean().item()


# Compute cosine similarity with Sentence-BERT embeddings.
def compute_cosine_similarity(system_response, reference_responses, model):
    """Return the cosine similarity between a response and its references.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: A single reference string, or a list of
            references (list elements are converted with ``str``).
        model: Object whose ``encode(..., convert_to_tensor=True)`` method
            produces the embeddings.

    Returns:
        The value of ``util.cos_sim`` applied to the response embedding and
        the reference embeddings.
    """
    # Normalise the references the same way the BERTScore helper does, so
    # both metrics are computed over the same list of strings.
    if isinstance(reference_responses, str):
        reference_responses = [reference_responses]
    elif isinstance(reference_responses, list):
        reference_responses = [str(ref) for ref in reference_responses]

    system_embedding = model.encode(system_response, convert_to_tensor=True)
    reference_embeddings = model.encode(reference_responses, convert_to_tensor=True)
    cosine_scores = util.cos_sim(system_embedding, reference_embeddings)
    return cosine_scores

# Load the evaluation records from the JSON file.
with open('fine_tuning_evals.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Load the Sentence-BERT model used for the cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')


def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    return sum(values) / len(values) if values else float("nan")


# Per-item results, index-aligned with ``data``.
bertscore_results = []
cosine_results = []

# Evaluate every record in the dataset.
for item in data:
    system_response = str(item["system_response"])  # make sure the candidate is a string
    reference_responses = item["reference_responses"]

    # BERTScore (precision, recall, F1).
    bert_precision, bert_recall, bert_f1 = compute_bertscore(system_response, reference_responses)
    bertscore_results.append((bert_precision, bert_recall, bert_f1))

    # Cosine similarity against every reference.
    cosine_scores = compute_cosine_similarity(system_response, reference_responses, model)
    cosine_results.append(cosine_scores)

# Print BERTScore and cosine similarity for every item.
for i, item in enumerate(data):
    print(f"Item {i+1} - User Query: {item['user_query']}")
    print(f"System Response: {item['system_response']}")
    print(f"Reference Responses: {item['reference_responses']}")
    print(f"BERTScore - Precision: {bertscore_results[i][0]:.4f}, Recall: {bertscore_results[i][1]:.4f}, F1: {bertscore_results[i][2]:.4f}")

    # One line per reference.
    print("Cosine Similarity with reference responses:")
    # The loop variable must not be called `score`: at module level that
    # would rebind the `score` function imported from bert_score, and any
    # later call to the BERTScore helpers would then fail.
    for j, cosine_value in enumerate(cosine_results[i][0]):
        print(f"  Reference {j+1}: {cosine_value.item():.4f}")

    print("\n" + "-"*80 + "\n")

# Dataset-level averages of the BERT metrics.
avg_bert_precision = _mean([result[0] for result in bertscore_results])
avg_bert_recall = _mean([result[1] for result in bertscore_results])
avg_bert_f1 = _mean([result[2] for result in bertscore_results])

print(f"Average BERT Precision: {avg_bert_precision:.4f}")
print(f"Average BERT Recall: {avg_bert_recall:.4f}")
print(f"Average BERT F1: {avg_bert_f1:.4f}")

# Dataset-level average cosine similarity (mean of the per-item means).
avg_cosine_similarity = _mean([scores.mean().item() for scores in cosine_results])
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Item 1 - User Query: Looking for a Toyota car under 150 million with automatic transmission
System Response: 🚗 Passo Model 2013 (2013)
📝 Car Detail:
• Brands: Toyota
• KMs Driven: 90000 km
• Fuel: Petrol
• Transmission: Automatic
🌟 Car Features:
• ABS, Air Bags, Air Conditioning, AM/FM Radio, CD Player, Front Speakers, Immobilizer Key, Keyless Entry, Power Locks, Power Mirrors, Power Steering, Power Windows, Rear Camera, USB and Auxillary Cable
💰 Price (IDR): IDR 144,151,500.
Reference Responses: Toyota Passo Model 2013 with automatic transmission price below 150 million rupiah. A 2013 car with a mileage of 90,000 km using gasoline. Equipped with ABS and air bag safety features, AC comfort, audio, rear camera, and keyless entry. Car price IDR 144,151,500.
BERTScore - Precision: 0.8187, Recall: 0.8828, F1: 0.8496
Cosine Similarity with reference responses:
  Reference 1: 0.8181

--------------------------------------------------------------------------------

Item 2 - User Query: Find m

## Pre Fine-Tuning Model Evaluation

### With Accuracy

In [29]:
def compute_bertscore(system_response, reference_responses):
    """Score a system response against its references with BERTScore.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: One reference string, or a list of references
            (each list element is passed through ``str``).

    Returns:
        ``(precision, recall, f1)``, each averaged over all
        response/reference pairs.
    """
    refs = reference_responses
    if isinstance(refs, str):
        # A lone string is treated as a single reference.
        refs = [refs]
    elif isinstance(refs, list):
        # Force every list element to text.
        refs = [str(entry) for entry in refs]

    # One copy of the candidate per reference.
    cands = [system_response] * len(refs)
    precision, recall, f1 = score(cands, refs, lang="en", verbose=False)

    mean_precision = precision.mean().item()
    mean_recall = recall.mean().item()
    mean_f1 = f1.mean().item()
    return mean_precision, mean_recall, mean_f1


# Compute cosine similarity with Sentence-BERT embeddings.
def compute_cosine_similarity(system_response, reference_responses, model):
    """Compare a system response with its references via cosine similarity.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: One reference string, or a list of references
            (each list element is passed through ``str``).
        model: Encoder exposing ``encode(..., convert_to_tensor=True)``.

    Returns:
        Whatever ``util.cos_sim`` yields for the response embedding against
        the reference embeddings.
    """
    refs = reference_responses
    if isinstance(refs, str):
        refs = [refs]
    elif isinstance(refs, list):
        refs = list(map(str, refs))

    # Embed both sides, then compare.
    embedded_response = model.encode(system_response, convert_to_tensor=True)
    embedded_refs = model.encode(refs, convert_to_tensor=True)
    return util.cos_sim(embedded_response, embedded_refs)

# Compute a threshold-based "BERT accuracy".
def compute_bert_accuracy(system_response, reference_responses, threshold=0.7):
    """Return how often the BERTScore F1 reaches ``threshold``.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: One reference string, or a list of references
            (each list element is passed through ``str``).
        threshold: Inclusive lower bound on F1 for a pair to count as a
            match.

    Returns:
        The mean of the per-pair match indicators (1.0 for a match, 0.0
        otherwise), as a scalar.
    """
    refs = reference_responses
    if isinstance(refs, str):
        refs = [refs]
    elif isinstance(refs, list):
        refs = list(map(str, refs))

    # Precision and recall are computed by `score` but not used here.
    cands = [system_response] * len(refs)
    _precision, _recall, f1 = score(cands, refs, lang="en", verbose=False)

    # Turn the boolean comparison into 0/1 values and average them.
    return (f1 >= threshold).float().mean().item()

# Load the evaluation records from the JSON file.
with open('pre_fine_tuning_evals.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Load the Sentence-BERT model used for the cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')


def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    return sum(values) / len(values) if values else float("nan")


# Per-item results, index-aligned with ``data``.
bertscore_results = []
cosine_results = []
accuracy_results = []

# Evaluate every record in the dataset.
for item in data:
    system_response = str(item["system_response"])  # make sure the candidate is a string
    reference_responses = item["reference_responses"]  # normalised inside the helpers

    # BERTScore (precision, recall, F1).
    bert_precision, bert_recall, bert_f1 = compute_bertscore(system_response, reference_responses)
    bertscore_results.append((bert_precision, bert_recall, bert_f1))

    # Cosine similarity against every reference.
    cosine_scores = compute_cosine_similarity(system_response, reference_responses, model)
    cosine_results.append(cosine_scores)

    # Threshold-based BERT accuracy.
    accuracy = compute_bert_accuracy(system_response, reference_responses)
    accuracy_results.append(accuracy)

# Print BERTScore, BERT accuracy and cosine similarity for every item.
for i, item in enumerate(data):
    print(f"Item {i+1} - User Query: {item['user_query']}")
    print(f"System Response: {item['system_response']}")
    print(f"Reference Responses: {item['reference_responses']}")
    print(f"BERTScore - Precision: {bertscore_results[i][0]:.4f}, Recall: {bertscore_results[i][1]:.4f}, F1: {bertscore_results[i][2]:.4f}")
    print(f"BERT Accuracy: {accuracy_results[i]:.4f}")

    # One line per reference.
    print("Cosine Similarity with reference responses:")
    # The loop variable must not be called `score`: at module level that
    # would rebind the `score` function imported from bert_score, and any
    # later call to the BERTScore helpers would then fail.
    for j, cosine_value in enumerate(cosine_results[i][0]):
        print(f"  Reference {j+1}: {cosine_value.item():.4f}")

    print("\n" + "-"*80 + "\n")

# Dataset-level averages of the BERT metrics.
avg_bert_precision = _mean([result[0] for result in bertscore_results])
avg_bert_recall = _mean([result[1] for result in bertscore_results])
avg_bert_f1 = _mean([result[2] for result in bertscore_results])
avg_bert_accuracy = _mean(accuracy_results)

print(f"Average BERT Precision: {avg_bert_precision:.4f}")
print(f"Average BERT Recall: {avg_bert_recall:.4f}")
print(f"Average BERT F1: {avg_bert_f1:.4f}")
print(f"Average BERT Accuracy: {avg_bert_accuracy:.4f}")

# Dataset-level average cosine similarity (mean of the per-item means).
avg_cosine_similarity = _mean([scores.mean().item() for scores in cosine_results])
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Item 1 - User Query: Looking for a Toyota car under 150 million with automatic transmission
System Response: 🚗 Toyota Passo 2014 (2014)
 📝 Car Detail: 
• Brands: Toyota 
• KMs Driven: 82000 km 
• Fuel: Petrol 
• Transmission: Automatic
🌟 Car Features: 
• ABS, Air Bags, Air Conditioning, CD Player, Front Speakers, Keyless Entry, Navigation System, Power Locks, Power Mirrors, Power Steering, Power Windows, Rear Speakers, Rear Camera
💰 Price (IDR): IDR 157,153,400
Reference Responses: Toyota Passo Model 2013 with automatic transmission price below 150 million rupiah. A 2013 car with a mileage of 90,000 km using gasoline. Equipped with ABS and air bag safety features, AC comfort, audio, rear camera, and keyless entry. Car price IDR 144,151,500.
BERTScore - Precision: 0.8090, Recall: 0.8753, F1: 0.8408
BERT Accuracy: 1.0000
Cosine Similarity with reference responses:
  Reference 1: 0.7866

--------------------------------------------------------------------------------

Item 2 - User Query:

### Without Accuracy

In [23]:
# Compute BERTScore for one system response.
def compute_bertscore(system_response, reference_responses):
    """Return the mean BERTScore precision, recall and F1 for one response.

    The system response is paired with each reference, the pairs are scored
    with ``score(..., lang="en")``, and every metric is averaged over the
    pairs.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: A single reference string, or a list of
            references (list elements are converted with ``str``).

    Returns:
        A ``(precision, recall, f1)`` tuple of scalars.
    """
    # Make sure the references are a list of strings.
    if isinstance(reference_responses, str):
        reference_responses = [reference_responses]
    elif isinstance(reference_responses, list):
        # Coerce each element so non-string values loaded from JSON
        # (numbers, None, ...) are handled as text.
        reference_responses = [str(ref) for ref in reference_responses]

    # Score the candidate once against each reference.
    P, R, F1 = score([system_response] * len(reference_responses), reference_responses, lang="en", verbose=False)

    # Return the per-metric means.
    return P.mean().item(), R.mean().item(), F1.mean().item()


# Compute cosine similarity with Sentence-BERT embeddings.
def compute_cosine_similarity(system_response, reference_responses, model):
    """Return the cosine similarity between a response and its references.

    Args:
        system_response: Candidate text produced by the system.
        reference_responses: A single reference string, or a list of
            references (list elements are converted with ``str``).
        model: Object whose ``encode(..., convert_to_tensor=True)`` method
            produces the embeddings.

    Returns:
        The value of ``util.cos_sim`` applied to the response embedding and
        the reference embeddings.
    """
    # Normalise the references the same way the BERTScore helper does, so
    # both metrics are computed over the same list of strings.
    if isinstance(reference_responses, str):
        reference_responses = [reference_responses]
    elif isinstance(reference_responses, list):
        reference_responses = [str(ref) for ref in reference_responses]

    system_embedding = model.encode(system_response, convert_to_tensor=True)
    reference_embeddings = model.encode(reference_responses, convert_to_tensor=True)
    cosine_scores = util.cos_sim(system_embedding, reference_embeddings)
    return cosine_scores

# Load the evaluation records from the JSON file.
with open('pre_fine_tuning_evals.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Load the Sentence-BERT model used for the cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')


def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    return sum(values) / len(values) if values else float("nan")


# Per-item results, index-aligned with ``data``.
bertscore_results = []
cosine_results = []

# Evaluate every record in the dataset.
for item in data:
    system_response = str(item["system_response"])  # make sure the candidate is a string
    reference_responses = item["reference_responses"]

    # BERTScore (precision, recall, F1).
    bert_precision, bert_recall, bert_f1 = compute_bertscore(system_response, reference_responses)
    bertscore_results.append((bert_precision, bert_recall, bert_f1))

    # Cosine similarity against every reference.
    cosine_scores = compute_cosine_similarity(system_response, reference_responses, model)
    cosine_results.append(cosine_scores)

# Print BERTScore and cosine similarity for every item.
for i, item in enumerate(data):
    print(f"Item {i+1} - User Query: {item['user_query']}")
    print(f"System Response: {item['system_response']}")
    print(f"Reference Responses: {item['reference_responses']}")
    print(f"BERTScore - Precision: {bertscore_results[i][0]:.4f}, Recall: {bertscore_results[i][1]:.4f}, F1: {bertscore_results[i][2]:.4f}")

    # One line per reference.
    print("Cosine Similarity with reference responses:")
    # The loop variable must not be called `score`: at module level that
    # would rebind the `score` function imported from bert_score, and any
    # later call to the BERTScore helpers would then fail.
    for j, cosine_value in enumerate(cosine_results[i][0]):
        print(f"  Reference {j+1}: {cosine_value.item():.4f}")

    print("\n" + "-"*80 + "\n")

# Dataset-level averages of the BERT metrics.
avg_bert_precision = _mean([result[0] for result in bertscore_results])
avg_bert_recall = _mean([result[1] for result in bertscore_results])
avg_bert_f1 = _mean([result[2] for result in bertscore_results])

print(f"Average BERT Precision: {avg_bert_precision:.4f}")
print(f"Average BERT Recall: {avg_bert_recall:.4f}")
print(f"Average BERT F1: {avg_bert_f1:.4f}")

# Dataset-level average cosine similarity (mean of the per-item means).
avg_cosine_similarity = _mean([scores.mean().item() for scores in cosine_results])
print(f"Average Cosine Similarity: {avg_cosine_similarity:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Item 1 - User Query: Looking for a Toyota car under 150 million with automatic transmission
System Response: 🚗 Toyota Passo 2014 (2014)
 📝 Car Detail: 
• Brands: Toyota 
• KMs Driven: 82000 km 
• Fuel: Petrol 
• Transmission: Automatic
🌟 Car Features: 
• ABS, Air Bags, Air Conditioning, CD Player, Front Speakers, Keyless Entry, Navigation System, Power Locks, Power Mirrors, Power Steering, Power Windows, Rear Speakers, Rear Camera
💰 Price (IDR): IDR 157,153,400
Reference Responses: Toyota Passo Model 2013 with automatic transmission price below 150 million rupiah. A 2013 car with a mileage of 90,000 km using gasoline. Equipped with ABS and air bag safety features, AC comfort, audio, rear camera, and keyless entry. Car price IDR 144,151,500.
BERTScore - Precision: 0.8090, Recall: 0.8753, F1: 0.8408
Cosine Similarity with reference responses:
  Reference 1: 0.7866

--------------------------------------------------------------------------------

Item 2 - User Query: Find me a car with su