In [None]:
# !pip install sentence-transformers
from sentence_transformers import SentenceTransformer # model that converts text into embedding vectors (sentence embeddings)
import numpy as np # store converted text into numbers
import pandas as pd # dataset

In [None]:
# Placeholder path of the evaluation CSV (can be automated as well).
# Later cells read its 'answer' and 'model_response' columns.
CSV_PATH = 'write_file_here.csv'

# Name of the sentence-embedding model handed to SentenceTransformer.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

df = pd.read_csv(CSV_PATH)
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [None]:
## new dataframe to store our data

def _encode_column(texts, batch_size=32):
    """Encode a text column with `model` in batches.

    Parameters
    ----------
    texts : pd.Series
        Column of strings to embed.
    batch_size : int, default 32
        Number of texts passed through the model per forward pass.

    Returns
    -------
    pd.Series
        Same index as `texts`; each cell holds one 1-D embedding tensor,
        so `torch.stack(series.tolist())` keeps working downstream.
    """
    if texts.empty:
        # Nothing to encode; keep the index so the frame below still lines up.
        return pd.Series([], index=texts.index, dtype=object)

    # One call over the whole list lets the library batch the inputs instead of
    # running a separate forward pass for every row.
    vectors = model.encode(texts.tolist(), batch_size=batch_size, convert_to_tensor=True)

    # Fill an object array cell by cell so each row keeps its own tensor and
    # pandas/numpy never try to unpack the tensors into extra dimensions.
    cells = np.empty(len(texts), dtype=object)
    for position, vector in enumerate(vectors):
        cells[position] = vector
    return pd.Series(cells, index=texts.index, name=texts.name)


answer_embeddings = _encode_column(df['answer'])
response_embeddings = _encode_column(df['model_response'])

embedding_df = pd.DataFrame({
    'answer_embedding': answer_embeddings,
    'model_response_embedding': response_embeddings
})

In [None]:
## COSINE SIMILARITY

import torch # pytorch

# Stack the per-row embedding tensors of each column into one matrix with one
# row per example.
# NOTE(review): the embedding width is model-dependent (all-MiniLM-L6-v2 is
# documented as 384-d) - check answers.shape to confirm.
answers, responses = (
    torch.stack(list(embedding_df[column]))
    for column in ('answer_embedding', 'model_response_embedding')
)

# dim=1 compares row i of `answers` with row i of `responses`,
# giving one similarity value per example.
cos_sim = torch.nn.functional.cosine_similarity(answers, responses, dim=1)

# Keep the per-row values next to the embeddings they were computed from.
embedding_df['cosine_similarity'] = cos_sim.tolist()

mean_similarity = cos_sim.mean().item()
print("cosine similarity:", mean_similarity)

In [None]:
## BERTScore
!pip install bert_score
from bert_score import score

P, R, F1 = score(df['answer'].tolist(), df['model_response'].tolist(), lang='en')
print(f"BERTScore F1: {F1.mean().item():.4f}")

In [None]:
### comparison (predefined manually for now, can be automated)
# One (model, BERTScore, cosine similarity) tuple per evaluated model,
# entered by hand.
_comparison_rows = [
    ("falcon", 0.8710, 0.6338),
    ("phi", 0.8722, 0.6522),
    ("tiny-llama", 0.8835, 0.7044),
    ("llama-regular", 0.8609, 0.6323),
]

comparison = pd.DataFrame(
    _comparison_rows,
    columns=["model", "bert_score", "cosine_similarity"],
)

In [None]:
### visualisation
import matplotlib.pyplot as plt


def _draw_metric_panel(ax, column, colour, title, ylabel, ylim):
    """Draw one bar chart of `comparison[column]` per model on the given axes."""
    ax.bar(comparison['model'], comparison[column], color=colour)
    ax.set_title(title)
    ax.set_ylim(*ylim)
    ax.set_ylabel(ylabel)


plt.style.use('fivethirtyeight')
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: BERTScore. The y-range is fixed by hand to the 0.85-0.89 band.
_draw_metric_panel(axs[0], 'bert_score', 'skyblue', 'BERTScore', 'BERTScore', (0.85, 0.89))

# Right panel: cosine similarity. The y-range is fixed by hand to 0.6-0.72.
_draw_metric_panel(axs[1], 'cosine_similarity', 'lightcoral', 'Cosine Similarity', 'Similarity', (0.6, 0.72))

# Tighten the layout, then render the figure.
plt.tight_layout()
plt.show()

In [None]:
### Training loss visualisation
import matplotlib.pyplot as plt

# Loss was recorded every 20 steps up to the 500-step cap (25 points).
steps = list(range(20, 501, 20))

# Hand-entered training-loss values, one per entry in `steps`.
deepseek_training_loss = [
    2.765200, 2.728700, 2.743500, 2.719200, 2.689300, 2.666000, 2.637700, 2.610000,
    2.558400, 2.548900, 2.512600, 2.502900, 2.473300, 2.434500, 2.391300, 2.365100,
    2.337500, 2.292600, 2.257300, 2.223900, 2.198400, 2.147000, 2.112200, 2.073000,
    2.051100
]

mistral_training_loss = [
    2.476000, 2.366600, 2.306900, 2.223500, 2.149900,
    2.073800, 2.005600, 1.935900, 1.866000, 1.818300,
    1.741800, 1.688900, 1.635900, 1.565200, 1.510900,
    1.459100, 1.412600, 1.371500, 1.315800, 1.292200,
    1.274700, 1.231600, 1.213700, 1.197200, 1.193000
]

llama_training_loss = [
    2.7158, 2.6620, 2.6716, 2.6292, 2.5907,
    2.5482, 2.5087, 2.4656, 2.4141, 2.3933,
    2.3407, 2.3150, 2.2749, 2.2214, 2.1656,
    2.1259, 2.0871, 2.0329, 1.9817, 1.9444,
    1.9051, 1.8472, 1.7964, 1.7537, 1.7201
]

plt.figure(figsize=(10,6))
# Each curve carries a label so the legend identifies which model it belongs to;
# without it the three lines can only be told apart by reading the code.
plt.plot(steps, deepseek_training_loss, marker='o', linestyle='-', color='blue', label='DeepSeek')
plt.plot(steps, mistral_training_loss, marker='o', linestyle='-', color='red', label='Mistral')
plt.plot(steps, llama_training_loss, marker='o', linestyle='-', color='black', label='Llama')
plt.title('Training Loss over Steps')
plt.xlabel('Step')
plt.ylabel('Training Loss')
plt.legend(title='Model')
plt.grid(True)
plt.show()
