In [None]:
# Import the required libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.manifold import TSNE

base_model_name_or_path = "Snowflake/snowflake-arctic-embed-xs"
tuned_model_name_or_path = "./output/model_domain-en-ru-2024-05-18_14-12-55"
dataset_name = "evilfreelancer/php-ru-en"

# Load the two SentenceTransformer models: the base checkpoint and the fine-tuned one
base_model = SentenceTransformer(base_model_name_or_path)
tuned_model = SentenceTransformer(tuned_model_name_or_path)

# Load the dataset from the Hugging Face Hub
dataset = load_dataset(dataset_name)

# Parallel phrases are read from the 'English' and 'Russian' columns of the 'eval' split
english_phrases = dataset['eval']['English']
russian_phrases = dataset['eval']['Russian']

# Compare at most 100 phrase pairs, but never more than the split actually holds,
# so a short eval split does not raise IndexError.
n_samples = min(100, len(english_phrases), len(russian_phrases))
eng_batch = [english_phrases[i] for i in range(n_samples)]
rus_batch = [russian_phrases[i] for i in range(n_samples)]

# Both lists are interleaved: index 2*k holds the English embedding of pair k
# and index 2*k + 1 holds the Russian embedding of the same pair.
base_embeddings = []
fine_tuned_embeddings = []

for model, store in ((base_model, base_embeddings), (tuned_model, fine_tuned_embeddings)):
    # Encode each language as one batch instead of calling encode() per phrase
    eng_vectors = model.encode(eng_batch)
    rus_vectors = model.encode(rus_batch)
    for eng_vector, rus_vector in zip(eng_vectors, rus_vectors):
        store.append(eng_vector)
        store.append(rus_vector)

In [None]:
def cosine_similarity(emb1, emb2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        emb1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        emb2: Second vector with the same length as ``emb1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the ratio is undefined there
        and would otherwise evaluate to NaN with a RuntimeWarning).
    """
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    if norm_product == 0:
        # A zero vector has no direction; report "no similarity" instead of NaN.
        return 0.0
    return np.dot(emb1, emb2) / norm_product


# Cosine similarity of every (English, Russian) pair for the base and the fine-tuned model.
# The embedding lists are interleaved, so each pair starts at an even index.
pair_starts = range(0, len(base_embeddings), 2)

base_similarities = [
    cosine_similarity(base_embeddings[k], base_embeddings[k + 1])
    for k in pair_starts
]
fine_tuned_similarities = [
    cosine_similarity(fine_tuned_embeddings[k], fine_tuned_embeddings[k + 1])
    for k in pair_starts
]

In [None]:
# Plot the per-pair cosine similarities of the base model against the fine-tuned model
fig, ax = plt.subplots(figsize=(12, 6))

series = (
    (base_similarities, 'Base Model Similarities'),
    (fine_tuned_similarities, 'Fine-Tuned Model Similarities'),
)
for values, series_label in series:
    ax.plot(values, label=series_label, marker='o')

ax.set_title('Cosine Similarities between English and Russian Phrases')
ax.set_xlabel('Phrase Pair Index')
ax.set_ylabel('Cosine Similarity')
ax.legend()
ax.grid(True)
plt.show()

# t-SNE visualization of the embeddings
tsne = TSNE(n_components=2, random_state=42)

base_embeddings_2d = tsne.fit_transform(np.array(base_embeddings))
fine_tuned_embeddings_2d = tsne.fit_transform(np.array(fine_tuned_embeddings))

# Each fit_transform call builds its own 2-D layout, so the two projections do not
# share a coordinate system. Draw them in separate panels instead of overlaying them.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (axes[0], base_embeddings_2d, 'Base Model Embeddings'),
    (axes[1], fine_tuned_embeddings_2d, 'Fine-Tuned Model Embeddings'),
)

for ax, points, panel_title in panels:
    # Embeddings are stored interleaved (English, Russian, English, ...):
    # even rows are English phrases, odd rows are their Russian counterparts.
    ax.scatter(points[0::2, 0], points[0::2, 1], label='English', alpha=0.5)
    ax.scatter(points[1::2, 0], points[1::2, 1], label='Russian', alpha=0.5)
    ax.set_title(panel_title)
    ax.legend()

fig.suptitle('TSNE Visualization of Embeddings')
plt.show()