In [None]:
!pip install datasets



In [None]:
pip install faiss-gpu-cu12



In [None]:
import os
# Set WANDB_MODE=disabled for this process so Weights & Biases logging stays off.
# (The WANDB_DISABLED="true" variable is an alternative switch that is not used here.)
os.environ.update({"WANDB_MODE": "disabled"})

In [None]:
import faiss
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer
from datasets import Dataset

In [None]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the dataset: the first 1500 training rows are the rows the retrieval
# evaluation below is run on.
dataset = load_dataset(path="Metin/WikiRAG-TR", split="train[:1500]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Checkpoint name used for the baseline encoder and again as the starting point
# for fine-tuning further down.
original_model_name = "thenlper/gte-base"
# Alternative checkpoints (uncomment exactly one to switch):
# original_model_name = "intfloat/multilingual-e5-small"
# original_model_name = "sentence-transformers/all-MiniLM-L12-v2"

In [None]:
# Load the pretrained (not yet fine-tuned) encoder, then move it to the selected device.
model = SentenceTransformer(original_model_name)
model = model.to(device)

In [None]:
#Embedding Alma Fonksiyonu
def get_embeddings(texts, tokenizer, model):
    embeddings = []
    print(len(texts))
    for text in texts:
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy()[0])
    return np.array(embeddings)

In [None]:
# Split each example's context into chunks at the dataset's split points.
contexts = []        # every chunk in order; list position matches the id FAISS assigns when added in this order
questions = []       # one question per example
labels = []          # labels[i]: position in `contexts` of the gold chunk for questions[i], or -1 if none
faiss_indices = []   # not used by the cells visible in this notebook

current_index = 0
for example in dataset:
    questions.append(example['question'])
    correct_intro_idx = example['correct_intro_idx']
    # 'ctx_split_points' is a bracketed, comma-separated string of chunk end offsets;
    # prepending 0 makes each pair of neighbours delimit one chunk.
    split_points = [0] + list(map(int, example['ctx_split_points'][1:-1].split(',')))

    gold_index = -1
    for idx, (start, end) in enumerate(zip(split_points, split_points[1:])):
        contexts.append(example['context'][start:end])
        if idx == correct_intro_idx:
            gold_index = current_index
        current_index += 1
    # Append exactly one label per question. Appending only on a match would let a
    # single example without a matching chunk shift every later label against
    # `questions`. -1 never equals a real chunk position, so it scores as a miss.
    labels.append(gold_index)

assert len(labels) == len(questions)
print("Total Chunk: ", len(contexts))

Total Chunk:  6066


In [None]:
# Embed every context chunk with the baseline model; the result is requested as a
# NumPy array (not a tensor) because it is handed to FAISS afterwards.
context_embeddings = model.encode(
    contexts,
    show_progress_bar=True,
    convert_to_tensor=False,
)

Batches:   0%|          | 0/190 [00:00<?, ?it/s]

In [None]:
# Embed every evaluation question with the baseline model (NumPy output, same as the chunks).
question_embeddings = model.encode(
    questions,
    show_progress_bar=True,
    convert_to_tensor=False,
)

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
# Build the vector database.
# The chunk vectors are scaled to unit length in place, so an inner-product search
# ranks chunks by cosine similarity. The query vectors are not normalised in this
# notebook: that rescales the returned scores per query but leaves each query's
# ranking unchanged.
faiss.normalize_L2(context_embeddings)

embedding_dim = context_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(embedding_dim)
faiss_index.add(context_embeddings)

In [None]:
# One independent list of per-question hit flags for each cutoff k = 1..5.
(
    retrieved_top_1,
    retrieved_top_2,
    retrieved_top_3,
    retrieved_top_4,
    retrieved_top_5,
) = ([] for _ in range(5))

In [None]:
# Look up the nearest chunks for every question in the vector database.
# labels[i] must describe question i; fail loudly if the two are misaligned.
assert len(labels) == len(question_embeddings), "labels and questions are misaligned"

# A single batched search returns one row with the 5 best chunk ids per question.
distances, indices = faiss_index.search(question_embeddings, k=5)

# Start from empty lists so that re-running this cell never appends duplicate results.
retrieved_top_1, retrieved_top_2, retrieved_top_3, retrieved_top_4, retrieved_top_5 = [], [], [], [], []
for correct_idx, top_ids in zip(labels, indices):
    retrieved_top_1.append(correct_idx in top_ids[:1])
    retrieved_top_2.append(correct_idx in top_ids[:2])
    retrieved_top_3.append(correct_idx in top_ids[:3])
    retrieved_top_4.append(correct_idx in top_ids[:4])
    retrieved_top_5.append(correct_idx in top_ids[:5])

In [None]:
# Hit rate at k: the fraction of questions whose gold chunk is among the top-k results.
# Each list holds one boolean per question, so the mean of the flags is that fraction;
# it equals what accuracy_score returned when the flags were compared to an all-ones vector.
accuracy_top_1 = float(np.mean(retrieved_top_1))
accuracy_top_2 = float(np.mean(retrieved_top_2))
accuracy_top_3 = float(np.mean(retrieved_top_3))
accuracy_top_4 = float(np.mean(retrieved_top_4))
accuracy_top_5 = float(np.mean(retrieved_top_5))

In [None]:
# Report the baseline model's top-k scores, one line per cutoff, two decimals each.
report_lines = [
    f"Original Model Top-1 Accuracy: {accuracy_top_1:.2f}",
    f"Original Model Top-2 Accuracy: {accuracy_top_2:.2f}",
    f"Original Model Top-3 Accuracy: {accuracy_top_3:.2f}",
    f"Original Model Top-4 Accuracy: {accuracy_top_4:.2f}",
    f"Original Model Top-5 Accuracy: {accuracy_top_5:.2f}",
]
print(*report_lines, sep="\n")

Original Model Top-1 Accuracy: 0.46
Original Model Top-2 Accuracy: 0.68
Original Model Top-3 Accuracy: 0.74
Original Model Top-4 Accuracy: 0.78
Original Model Top-5 Accuracy: 0.80


In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import json

In [None]:
def load_data(data):
    """Turn dataset rows into sentence-transformers training pairs.

    Each row must provide 'correct_intro_idx', 'ctx_split_points' (a bracketed,
    comma-separated string of integer offsets), 'context', 'question' and 'answer'.
    The gold chunk is the slice of 'context' between the split points surrounding
    'correct_intro_idx'.

    Args:
        data: Iterable of rows with the keys listed above.

    Returns:
        A list with one ``InputExample`` per row whose ``texts`` are
        ``["<question> [SEP] <gold chunk>", <answer>]``.

    NOTE(review): the evaluation cells embed bare questions and bare chunks,
    while these pairs are (question + chunk, answer). Confirm this objective is
    intended; (question, gold chunk) pairs would mirror the retrieval task.
    """
    def _build_example(item):
        gold = item['correct_intro_idx']
        # Chunk end offsets with 0 prepended, so neighbours delimit one chunk.
        boundaries = [0] + [int(point) for point in item['ctx_split_points'][1:-1].split(',')]
        start_idx, end_idx = boundaries[gold], boundaries[gold + 1]

        context = item['context'][start_idx:end_idx]
        query = item["question"]
        text = f"{query} [SEP] {context}"  # question and gold chunk joined into one string
        return InputExample(texts=[text, item["answer"]])

    return [_build_example(item) for item in data]

In [None]:
# Load a fresh copy of the pretrained checkpoint to fine-tune. Rebinding `model`
# replaces the baseline instance used in the cells above.
model = SentenceTransformer(original_model_name)
model = model.to(device)

In [None]:
# Training rows: everything after the first 1500 rows, which are reserved for evaluation.
train_dataset = load_dataset(path="Metin/WikiRAG-TR", split="train[1500:]")

In [None]:
# Convert the training rows into the list of InputExample pairs built by load_data.
train_data = load_data(train_dataset)

In [None]:
# Serve the training pairs in shuffled mini-batches of 16.
# NOTE(review): no random seed is set in the visible cells, so the shuffle order
# (and therefore the fine-tuning result) can differ between runs.
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=16,
    shuffle=True,
)

In [None]:
# Ranking loss over the (text, answer) pairs produced by load_data.
# NOTE(review): per the sentence-transformers documentation this loss uses the other
# pairs of a batch as negatives, so the batch size affects training — confirm 16 is intended.
train_loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
# Fine-tune `model` on the single (dataloader, loss) objective for 5 epochs with
# 100 warm-up steps, writing the result to `output_path`.
# NOTE(review): the output directory name mentions "e5" although the active
# checkpoint is whatever `original_model_name` holds — the name may be stale.
training_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[training_objective],
    output_path="fine_tuned_e5_model",
    warmup_steps=100,
    epochs=5,
)



Step,Training Loss
500,0.1448
1000,0.0311


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
# Evaluate the in-memory model object that fit() was just called on.
# To evaluate the copy written to disk instead, load it like this:
# new_model = SentenceTransformer("/content/fine_tuned_e5_model").to(device)
new_model = model

In [None]:
# Re-embed every context chunk with the fine-tuned model (overwrites the baseline vectors).
context_embeddings = new_model.encode(
    contexts,
    show_progress_bar=True,
    convert_to_tensor=False,
)

Batches:   0%|          | 0/190 [00:00<?, ?it/s]

In [None]:
# Re-embed every evaluation question with the fine-tuned model (overwrites the baseline vectors).
question_embeddings = new_model.encode(
    questions,
    show_progress_bar=True,
    convert_to_tensor=False,
)

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
# Build a second vector database from the fine-tuned chunk vectors.
# The chunk vectors are scaled to unit length in place, so the inner-product search
# ranks chunks by cosine similarity for each query.
faiss.normalize_L2(context_embeddings)

embedding_dim = context_embeddings.shape[1]
faiss_new_index = faiss.IndexFlatIP(embedding_dim)
faiss_new_index.add(context_embeddings)

In [None]:
# Fresh, independent hit-flag lists for the fine-tuned run, one per cutoff k = 1..5.
(
    retrieved_top_1,
    retrieved_top_2,
    retrieved_top_3,
    retrieved_top_4,
    retrieved_top_5,
) = ([] for _ in range(5))

In [None]:
# Look up the nearest chunks for every question in the fine-tuned vector database.
# labels[i] must describe question i; fail loudly if the two are misaligned.
assert len(labels) == len(question_embeddings), "labels and questions are misaligned"

# A single batched search returns one row with the 5 best chunk ids per question.
distances, indices = faiss_new_index.search(question_embeddings, k=5)

# Start from empty lists so that re-running this cell never appends duplicate results.
retrieved_top_1, retrieved_top_2, retrieved_top_3, retrieved_top_4, retrieved_top_5 = [], [], [], [], []
for correct_idx, top_ids in zip(labels, indices):
    retrieved_top_1.append(correct_idx in top_ids[:1])
    retrieved_top_2.append(correct_idx in top_ids[:2])
    retrieved_top_3.append(correct_idx in top_ids[:3])
    retrieved_top_4.append(correct_idx in top_ids[:4])
    retrieved_top_5.append(correct_idx in top_ids[:5])

In [None]:
# Hit rate at k for the fine-tuned model: the fraction of questions whose gold chunk
# is among the top-k results. Each list holds one boolean per question, so the mean of
# the flags is that fraction; it equals what accuracy_score returned against an all-ones vector.
accuracy_top_1 = float(np.mean(retrieved_top_1))
accuracy_top_2 = float(np.mean(retrieved_top_2))
accuracy_top_3 = float(np.mean(retrieved_top_3))
accuracy_top_4 = float(np.mean(retrieved_top_4))
accuracy_top_5 = float(np.mean(retrieved_top_5))

In [None]:
# Report the fine-tuned model's top-k scores, one line per cutoff, two decimals each.
report_lines = [
    f"Finetuned Model Top-1 Accuracy: {accuracy_top_1:.2f}",
    f"Finetuned Model Top-2 Accuracy: {accuracy_top_2:.2f}",
    f"Finetuned Model Top-3 Accuracy: {accuracy_top_3:.2f}",
    f"Finetuned Model Top-4 Accuracy: {accuracy_top_4:.2f}",
    f"Finetuned Model Top-5 Accuracy: {accuracy_top_5:.2f}",
]
print(*report_lines, sep="\n")

Finetuned Model Top-1 Accuracy: 0.57
Finetuned Model Top-2 Accuracy: 0.82
Finetuned Model Top-3 Accuracy: 0.88
Finetuned Model Top-4 Accuracy: 0.90
Finetuned Model Top-5 Accuracy: 0.92
