In [None]:
!pip install datasets

In [None]:
!pip install faiss-gpu

In [None]:
import faiss
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer
from datasets import Dataset

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the dataset: only the first 1000 rows of the train split are requested.
dataset = load_dataset(
    path="Metin/WikiRAG-TR",
    split="train[:1000]",
)

In [None]:
# Hub id of the pretrained model that is evaluated first (reported below as "Original Model").
original_model_name = "intfloat/multilingual-e5-base"

In [None]:
# Load the tokenizer and the bare encoder for `original_model_name`,
# then move the encoder onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(original_model_name)
model = AutoModel.from_pretrained(original_model_name)
model = model.to(device)

In [None]:
# Embedding extraction function
def get_embeddings(texts, tokenizer, model, batch_size=32, pooling="cls"):
    """Encode each text into one fixed-size vector.

    Texts are tokenized and run through the model in batches instead of one
    forward pass per text. Tensors are sent to the device the model's
    parameters live on, so the function does not depend on a global ``device``.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable invoked as
            ``tokenizer(batch, padding=True, truncation=True, return_tensors="pt")``;
            its result must support ``.to(device)`` and ``**`` unpacking.
        model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of texts per forward pass (must be >= 1).
        pooling: ``"cls"`` (default, the original behaviour) keeps the hidden
            state of the first token; ``"mean"`` averages the hidden states of
            the non-padding tokens and needs an ``attention_mask`` entry.
            NOTE(review): the E5 model card describes mean pooling together
            with "query: " / "passage: " text prefixes -- confirm whether the
            CLS / no-prefix setup used in this notebook is intentional.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), hidden_dim)``; for empty input
        a 2-D array with zero rows.

    Raises:
        ValueError: If ``pooling`` or ``batch_size`` is invalid.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    texts = list(texts)
    if not texts:
        # Keep the result 2-D so callers can still read `.shape[1]`.
        hidden_dim = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_dim), dtype=np.float32)

    try:
        target_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = torch.device("cpu")

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(target_device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
        if pooling == "mean":
            # Zero out padding positions before averaging over the sequence axis.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        else:
            pooled = hidden[:, 0, :]
        pooled_batches.append(pooled.cpu().numpy())
    return np.concatenate(pooled_batches, axis=0)

In [None]:
# Cut each example's context into chunks at its split points and record, per
# question, the global index of the chunk that answers it.
contexts = []
questions = []
labels = []
faiss_indices = []

current_index = 0
skipped_questions = 0
for example in dataset:
    # `ctx_split_points` is parsed as a bracketed, comma-separated string;
    # blank pieces are ignored so an empty list does not crash int().
    raw_points = example['ctx_split_points'].strip()[1:-1]
    split_points = [0] + [int(piece) for piece in raw_points.split(',') if piece.strip()]
    correct_intro_idx = example['correct_intro_idx']

    gold_index = None
    for idx in range(len(split_points) - 1):
        contexts.append(example['context'][split_points[idx]:split_points[idx + 1]])
        if idx == correct_intro_idx:
            gold_index = current_index
        current_index += 1

    # questions[i] and labels[i] must describe the same example: the search
    # step looks labels up by question position. An example whose
    # `correct_intro_idx` matches no chunk is therefore left out of both lists
    # (its chunks stay in `contexts` as distractors).
    if gold_index is None:
        skipped_questions += 1
        continue
    questions.append(example['question'])
    labels.append(gold_index)

print("Total Chunk: ", len(contexts))
if skipped_questions:
    print("Questions skipped (no matching chunk): ", skipped_questions)

In [None]:
# Embed every context chunk with the currently loaded tokenizer/model pair.
context_embeddings = get_embeddings(texts=contexts, tokenizer=tokenizer, model=model)

In [None]:
# Embed every question with the same tokenizer/model pair used for the chunks.
question_embeddings = get_embeddings(texts=questions, tokenizer=tokenizer, model=model)

In [None]:
# Build the vector database: a FAISS flat L2 index sized to the embedding
# width, filled with every context-chunk embedding.
embedding_dim = context_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(context_embeddings)

In [None]:
# One flag per question: was the labelled chunk among the top-1 / top-5 search results?
retrieved_top_1, retrieved_top_5 = [], []

In [None]:
# Search the vector database for the nearest chunks of every question.
# Labels are looked up by question position, so both must line up one-to-one.
if len(labels) != len(question_embeddings):
    raise ValueError(
        f"labels ({len(labels)}) and question embeddings "
        f"({len(question_embeddings)}) are not aligned"
    )

# Start from empty result lists so re-running this cell does not append to
# the flags left behind by a previous run.
retrieved_top_1, retrieved_top_5 = [], []

if len(question_embeddings) > 0:
    # One batched call returns the 5 nearest chunk ids for every question.
    distances, indices = faiss_index.search(question_embeddings, k=5)
    for correct_idx, neighbour_ids in zip(labels, indices):
        retrieved_top_1.append(correct_idx in neighbour_ids[:1])
        retrieved_top_5.append(correct_idx in neighbour_ids[:5])

In [None]:
# Fraction of questions whose labelled chunk was retrieved: the hit flags are
# scored against an all-ones vector of the same length.
all_hits_top_1 = [1] * len(retrieved_top_1)
all_hits_top_5 = [1] * len(retrieved_top_5)
accuracy_top_1 = accuracy_score(retrieved_top_1, all_hits_top_1)
accuracy_top_5 = accuracy_score(retrieved_top_5, all_hits_top_5)

In [None]:
# Report the retrieval accuracy computed above, rounded to two decimals.
print(f"Original Model Top-1 Accuracy: {accuracy_top_1:.2f}")
print(f"Original Model Top-5 Accuracy: {accuracy_top_5:.2f}")

In [None]:
# Load the dataset used for fine-tuning; its 'train' and 'test' splits are read below.
train_dataset = load_dataset(path="WhiteAngelss/Turkce-Duygu-Analizi-Dataset")

In [None]:
# Keep a random 1% of each split by taking the 'test' side of a 99/1 split.
# A fixed seed makes the same rows come back on every run, so plots, training
# and metrics are reproducible.
SUBSET_SEED = 42
train_subset = train_dataset['train'].train_test_split(test_size=0.01, seed=SUBSET_SEED)['test']
test_subset = train_dataset['test'].train_test_split(test_size=0.01, seed=SUBSET_SEED)['test']

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Count how many rows of the training subset carry each label.
label_counts = Counter(train_subset['label'])

# Unpack the counter into parallel lists for plotting.
label_names = list(label_counts)
label_values = [label_counts[name] for name in label_names]

# Draw the bar chart.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(label_names, label_values)
ax.set_xlabel('Labels')
ax.set_ylabel('Counts')
ax.set_title('Train Dataset')
plt.show()

In [None]:
# Count how many rows of the test subset carry each label.
label_counts = Counter(test_subset['label'])

# Unpack the counter into parallel lists for plotting.
label_names = list(label_counts)
label_values = [label_counts[name] for name in label_names]

# Draw the bar chart.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(label_names, label_values)
ax.set_xlabel('Labels')
ax.set_ylabel('Counts')
ax.set_title('Test Dataset')
plt.show()

In [None]:
# Number of rows in the training subset.
len(train_subset)

In [None]:
# Inspect the first row of the training subset.
train_subset[0]

In [None]:
# Inspect the first row of the test subset.
test_subset[0]

In [None]:
# Bundle the two subsets under their split names.
small_dataset = dict(train=train_subset, test=test_subset)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Rebind `tokenizer` and `model`: the same checkpoint is now loaded as a
# sequence-classification model with three output labels.
model_name = "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the global ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(examples['text'], **tokenizer_options)

In [None]:
def preprocess_labels(example):
    """Replace the example's label name with its integer id.

    The id is looked up in the global ``label_mapping``; the example is
    modified in place and also returned.
    """
    label_name = example['label']
    example['label'] = label_mapping[label_name]
    return example

In [None]:
# Integer class id for each label name, numbered in the order listed.
label_mapping = {
    name: class_id
    for class_id, name in enumerate(("Positive", "Negative", "Notr"))
}

In [None]:
# Prepare both subsets for training: tokenize in batches, then convert each
# label name to its integer id.
encoded_train = train_subset.map(preprocess_function, batched=True).map(preprocess_labels)
encoded_test = test_subset.map(preprocess_function, batched=True).map(preprocess_labels)

# Expose only the listed columns, as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'label']
for encoded_split in (encoded_train, encoded_test):
    encoded_split.set_format(type='torch', columns=torch_columns)

In [None]:
# Inspect the first encoded training row.
encoded_train[0]

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 16

# Only the training loader shuffles its rows.
train_dataloader = DataLoader(dataset=encoded_train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=encoded_test, batch_size=batch_size, shuffle=False)

In [None]:
# `transformers.AdamW` is deprecated and missing from recent transformers
# releases; PyTorch's own implementation is used instead.
from torch.optim import AdamW
import torch
from tqdm import tqdm

# Optimizer.
# NOTE(review): eps and weight_decay are set explicitly to mirror what the
# transformers AdamW defaults are believed to have been (eps=1e-6,
# weight_decay=0.0) -- confirm; torch's own defaults differ.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 2
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    model.train()
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc="Training", leave=True)

    for batch in progress_bar:
        optimizer.zero_grad()

        # Move the batch to the model's device.
        inputs = {key: batch[key].to(device) for key in ['input_ids', 'attention_mask']}
        # Named `batch_labels` so the notebook-level `labels` list used by the
        # retrieval evaluation is not overwritten by a tensor.
        batch_labels = batch['label'].to(device)

        # Forward pass; passing labels makes the model return the loss.
        outputs = model(**inputs, labels=batch_labels)
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Show the current batch loss on the progress bar.
        progress_bar.set_postfix({"loss": loss.item()})

    print(f"Epoch {epoch + 1} tamamlandı. Ortalama Loss: {epoch_loss / len(train_dataloader):.4f}")

In [None]:
from sklearn.metrics import classification_report

# Evaluate the classifier on the test loader and print per-class metrics.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        inputs = {key: batch[key].to(device) for key in ['input_ids', 'attention_mask']}
        # Named `batch_labels` so the notebook-level `labels` list used by the
        # retrieval evaluation is not overwritten by a tensor.
        batch_labels = batch['label']

        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Class ids are passed explicitly: the test subset is small, and without
# `labels=` the report raises when a class is absent from both the targets and
# the predictions (the number of classes no longer matches `target_names`).
class_names = ['Positive', 'Negative', 'Notr']
print(classification_report(
    true_labels,
    predictions,
    labels=list(range(len(class_names))),
    target_names=class_names,
))

In [None]:
# Save the current `model` and `tokenizer` to the local directory
# "e5-turkish-base_small"; that directory is loaded again further down.
model.save_pretrained("e5-turkish-base_small")
tokenizer.save_pretrained("e5-turkish-base_small")

In [None]:
# From here on the retrieval evaluation loads the locally saved checkpoint
# directory instead of the Hub model.
original_model_name = "e5-turkish-base_small"

In [None]:
# Reload the tokenizer and the bare encoder (via `AutoModel`) from
# `original_model_name`, then move the encoder onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(original_model_name)
model = AutoModel.from_pretrained(original_model_name)
model = model.to(device)

In [None]:
# Rebuild the chunk list and the per-question labels from the retrieval dataset.
contexts = []
questions = []
labels = []
faiss_indices = []

current_index = 0
skipped_questions = 0
for example in dataset:
    # `ctx_split_points` is parsed as a bracketed, comma-separated string;
    # blank pieces are ignored so an empty list does not crash int().
    raw_points = example['ctx_split_points'].strip()[1:-1]
    split_points = [0] + [int(piece) for piece in raw_points.split(',') if piece.strip()]
    correct_intro_idx = example['correct_intro_idx']

    gold_index = None
    for idx in range(len(split_points) - 1):
        contexts.append(example['context'][split_points[idx]:split_points[idx + 1]])
        if idx == correct_intro_idx:
            gold_index = current_index
        current_index += 1

    # questions[i] and labels[i] must describe the same example: the search
    # step looks labels up by question position. An example whose
    # `correct_intro_idx` matches no chunk is therefore left out of both lists
    # (its chunks stay in `contexts` as distractors).
    if gold_index is None:
        skipped_questions += 1
        continue
    questions.append(example['question'])
    labels.append(gold_index)

if skipped_questions:
    print("Questions skipped (no matching chunk): ", skipped_questions)

In [None]:
# Embed every context chunk with the reloaded tokenizer/model pair.
context_embeddings = get_embeddings(texts=contexts, tokenizer=tokenizer, model=model)

In [None]:
# Embed every question with the reloaded tokenizer/model pair.
question_embeddings = get_embeddings(texts=questions, tokenizer=tokenizer, model=model)

In [None]:
# Build a second FAISS flat L2 index, sized to the embedding width and filled
# with the context embeddings computed just above.
embedding_dim = context_embeddings.shape[1]
faiss_new_index = faiss.IndexFlatL2(embedding_dim)
faiss_new_index.add(context_embeddings)

In [None]:
# Reset the per-question hit flags before the second evaluation.
retrieved_top_1, retrieved_top_5 = [], []

In [None]:
# Search the new index for the nearest chunks of every question.
# Labels are looked up by question position, so both must line up one-to-one.
if len(labels) != len(question_embeddings):
    raise ValueError(
        f"labels ({len(labels)}) and question embeddings "
        f"({len(question_embeddings)}) are not aligned"
    )

# Start from empty result lists so re-running this cell does not append to
# the flags left behind by a previous run.
retrieved_top_1, retrieved_top_5 = [], []

if len(question_embeddings) > 0:
    # One batched call returns the 5 nearest chunk ids for every question.
    # (The per-question debug print of the raw indices was dropped: it emitted
    # one line per question.)
    distances, indices = faiss_new_index.search(question_embeddings, k=5)
    for correct_idx, neighbour_ids in zip(labels, indices):
        retrieved_top_1.append(correct_idx in neighbour_ids[:1])
        retrieved_top_5.append(correct_idx in neighbour_ids[:5])

In [None]:
# Fraction of questions whose labelled chunk was retrieved: the hit flags are
# scored against an all-ones vector of the same length.
all_hits_top_1 = [1] * len(retrieved_top_1)
all_hits_top_5 = [1] * len(retrieved_top_5)
accuracy_top_1 = accuracy_score(retrieved_top_1, all_hits_top_1)
accuracy_top_5 = accuracy_score(retrieved_top_5, all_hits_top_5)

In [None]:
# Report the retrieval accuracy of the reloaded checkpoint, rounded to two decimals.
print(f"Fine-Tuned Model Top-1 Accuracy: {accuracy_top_1:.2f}")
print(f"Fine-Tuned Model Top-5 Accuracy: {accuracy_top_5:.2f}")