In [21]:
import numpy as np
import time
import math
import torch
from datasets import load_dataset, Dataset
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab') 
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel,
    PeftConfig
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Fedor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Number of rows handled per slice by the chunked loops in the cells below.
batch_size = 1000

# Question/answer pairs: "question-answer" configuration, "test" split.
dataset = load_dataset(
    path="rag-datasets/rag-mini-wikipedia",
    name="question-answer",
    split="test",
)

In [3]:
# Data-quality check: count questions/answers that are missing (NaN/None) or
# blank (empty / whitespace-only). The dataset is converted to pandas one
# `batch_size`-row chunk at a time.
empty_questions = 0
empty_answers = 0

# Show a small sample once instead of re-printing a preview for every chunk.
print(dataset[:5])

for i in range(0, len(dataset), batch_size):
    indices = range(i, min(i + batch_size, len(dataset)))
    chunk = dataset.select(indices)
    df_chunk = chunk.to_pandas()

    question_col = df_chunk["question"]
    answer_col = df_chunk["answer"]

    # isna() alone does not flag "" or whitespace-only strings, so those would
    # otherwise go uncounted even though the counters are named "empty".
    empty_questions += int((question_col.isna() | question_col.astype(str).str.strip().eq("")).sum())
    empty_answers += int((answer_col.isna() | answer_col.astype(str).str.strip().eq("")).sum())

print(empty_questions)
print(empty_answers)

{'question': ['Was Abraham Lincoln the sixteenth President of the United States?', 'Did Lincoln sign the National Banking Act of 1863?', 'Did his mother die of pneumonia?', "How many long was Lincoln's formal education?", 'When did Lincoln begin his political career?'], 'answer': ['yes', 'yes', 'no', '18 months', '1832'], 'id': [0, 2, 4, 6, 8]}
0
0


In [4]:
# Full-text method - BM25
def bm25_search(query, dataset, top_n=5, batch_size=1000):
    """Rank the dataset's questions against `query` with BM25.

    One BM25 index is fitted over every question, so all scores share the same
    IDF and average-document-length statistics and can be compared directly.
    (Fitting a separate index per chunk and merging the scores, as was done
    before, mixes scores computed from different corpus statistics.)

    Args:
        query: Free-text query; lower-cased and tokenized with `word_tokenize`.
        dataset: Object whose `dataset['question']` and `dataset['answer']`
            return equally long, index-aligned sequences of strings.
        top_n: Maximum number of results to return; values <= 0 yield [].
        batch_size: Accepted for backward compatibility only; it no longer
            influences how scores are computed.

    Returns:
        A list of at most `top_n` dicts with keys "score" (rounded to two
        decimals), "question" and "answer", ordered by descending score.
    """
    all_questions = dataset['question']
    all_answers = dataset['answer']

    # Nothing to rank; this also avoids fitting BM25 on an empty corpus.
    if top_n <= 0 or len(all_questions) == 0:
        return []

    tokenized_query = word_tokenize(query.lower())
    tokenized_passages = [word_tokenize(q.lower()) for q in all_questions]

    bm25 = BM25Okapi(tokenized_passages)
    scores = bm25.get_scores(tokenized_query)

    # sorted() is stable, so equal scores keep their dataset order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:top_n]

    return [
        {
            # float() so the score is a plain Python float rather than a NumPy scalar.
            "score": round(float(score), 2),
            "question": all_questions[idx],
            "answer": all_answers[idx],
        }
        for idx, score in ranked
    ]

In [5]:
# Vector method - Sentence-BERT
bert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Read the question column once; indexing `dataset['question']` inside the
# loop would fetch the whole column again on every iteration.
corpus_questions = list(dataset['question'])

# Per-slice embedding arrays, stacked into a single matrix afterwards so that
# `embeddings` is only ever bound to the final array.
embedding_batches = []

for i in range(0, len(corpus_questions), batch_size):
    batch_questions = corpus_questions[i:i + batch_size]
    batch_embeddings = bert_model.encode(batch_questions, batch_size=64)
    embedding_batches.append(batch_embeddings)

# Row k of `embeddings` is the embedding of dataset row k.
embeddings = np.vstack(embedding_batches)

def bert_search(query, dataset, embeddings, top_n=5):
    """Rank the dataset's questions by cosine similarity to `query`.

    The query is encoded with the module-level `bert_model` and compared
    against the pre-computed question embeddings.

    Args:
        query: Free-text query.
        dataset: Object whose `dataset['question']` and `dataset['answer']`
            return index-aligned sequences.
        embeddings: 2-D array of question embeddings whose row order matches
            the dataset rows (row k is looked up as dataset row k).
        top_n: Maximum number of results to return; values <= 0 yield [].

    Returns:
        A list of at most `top_n` dicts with keys "score" (cosine similarity
        rounded to two decimals), "question" and "answer", ordered by
        descending similarity.
    """
    # A negative top_n would otherwise slice "all but the last n" results.
    if top_n <= 0:
        return []

    query_embedding = bert_model.encode([query])
    scores = cosine_similarity(query_embedding, embeddings)[0]
    top_indices = np.argsort(scores)[::-1][:top_n]

    # Fetch each column once instead of once per returned result.
    all_questions = dataset['question']
    all_answers = dataset['answer']

    results = []
    for idx in top_indices:
        idx = int(idx)
        results.append({
            # Convert to a Python float before rounding: round() on a NumPy
            # float32 stays float32 and prints as e.g. 0.8500000238418579.
            "score": round(float(scores[idx]), 2),
            "question": all_questions[idx],
            "answer": all_answers[idx]
        })

    return results

In [6]:
# Hybrid method - BERT + BM25
def normalize_scores(score_dict):
    """Min-max normalize the values of a mapping into the range [0, 1].

    Args:
        score_dict: Mapping from key to numeric score.

    Returns:
        A new dict with the same keys. The smallest score maps to 0.0 and the
        largest to 1.0. If every score is identical, all values are 0.0.
        An empty mapping yields an empty dict.
    """
    # min()/max() raise ValueError on an empty sequence.
    if not score_dict:
        return {}

    values = score_dict.values()
    lowest = min(values)
    highest = max(values)

    # Avoid dividing by a zero range when all scores are equal.
    if highest == lowest:
        return {key: 0.0 for key in score_dict}

    span = highest - lowest
    return {key: (value - lowest) / span for key, value in score_dict.items()}

def hybrid_search(query, dataset, bert_embeddings, top_n=5, alpha=0.6):
    """Combine BM25 and Sentence-BERT rankings into one weighted ranking.

    Candidates are fetched from `bm25_search` and `bert_search`, each
    retriever's scores are min-max normalized, and the two are blended as
    `alpha * bm25 + (1 - alpha) * bert`. A question returned by only one
    retriever gets 0.0 for the other retriever's component.

    Args:
        query: Free-text query.
        dataset: Dataset passed through to both retrievers.
        bert_embeddings: Pre-computed question embeddings for `bert_search`.
        top_n: Maximum number of results to return; values <= 0 yield [].
        alpha: Weight of the normalized BM25 score; the BERT score is
            weighted by `1 - alpha`.

    Returns:
        A list of at most `top_n` dicts with keys "score" (rounded to four
        decimals), "question" and "answer", ordered by descending score.
    """
    if top_n <= 0:
        return []

    # Keep the historical pool of 100 candidates per retriever, but grow it
    # when the caller asks for more results than that pool could provide.
    pool_size = max(top_n, 100)
    bm25_results = bm25_search(query, dataset, top_n=pool_size)
    bert_results = bert_search(query, dataset, bert_embeddings, top_n=pool_size)

    # Plain dicts preserve insertion order (BM25 hits first, then BERT-only
    # hits), so ties in the final sort are resolved the same way on every run.
    # Both retrievers return results best-first, so setdefault() keeps the
    # highest score (and first answer) when a question text occurs twice.
    bm25_scores = {}
    bert_scores = {}
    answers = {}
    for res in bm25_results:
        bm25_scores.setdefault(res['question'], res['score'])
        answers.setdefault(res['question'], res['answer'])
    for res in bert_results:
        bert_scores.setdefault(res['question'], res['score'])
        answers.setdefault(res['question'], res['answer'])

    if not answers:
        return []

    # Only normalize non-empty score sets; an empty one contributes nothing.
    bm25_scores_norm = normalize_scores(bm25_scores) if bm25_scores else {}
    bert_scores_norm = normalize_scores(bert_scores) if bert_scores else {}

    hybrid_results = []
    for question, answer in answers.items():
        bm_score = bm25_scores_norm.get(question, 0.0)
        bert_score = bert_scores_norm.get(question, 0.0)
        hybrid_results.append({
            "score": round(alpha * bm_score + (1 - alpha) * bert_score, 4),
            "question": question,
            "answer": answer
        })

    hybrid_results.sort(key=lambda item: -item["score"])
    return hybrid_results[:top_n]

In [7]:
# Test
if __name__ == "__main__":
    # Manual smoke test: run one retriever on a sample query and print its hits.
    query = "Who was Abraham Lincoln?"
    # Exactly one of the three retrievers is active; swap the commented lines
    # to try the other two.
    # results = bm25_search(query, dataset, top_n=5)
    results = bert_search(query, dataset, embeddings, top_n=5)
    # results = hybrid_search(query, dataset, bert_embeddings=embeddings, top_n=5)

    print(f"\nTop results for query: '{query}'")
    # One line per hit: "[score] Q: <question> → A: <answer>".
    for res in results:
        print(f"[{res['score']}] Q: {res['question']} → A: {res['answer']}")


Top results for query: 'Who was Abraham Lincoln?'
[0.8500000238418579] Q: Was Abraham Lincoln the first President of the United States? → A: No
[0.8100000023841858] Q: Was Abraham Lincoln the sixteenth President of the United States? → A: yes
[0.7900000214576721] Q: Did Abraham Lincoln live in the Frontier? → A: Yes
[0.7900000214576721] Q: When did Lincoln first serve as President? → A: March 4, 1861
[0.7799999713897705] Q: Who assassinated Lincoln? → A: John Wilkes Booth


In [8]:
# LLM
model_name = "google/flan-t5-base"

# LoRA settings: rank-8 adapters on the modules named "q" and "v".
config = LoraConfig(
    lora_dropout=0.01,
    target_modules=["q", "v"],
    lora_alpha=32,
    r=8,
    task_type="SEQ_2_SEQ_LM",
)

tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the base checkpoint and wrap it with the LoRA adapters in one step.
model = get_peft_model(
    T5ForConditionalGeneration.from_pretrained(model_name),
    config,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


In [9]:
def convert_batch(batch):
    """Turn a batch of question/answer columns into instruction-tuning columns.

    Args:
        batch: Mapping with "question" and "answer" sequences.

    Returns:
        A dict with "instruction" (the fixed prompt repeated once per
        question), "input" (the questions) and "output" (the answers).
    """
    questions = batch["question"]
    instructions = ["Answer the question." for _ in range(len(questions))]
    converted = {"instruction": instructions}
    converted["input"] = questions
    converted["output"] = batch["answer"]
    return converted

# Apply convert_batch in batches of 32 rows to add the "instruction", "input"
# and "output" columns. This rebinds `dataset`, so every later cell sees the
# mapped dataset rather than the freshly loaded one.
dataset = dataset.map(convert_batch, batched=True, batch_size=32)

def preprocess(example):
    """Tokenize a batch of instruction/input/output rows for seq2seq training.

    Uses the module-level `tokenizer`.

    Args:
        example: Batch mapping with "instruction", "input" and "output"
            sequences of strings (index-aligned).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 128 tokens)
        with an added "labels" entry: the answer token ids padded/truncated
        to 32 tokens, where padding positions are set to -100.
    """
    input_texts = [f"{instr} {inp}" for instr, inp in zip(example['instruction'], example['input'])]
    model_inputs = tokenizer(input_texts, truncation=True, padding="max_length", max_length=128)

    labels = tokenizer(text_target=example["output"], truncation=True, padding="max_length", max_length=32)

    # Padding must not count towards the loss: -100 is the index the loss
    # ignores. This matches the masking applied to labels in
    # search_and_evaluate, so training and evaluation score the same tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize in batches of 32 rows. All original columns are dropped, so only
# the fields returned by preprocess remain in `tokenized_dataset`.
tokenized_dataset = dataset.map(preprocess, batched=True, batch_size=32, remove_columns=dataset.column_names)


In [10]:
# training_args = TrainingArguments(
#     output_dir="./flan-t5-lora-rag",
#     learning_rate=5e-5,
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     logging_steps=10,
#     report_to="none"
# )

# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator
# )

# trainer.train()

In [11]:
# model.save_pretrained("flan-t5-lora-trained")
# tokenizer.save_pretrained("flan-t5-lora-trained")

In [12]:
# Local directory holding the fine-tuned LoRA adapter and tokenizer files
# (presumably produced by the commented-out save_pretrained calls above).
adapter_dir = "flan-t5-lora-trained"

# Resolve the base checkpoint from the adapter's own config instead of
# loading "base" weights from the adapter directory: that directory is a PEFT
# adapter (PeftModel.from_pretrained below requires it to be), so the base
# model should be loaded explicitly and wrapped with the adapter exactly once.
base_model_name = PeftConfig.from_pretrained(adapter_dir).base_model_name_or_path
model = T5ForConditionalGeneration.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(model, adapter_dir)
tokenizer = T5Tokenizer.from_pretrained(adapter_dir)

# Switch to inference mode; as the last expression this also displays the model.
model.eval()



PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

In [13]:
# Smoke test: generate an answer for a single hand-written prompt.
# NOTE(review): the training prompts built in preprocess look like
# "Answer the question. <question>" (period, no colon), whereas this prompt
# uses "Answer the question: ..." — confirm whether the mismatch is intended.
input_text = "Answer the question: Who was Abraham Lincoln?"
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
# Generate at most 50 new tokens for the prompt.
output = model.generate(**inputs, max_new_tokens=50)
# Decode the first (only) generated sequence without special tokens.
print(tokenizer.decode(output[0], skip_special_tokens=True))

a lawyer


In [14]:
# Perplexity
def calculate_perplexity(model, tokenizer, input_text, target_text):
    """Return exp(loss) of `target_text` given `input_text` under `model`.

    Args:
        model: Callable taking `input_ids` and `labels` keyword arguments and
            returning an object with a `.loss` tensor.
        tokenizer: Callable returning an object with `.input_ids` when called
            with `return_tensors="pt"` and `truncation=True`.
        input_text: Prompt fed to the model.
        target_text: Reference text used as labels.

    Returns:
        The exponential of the model's loss, as a Python float.
    """
    encoded_prompt = tokenizer(input_text, return_tensors="pt", truncation=True)
    encoded_target = tokenizer(target_text, return_tensors="pt", truncation=True)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        nll = model(
            input_ids=encoded_prompt.input_ids,
            labels=encoded_target.input_ids,
        ).loss

    return torch.exp(nll).item()

In [23]:
def _answer_perplexity(qa_pairs, model, tokenizer):
    """Token-weighted perplexity of the reference answers given their questions.

    Each pair is scored separately; the per-pair mean loss is weighted by the
    number of non-padding label tokens before averaging. Returns NaN when no
    label tokens were scored (e.g. `qa_pairs` is empty).
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for sample in qa_pairs:
        input_text = f"Answer the question: {sample['question']}"
        answer_text = sample['answer']

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        targets = tokenizer(text_target=[answer_text], return_tensors="pt", truncation=True, padding="max_length", max_length=32)

        labels = targets["input_ids"]
        # -100 marks positions the loss must ignore (here: padding).
        labels[labels == tokenizer.pad_token_id] = -100

        num_tokens = (labels != -100).sum().item()
        if num_tokens == 0:
            # Nothing to score for this pair; skip it rather than feed an
            # all-ignored label tensor to the model.
            continue

        with torch.no_grad():
            outputs = model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                labels=labels,
            )

        total_loss += outputs.loss.item() * num_tokens
        total_tokens += num_tokens

    # Guard the division: with no scored tokens the perplexity is undefined.
    if total_tokens == 0:
        return float("nan")
    return math.exp(total_loss / total_tokens)


def search_and_evaluate(query, dataset, embeddings, model, tokenizer, method="bm25", top_n=5):
    """Run one retrieval method, time it, and score the retrieved answers.

    Args:
        query: Free-text query passed to the retriever.
        dataset: Dataset passed to the retriever.
        embeddings: Pre-computed question embeddings (used by "bert" and "hybrid").
        model: Seq2seq model used to compute the loss of each retrieved answer.
        tokenizer: Tokenizer matching `model`.
        method: One of "bm25", "bert" or "hybrid".
        top_n: Number of results requested from the retriever.

    Returns:
        A dict with "results" (the retriever's output), "duration_sec"
        (retrieval time only, rounded to four decimals) and "metrics", which
        holds "perplexity" (rounded to four decimals; NaN if nothing could be
        scored).

    Raises:
        ValueError: If `method` is not one of the supported names.
    """
    # perf_counter() is a monotonic clock, unlike time.time(), so the measured
    # interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()

    if method == "bm25":
        results = bm25_search(query, dataset, top_n=top_n)
    elif method == "bert":
        results = bert_search(query, dataset, embeddings, top_n=top_n)
    elif method == "hybrid":
        results = hybrid_search(query, dataset, embeddings, top_n=top_n)
    else:
        raise ValueError("Invalid method: choose from 'bm25', 'bert', or 'hybrid'")

    duration = time.perf_counter() - start_time

    # Only the question/answer fields are needed for scoring; a plain list is
    # enough, no Dataset round-trip required.
    qa_subset = [{"question": res["question"], "answer": res["answer"]} for res in results]
    perplexity = _answer_perplexity(qa_subset, model, tokenizer)

    return {
        "results": results,
        "duration_sec": round(duration, 4),
        "metrics": {
            "perplexity": round(perplexity, 4)
        }
    }

# Compare the three retrievers on the same query: timing, perplexity, top hits.
for method in ("bm25", "bert", "hybrid"):
    print(f"Method: {method.upper()}")

    output = search_and_evaluate(
        "Who was Abraham Lincoln?",
        dataset,
        embeddings,
        model,
        tokenizer,
        method=method,
        top_n=5,
    )

    print(f"Time: {output['duration_sec']} сек")
    print(f"Perplexity: {output['metrics']['perplexity']}")
    for res in output["results"]:
        print(f"  [{res['score']}] Q: {res['question']} → A: {res['answer']}")


Method: BM25
Time: 0.1147 сек
Perplexity: 32.8526
  [11.93] Q: Was Abraham Lincoln the sixteenth President of the United States? → A: yes
  [11.93] Q: Was Abraham Lincoln the first President of the United States? → A: No
  [11.82] Q: Did Abraham Lincoln live in the Frontier? → A: Yes
  [10.8] Q: Who assassinated Lincoln? → A: John Wilkes Booth
  [9.14] Q: Who suggested Lincoln grow a beard? → A: 11-year-old Grace Bedell
Method: BERT
Time: 0.0332 сек
Perplexity: 14.7151
  [0.8500000238418579] Q: Was Abraham Lincoln the first President of the United States? → A: No
  [0.8100000023841858] Q: Was Abraham Lincoln the sixteenth President of the United States? → A: yes
  [0.7900000214576721] Q: Did Abraham Lincoln live in the Frontier? → A: Yes
  [0.7900000214576721] Q: When did Lincoln first serve as President? → A: March 4, 1861
  [0.7799999713897705] Q: Who assassinated Lincoln? → A: John Wilkes Booth
Method: HYBRID
Time: 0.4056 сек
Perplexity: 20.9306
  [1.0] Q: Was Abraham Lincoln the fi