In [1]:
import re
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Use the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face model id of the seq2seq generator.
model_name = "google/flan-t5-base"

# Sentence-embedding model used to encode both the corpus chunks and the queries.
retriever_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Tokenizer and generator are loaded from the same model id; the generator is
# then moved onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

def get_clean_micro_chunks(file_path, size=150, overlap=50, min_words=50):
    """Read a text file, strip citation noise, and cut it into overlapping word windows.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks. The window
            advances by ``size - overlap`` words each step.
        min_words: A chunk is kept only if it contains MORE than this many
            words, which drops short tail windows.

    Returns:
        list[str]: Chunks in document order, each with its words re-joined by
        single spaces.

    Raises:
        ValueError: If ``size`` is not positive, or ``overlap`` is outside
            ``[0, size)``. Without this check an ``overlap`` larger than
            ``size`` silently produced an empty list and an equal one failed
            with an unrelated ``range()`` error.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive number of words, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Remove numeric citation markers such as "[12]".
    text = re.sub(r'\[\d+\]', '', text)
    # Remove parenthetical asides. "." does not match a newline here, so only
    # parentheses that open and close on the same line are removed.
    text = re.sub(r'\(.*?\)', '', text)

    words = text.split()
    step = size - overlap

    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + size]
        # The window's length IS the chunk's word count; no need to re-split
        # the joined string to measure it.
        if len(window) > min_words:
            chunks.append(" ".join(window))

    return chunks

def answer_question_gpu(query, chunks, chunk_embeddings, threshold=0.40):
    """Answer ``query`` from the single best-matching chunk, or return a fixed fallback.

    The query is embedded with the module-level ``retriever_model``, scored
    against every row of ``chunk_embeddings`` with a dot product, and the
    highest-scoring chunk is placed in a prompt for the module-level
    ``tokenizer`` / ``model`` pair.

    Args:
        query: Question text.
        chunks: Sequence of chunk texts; ``chunks[i]`` must correspond to row
            ``i`` of ``chunk_embeddings``.
        chunk_embeddings: 2-D tensor, NumPy array or nested list with one row
            per chunk.
        threshold: Minimum best score required to attempt an answer.
            NOTE(review): the dot product only equals cosine similarity when
            the embeddings are unit-length -- presumably true for this
            retriever, confirm before changing models.

    Returns:
        str: The generated answer, or the fallback sentence when there are no
        chunks, the best score is below ``threshold``, or the generated text
        has five words or fewer.

    Raises:
        ValueError: If the number of embedding rows differs from
            ``len(chunks)`` (the two were built from different chunk lists).
    """
    no_info = "Not enough information in the Simple Wikipedia dataset."
    min_answer_words = 5

    # An empty corpus has nothing to retrieve; argmax over an empty score
    # vector would raise instead of producing the fallback.
    if len(chunks) == 0:
        return no_info

    query_vec = retriever_model.encode([query], convert_to_tensor=True)

    # Accept arrays/lists as well as tensors, and align device and dtype with
    # the query embedding so the matrix product below cannot fail on a
    # CPU/GPU or dtype mismatch. A tensor that already matches is returned
    # as-is, without a copy.
    chunk_embeddings = torch.as_tensor(
        chunk_embeddings, dtype=query_vec.dtype, device=query_vec.device
    )

    # Guard against stale notebook state: if `chunks` was rebuilt without
    # re-encoding, indices into the two would no longer line up.
    if chunk_embeddings.shape[0] != len(chunks):
        raise ValueError(
            f"chunk_embeddings has {chunk_embeddings.shape[0]} rows but "
            f"chunks has {len(chunks)} items; re-encode the chunks."
        )

    scores = torch.mm(query_vec, chunk_embeddings.T).flatten()
    best_idx = torch.argmax(scores).item()

    if scores[best_idx].item() < threshold:
        return no_info

    context = chunks[best_idx]
    prompt = (
        f"Context: {context}\n\n"
        f"Task: Explain '{query}' using the context in two full, simple sentences. "
        "If the context is unrelated, say 'Not enough information in the Simple Wikipedia dataset.'\n"
        "Answer:"
    )

    # Prompts longer than 1024 tokens are truncated. The inputs are sent to
    # whichever device the generator currently lives on.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Very short generations are treated as non-answers.
    if len(answer.split()) <= min_answer_words:
        return no_info

    return answer

# Build the retrieval corpus: chunk the combined text file, then embed every
# chunk with the retriever (returned as a tensor, with a progress bar).
source_file = "/kaggle/input/datasets/ffatty/plain-text-wikipedia-simpleenglish/AllCombined.txt"
chunks = get_clean_micro_chunks(file_path=source_file)
chunk_embeddings = retriever_model.encode(
    chunks,
    show_progress_bar=True,
    convert_to_tensor=True,
)

# First end-to-end query against the freshly built index.
query = "What is the universe?"
response = answer_question_gpu(
    query=query,
    chunks=chunks,
    chunk_embeddings=chunk_embeddings,
)
print(response)

2026-02-10 20:39:52.107664: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770755992.345716      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770755992.416592      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770755993.014603      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770755993.014655      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770755993.014659      55 computation_placer.cc:177] computation placer alr

Running on: cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Batches:   0%|          | 0/8995 [00:00<?, ?it/s]

The most common term for "universe" among the ancient Greek philosophers


In [29]:
# Spot-check of the single-chunk pipeline with a definition-style question.
print(answer_question_gpu(query="what is a stream", chunks=chunks, chunk_embeddings=chunk_embeddings))

A stream is a natural flow of water moving across land between banks


In [34]:
# Spot-check of the single-chunk pipeline: photosynthesis.
print(answer_question_gpu(query="what is photosynthesis", chunks=chunks, chunk_embeddings=chunk_embeddings))

Oxygen is produced as a result of photosynthesis


In [35]:
# Spot-check of the single-chunk pipeline: brain.
print(answer_question_gpu(query="what is brain", chunks=chunks, chunk_embeddings=chunk_embeddings))

The brain is the part of the body which lets living beings think.


In [37]:
# Spot-check of the single-chunk pipeline: grenade.
print(answer_question_gpu(query="what is grenade", chunks=chunks, chunk_embeddings=chunk_embeddings))

A hand grenade is a small explosive device


In [41]:
# Spot-check of the single-chunk pipeline: cyanide.
print(answer_question_gpu(query="what is cyanide", chunks=chunks, chunk_embeddings=chunk_embeddings))

Hydrocyanide is a gas and kills by inhalation.


In [69]:
# Spot-check of the single-chunk pipeline: hydrogen bomb.
print(answer_question_gpu(query="what is a hydrogen bomb", chunks=chunks, chunk_embeddings=chunk_embeddings))

Not enough information in the Simple Wikipedia dataset.


In [77]:
import os

import torch
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Turn off parallelism inside the Hugging Face tokenizers library for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = "cuda" if torch.cuda.is_available() else "cpu"

data_path = "/kaggle/input/datasets/ffatty/plain-text-wikipedia-simpleenglish/AllCombined.txt"
# Decode explicitly as UTF-8 -- the same encoding get_clean_micro_chunks uses
# for this file -- instead of relying on the platform's default encoding.
loader = TextLoader(data_path, encoding="utf-8")
documents = loader.load()


In [None]:

# Split the loaded documents into pieces of at most 600 characters with 100
# characters of overlap.
# The result is named `doc_chunks` (not `chunks`) on purpose: `chunks` holds
# the plain-string chunks that answer_question_gpu is called with, and its
# rows must stay aligned with `chunk_embeddings`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
doc_chunks = text_splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': device}
)

# Vector index over the split documents; each query retrieves the top 3 pieces.
vector_db = FAISS.from_documents(doc_chunks, embeddings)
retriever = vector_db.as_retriever(search_kwargs={"k": 3})

# This pipeline's generator gets its own names (`rag_tokenizer`, `rag_model`)
# so the `tokenizer` / `model` globals read by answer_question_gpu are not
# silently replaced by the bfloat16 copy loaded here.
model_id = "google/flan-t5-base"
rag_tokenizer = AutoTokenizer.from_pretrained(model_id)
rag_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16
).to(device)

# NOTE(review): do_sample=True combined with num_beams=5 samples during beam
# search, so answers can differ between runs unless a seed is fixed.
generation_pipeline = pipeline(
    "text2text-generation",
    model=rag_model,
    tokenizer=rag_tokenizer,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.3,
    repetition_penalty=2.5,
    num_beams=5
)

llm = HuggingFacePipeline(pipeline=generation_pipeline)

template = """
Answer the question based ONLY on the context provided. 
Context: {context}

Question: {question}
Detailed Answer:"""

# A plain-text LLM needs a plain string prompt. A chat prompt template is
# rendered as a chat transcript when handed to a non-chat model, which would
# prepend "Human: " to the text the generator sees.
prompt_template = PromptTemplate.from_template(template)

def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        str: All page contents joined with two newlines; empty string when
        ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

In [None]:
# Chain layout: the incoming question is fanned out into the retrieved,
# formatted context and the untouched question; both fill the prompt, the
# prompt goes to the generator, and the result is parsed to a plain string.
rag_chain = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
} | prompt_template | llm | StrOutputParser()

user_query = "What is a galaxy?"
response = rag_chain.invoke(user_query)

# One print call, newline-separated: query header, a 30-dash rule, the answer.
print(f"\nQUERY: {user_query}", "-" * 30, response, sep="\n")

In [66]:
# Spot-check of the LangChain RAG chain: hydrogen bomb usage.
print(rag_chain.invoke(input="what is a hydrogen bomb used for"))

detonate a casing made of natural uranium


In [70]:
# Spot-check of the LangChain RAG chain: photosynthesis.
print(rag_chain.invoke(input="what is photosynthesis"))

Photosynthesis is vital for life on Earth. Before photosynthesis, Earth had no free oxygen in its atmosphere. Green plants build themselves using photosynthesis. Algae, protists and some bacteria also use it. Some exceptions are organisms that directly get their energy from chemical reactions; these organisms are called chemoautotrophs.


In [75]:
# Spot-check of the LangChain RAG chain with an open-ended, opinion-style question.
print(rag_chain.invoke(input="why is my laptop so bad"))

Computers can become obsolete quickly, depending on what programs the user runs. Very often, they are thrown away within two or three years, because some newer programs require a more powerful computer. This makes the problem worse, so computer recycling happens a lot. Many projects try to send working computers to developing nations so they can be re-used and will not become waste as quickly, as most people do not need to run new programs. Some computer parts, such as hard drives, can break easily. When these parts end up in the landfill, they can put poisonous chemicals like lead into the emulator. Memory (both RAM and hard drive space) may be another factor to help it run smoothly and more accurately. With the


In [63]:
# Spot-check of the LangChain RAG chain with a leading question.
print(rag_chain.invoke(input="why is cocaine the best drug"))

Because it is a stimulant, cocaine gives people energy. It also makes people feel very happy when it is taken.
