In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:0

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence encoder; the same model embeds the corpus here and the query later on.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Corpus source: the first 1,000 rows of the WikiQA training split.
dataset = load_dataset('microsoft/wiki_qa', split='train[:1000]')

# One answer sentence per row, kept in row order so that a position in
# `embeddings` refers to the same position in `contexts`.
contexts = [row['answer'] for row in dataset]
embeddings = model.encode(contexts)



In [None]:
# Print a few example question/answer pairs from the dataset.
# Only the first rows are shown; printing every row floods the notebook output.
for row_idx in range(min(5, len(dataset))):
  item = dataset[row_idx]
  print(item["question"], item["answer"])

In [None]:
import faiss

# Index the answer embeddings for nearest-neighbour search.
# normalize_L2 rescales every row of `embeddings` to unit length in place,
# so `embeddings` itself is modified by this cell.
faiss.normalize_L2(embeddings)

# Flat L2 index whose dimensionality matches the embedding width.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# Row i of the index corresponds to contexts[i].
index.add(np.array(embeddings))

In [None]:
# Embed the query and retrieve the k nearest answer sentences.
query = "how are glacier caves forme?"

# FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
query_embedding = np.ascontiguousarray(model.encode([query]), dtype=np.float32)

# The indexed vectors were L2-normalised when the index was built; normalise
# the query the same way so the returned distances are on a consistent scale.
# The nearest-neighbour ranking itself is unchanged by this.
faiss.normalize_L2(query_embedding)

k = 3
distances, indices = index.search(query_embedding, k)

# FAISS pads the result with id -1 when fewer than k vectors are available;
# skip those so that contexts[-1] is never returned by accident.
closest_docs = [contexts[i] for i in indices[0] if i != -1]


In [None]:
# Show the retrieved passages; a bare last expression is rendered as the cell output.
closest_docs

['Ice formations in the Titlis glacier cave',
 'A glacier cave is a cave formed within the ice of a glacier .',
 'A partly submerged glacier cave on Perito Moreno Glacier .']

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 tokenizer and seq2seq model used to generate the response.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Join the retrieved passages into one summarisation prompt.
# NOTE(review): `query` is not part of the prompt, so the model only
# summarises the retrieved passages -- confirm this is intended.
combined_docs = " ".join(closest_docs)
input_text = f"summarize: {combined_docs}"

# Tokenise with truncation so a long set of passages cannot produce an
# unbounded input, and keep the attention mask to pass along to generate().
model_inputs = tokenizer(
    input_text,
    return_tensors='pt',
    truncation=True,
    max_length=512,
)
input_ids = model_inputs['input_ids']

# Inference: beam search, capped at 50 generated tokens.
outputs = t5_model.generate(
    input_ids,
    attention_mask=model_inputs['attention_mask'],
    max_length=50,
    num_beams=4,
    early_stopping=True,
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Response:", response)

Generated Response: glacier cave formed within the ice of a glacier.
