In [7]:
# Install dependencies (fast in Colab)
!pip install llama-index faiss-cpu sentence-transformers tqdm llama-index-vector-stores-faiss llama-index-embeddings-huggingface

Collecting llama-index-vector-stores-faiss
  Downloading llama_index_vector_stores_faiss-0.5.1-py3-none-any.whl.metadata (377 bytes)
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl.metadata (458 bytes)
Downloading llama_index_vector_stores_faiss-0.5.1-py3-none-any.whl (7.6 kB)
Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl (8.9 kB)
Installing collected packages: llama-index-vector-stores-faiss, llama-index-embeddings-huggingface
Successfully installed llama-index-embeddings-huggingface-0.6.1 llama-index-vector-stores-faiss-0.5.1


In [1]:
# Attach Google Drive to the Colab filesystem so the document and index
# folders used by the later cells are reachable under the mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
# Project folders on the mounted Drive: source documents in, persisted index out.
project_root = "/content/drive/MyDrive/chemtutor"
docs_dir = f"{project_root}/docs"
persist_dir = f"{project_root}/faiss_index"


In [3]:
# List files in the documents directory as a quick check that the expected
# source material is visible before indexing.
# IPython substitutes {docs_dir} with the value of the Python variable; the
# surrounding quotes keep the path intact if it ever contains spaces.
!ls "{docs_dir}"

Chemistry2e-WEB.pdf


In [10]:
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SimpleNodeParser
import faiss, os

# Tunable settings for the indexing pipeline, named here so they are easy to find.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

os.makedirs(persist_dir, exist_ok=True)

# Load docs
print("📥 Loading documents...")
documents = SimpleDirectoryReader(docs_dir).load_data()

# Split docs into chunks
print("🔍 Splitting documents...")
parser = SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
nodes = parser.get_nodes_from_documents(documents)

# Fail fast: building and persisting with no chunks would write an empty index
# into persist_dir, replacing any previously saved one.
if not nodes:
    raise ValueError(f"No chunks were produced from documents in {docs_dir!r}; nothing to index.")

print(f"🧠 Embedding {len(nodes)} chunks...")
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

# Create FAISS index (no faiss_index_path anymore).
# The index dimension must equal the embedding length, so measure it from the
# model instead of hard-coding it; changing EMBED_MODEL_NAME then stays safe.
dim = len(embed_model.get_text_embedding("dimension probe"))
faiss_index = faiss.IndexFlatL2(dim)
vector_store = FaissVectorStore(faiss_index=faiss_index)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build + save index
index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context)
index.storage_context.persist(persist_dir=persist_dir)

print("✅ FAISS index built and saved at:", persist_dir)


📥 Loading documents...
🔍 Splitting documents...
🧠 Embedding 1927 chunks...
✅ FAISS index built and saved at: /content/drive/MyDrive/chemtutor/faiss_index
