In [1]:
import json

# Load the scraped forum threads; the loop below reads a "title" and a list of
# "posts" from each entry.
with open("tds_discourse_posts.json", encoding="utf-8") as posts_file:
    discourse_data = json.load(posts_file)

# One chunk per non-blank post, labelled with the title of its thread.
# Whitespace is trimmed first, so posts that are empty after stripping are dropped.
discourse_chunks = [
    {"source": f"Discourse | {thread['title']}", "text": body}
    for thread in discourse_data
    for body in (raw_post.strip() for raw_post in thread["posts"])
    if body
]


In [2]:
import os

# Folder holding the scraped course pages (your markdown folder).
course_dir = "scraped_tds"

# Build one chunk per markdown file, labelled with the file name.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of course_chunks reproducible across runs and machines.
course_chunks = []
for filename in sorted(os.listdir(course_dir)):
    if not filename.endswith(".md"):
        continue  # ignore anything that is not a markdown page
    with open(os.path.join(course_dir, filename), "r", encoding="utf-8") as f:
        text = f.read()
    # The file is already closed here; only the text is kept.
    course_chunks.append({
        "source": f"Course | {filename}",
        "text": text
    })


In [12]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Embedding model: a free sentence-transformers model from Hugging Face that
# is used locally.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Text splitter configured with chunk size 500 and overlap 100.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

# Wrap every chunk (course pages first, then forum posts) in a Document,
# carrying its origin label along as metadata.
docs = []
for chunk in [*course_chunks, *discourse_chunks]:
    docs.append(
        Document(
            page_content=chunk["text"],
            metadata={"source": chunk["source"]},
        )
    )

# Split the documents, build a FAISS index from the pieces, and save the index
# under "tds_vector_db".
split_docs = splitter.split_documents(docs)
vectorstore = FAISS.from_documents(split_docs, embeddings)
vectorstore.save_local("tds_vector_db")

