<a href="https://colab.research.google.com/github/Abhijit8229/Semantic-Search-on-Twitter-API-Documentation/blob/main/Semantic_Search_on_Twitter_API_Documentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ============================================================
# FULL SEMANTIC SEARCH PIPELINE + CLI (SINGLE CELL FOR COLAB)
# ============================================================

!pip install sentence-transformers faiss-cpu gitpython > /dev/null

import os, glob, json, argparse, sys
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# ----------------------------
# STEP 1 — Clone Repo
# ----------------------------
# Git repository that is cloned locally and later scanned by load_docs().
REPO_URL = "https://github.com/xdevplatform/postman-twitter-api"

# Clone only when no "postman-twitter-api" path exists in the working
# directory, so re-running this cell does not attempt a second clone.
if not os.path.exists("postman-twitter-api"):
    # IPython shell escape: {REPO_URL} is interpolated from the Python variable
    # above; only git's stdout is redirected to /dev/null.
    !git clone {REPO_URL} > /dev/null

# ----------------------------
# STEP 2 — Load Documentation
# ----------------------------
def load_docs(repo_path="postman-twitter-api"):
    """Collect the text of every .md, .json and .txt file under ``repo_path``.

    The directory tree is walked recursively. Loading is best-effort: a file
    that cannot be opened or is not valid UTF-8 is skipped rather than
    aborting the whole load.

    Args:
        repo_path: Root directory to scan.

    Returns:
        A list of dicts, one per readable file, each with the keys
        ``"path"`` (the file path as returned by glob) and ``"text"``
        (the full decoded contents).
    """
    doc_suffixes = (".md", ".json", ".txt")
    docs = []
    for path in glob.glob(f"{repo_path}/**/*", recursive=True):
        if not (os.path.isfile(path) and path.endswith(doc_suffixes)):
            continue
        try:
            # The context manager closes the handle even if decoding fails.
            with open(path, "r", encoding="utf-8") as handle:
                text = handle.read()
        except (OSError, UnicodeDecodeError):
            # Only skip genuinely unreadable files; anything else
            # (e.g. KeyboardInterrupt) must still propagate.
            continue
        docs.append({"path": path, "text": text})
    return docs

# Read all matching files from the default repo path into memory.
docs = load_docs()

# ----------------------------
# STEP 3 — Chunking
# ----------------------------
def chunk_text(text, chunk_size=300):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens);
    each chunk is re-joined with single spaces. The final chunk may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Flatten every document into its word chunks, remembering which file each
# chunk came from so search results can cite their source.
all_chunks = [
    {"text": piece, "source": doc["path"]}
    for doc in docs
    for piece in chunk_text(doc["text"], chunk_size=300)
]

# Plain list of chunk strings, in the same order as all_chunks, for encoding.
chunk_texts = [entry["text"] for entry in all_chunks]

# ----------------------------
# STEP 4 — Embeddings
# ----------------------------
# Load the sentence encoder, then embed every chunk in one batched call
# (the progress bar is shown while encoding).
model = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2"
)
chunk_embeddings = model.encode(
    chunk_texts,
    show_progress_bar=True,
)

# ----------------------------
# STEP 5 — Build FAISS Index
# ----------------------------
# The embedding width (second axis) fixes the dimensionality of the index.
emb_dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(emb_dim)
# Vectors are cast to float32 before being added to the FAISS index.
index.add(np.array(chunk_embeddings).astype("float32"))

# Save index + chunks
faiss.write_index(index, "index.faiss")
# Use a context manager so chunks.json is flushed and closed before any
# later cell (or the search.py subprocess) tries to read it.
with open("chunks.json", "w", encoding="utf-8") as chunks_file:
    json.dump(all_chunks, chunks_file)


# ============================================================
# CLI IMPLEMENTATION
# ============================================================

def semantic_search(query, model, index, chunks, top_k=5):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Search text; encoded with ``model.encode([query])``.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding per input string.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair with one row per query vector.
        chunks: Sequence of dicts with ``"text"`` and ``"source"`` keys,
            positioned so that index id ``i`` refers to ``chunks[i]``.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of dicts with keys ``"rank"`` (1-based), ``"score"`` (the
        distance reported by the index, as a float), ``"chunk"`` and
        ``"source"``. It may hold fewer than ``top_k`` entries when the
        index contains fewer vectors than requested.
    """
    q_embed = model.encode([query])
    distances, indices = index.search(np.array(q_embed).astype("float32"), top_k)

    results = []
    for rank, idx in enumerate(indices[0]):
        # FAISS fills missing neighbours with id -1 when the index holds fewer
        # than top_k vectors. Without this guard chunks[-1] would silently
        # return the last chunk as a bogus hit.
        if idx < 0:
            continue
        hit = chunks[int(idx)]
        results.append({
            "rank": rank + 1,
            "score": float(distances[0][rank]),
            "chunk": hit["text"],
            "source": hit["source"]
        })
    return results


def cli():
    """Parse command-line arguments, run one semantic search, print JSON.

    Command-line arguments:
        --query: Required search text.
        --top_k: Optional integer, default 5; number of results requested.

    Loads the encoder model, reads "index.faiss" and "chunks.json" from the
    current working directory, then prints a JSON object with the keys
    ``"query"`` and ``"results"`` to stdout.
    """
    parser = argparse.ArgumentParser(description="Semantic Search CLI")
    parser.add_argument("--query", required=True, help="Search query")
    parser.add_argument("--top_k", type=int, default=5,
                        help="Number of results to return")

    args = parser.parse_args()

    # Load components
    print("Loading model...")
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

    print("Loading index...")
    index = faiss.read_index("index.faiss")

    print("Loading chunks...")
    # Context manager guarantees the file handle is closed after parsing.
    with open("chunks.json", encoding="utf-8") as chunks_file:
        chunks = json.load(chunks_file)

    # Search
    results = semantic_search(args.query, model, index, chunks, args.top_k)

    print(json.dumps({
        "query": args.query,
        "results": results
    }, indent=2))


# Entry point shared by both environments: inside Colab only a usage hint is
# printed, while a plain script invocation hands control to the CLI.
if __name__ == "__main__":
    running_in_colab = "google.colab" in sys.modules
    if not running_in_colab:
        cli()
    else:
        print("\n✔ Pipeline is ready!")
        print("Run CLI like this:\n")
        print("    !python search.py --query \"get tweets\" --top_k 5\n")


Batches:   0%|          | 0/7 [00:00<?, ?it/s]


✔ Pipeline is ready!
Run CLI like this:

    !python search.py --query "get tweets" --top_k 5



In [5]:
%%writefile search.py
from sentence_transformers import SentenceTransformer
import faiss, json, argparse, numpy as np

def semantic_search(query, model, index, chunks, top_k=5):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Search text; encoded with ``model.encode([query])``.
        model: Object exposing ``encode(list_of_str)``.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair with one row per query vector.
        chunks: Sequence of dicts with ``"text"`` and ``"source"`` keys,
            positioned so that index id ``i`` refers to ``chunks[i]``.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of dicts with keys ``"rank"`` (1-based), ``"score"`` (the
        distance reported by the index), ``"chunk"`` and ``"source"``.
        It may hold fewer than ``top_k`` entries when the index contains
        fewer vectors than requested.
    """
    q_embed = model.encode([query])
    distances, indices = index.search(np.array(q_embed).astype("float32"), top_k)
    return [
        {
            "rank": i + 1,
            "score": float(distances[0][i]),
            "chunk": chunks[int(idx)]["text"],
            "source": chunks[int(idx)]["source"]
        }
        for i, idx in enumerate(indices[0])
        # FAISS pads missing neighbours with id -1; skipping them keeps
        # chunks[-1] from being reported as a bogus hit.
        if idx >= 0
    ]

def main():
    """Parse ``--query`` / ``--top_k``, run the search, print results as JSON.

    Reads "index.faiss" and "chunks.json" from the current working
    directory and prints the list of result dicts to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--query", required=True)
    parser.add_argument("--top_k", type=int, default=5)
    args = parser.parse_args()

    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    index = faiss.read_index("index.faiss")
    # Context manager guarantees the file handle is closed after parsing.
    with open("chunks.json", encoding="utf-8") as chunks_file:
        chunks = json.load(chunks_file)

    results = semantic_search(args.query, model, index, chunks, args.top_k)
    print(json.dumps(results, indent=2))

# Run the search only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Writing search.py


In [6]:
!python search.py --query "get tweets" --top_k 5

2025-11-17 09:20:07.414087: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763371207.449100    5677 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763371207.460297    5677 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763371207.498227    5677 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763371207.498286    5677 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763371207.498293    5677 computation_placer.cc:177] computation placer alr