<a href="https://colab.research.google.com/github/A-ManiMekhala/Semantic-Search-on-Twitter-API-Documentation/blob/main/Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/xdevplatform/postman-twitter-api


Cloning into 'postman-twitter-api'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 65 (delta 9), reused 0 (delta 0), pack-reused 53 (from 1)[K
Receiving objects: 100% (65/65), 125.58 KiB | 4.33 MiB/s, done.
Resolving deltas: 100% (31/31), done.


In [None]:
!pip install sentence-transformers



In [None]:
!pip install transformers torch numpy pandas scikit-learn faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [None]:
# Install HTTP and progress-bar helpers ("tqd5" was a typo for "tqdm").
!pip install requests tqdm



In [None]:
import argparse
import json
import os
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

def load_and_chunk_docs(repo_path="postman-twitter-api"):
    """Read every Markdown file under ``repo_path`` and split it into chunks.

    Each file is split on headings of level 2 or deeper (``##``, ``###``, ...).
    Any text before the first such heading becomes one "Document Start" chunk;
    the body under each heading is further split on blank lines so that every
    non-empty paragraph becomes its own chunk, prefixed with its heading.

    Args:
        repo_path: Directory that is walked recursively for ``*.md`` files.

    Returns:
        A ``(chunks, metadata)`` tuple of two parallel lists. ``chunks[i]`` is
        the chunk text and ``metadata[i]`` is a dict with ``"source"`` (file
        path) and ``"heading"``; heading chunks also carry ``"chunk_index"``
        (the paragraph's position within its section).
    """
    chunks = []
    metadata = []

    # Collect Markdown files in a sorted, reproducible order: os.walk yields
    # entries in arbitrary filesystem order, which would otherwise make chunk
    # positions (and therefore index ids) differ between runs.
    markdown_files = []
    for root, dirs, files in os.walk(repo_path):
        dirs.sort()  # in-place sort steers the order os.walk descends in
        for name in sorted(files):
            if name.endswith('.md'):
                markdown_files.append(os.path.join(root, name))

    # Anchor on line starts (MULTILINE) instead of a preceding "\n" so that a
    # heading on the very first line of a file is recognised too. Only spaces
    # and tabs may follow the hashes, so a bare "##" line is not a heading.
    heading_pattern = re.compile(r'^(#{2,})[ \t]+', re.MULTILINE)

    for file_path in markdown_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # With one capture group, split() returns
        # [preamble, hashes, section, hashes, section, ...].
        sections = heading_pattern.split(content)

        preamble = sections[0].strip()
        if preamble:
            title = os.path.basename(file_path)
            chunks.append(f"Document Title: {title}\n{preamble}")
            metadata.append({"source": file_path, "heading": "Document Start"})

        # Every second element starting at index 2 is "heading text + body".
        for section in sections[2::2]:
            heading, _, body = section.partition('\n')
            heading = heading.strip()

            # Split the section body into paragraphs on blank lines.
            for p_index, paragraph in enumerate(body.strip().split('\n\n')):
                paragraph = paragraph.strip()
                if paragraph:
                    chunks.append(f"Section: {heading}\n{paragraph}")
                    metadata.append({
                        "source": file_path,
                        "heading": heading,
                        "chunk_index": p_index,
                    })

    return chunks, metadata

def build_index(chunks, model_name='all-MiniLM-L6-v2'):
    """Embed ``chunks`` and store the vectors in a flat FAISS L2 index.

    Embeddings are L2-normalised before being added. On unit-length vectors
    the L2 distance ranks neighbours in the same order as cosine similarity,
    so the nearest vectors are also the most cosine-similar ones.

    Args:
        chunks: Non-empty sequence of text chunks to embed.
        model_name: SentenceTransformer model identifier to load.

    Returns:
        A ``(model, index)`` tuple: the loaded SentenceTransformer and the
        populated ``faiss.IndexFlatL2``. Vector ``i`` in the index corresponds
        to ``chunks[i]``.

    Raises:
        ValueError: If ``chunks`` is empty (for example because no Markdown
            files were found), since there is nothing to index.
    """
    # Fail early and clearly, before paying for the model load.
    if len(chunks) == 0:
        raise ValueError("Cannot build an index from an empty list of chunks.")

    model = SentenceTransformer(model_name)

    embeddings = model.encode(chunks, convert_to_tensor=False)

    # Unit-normalise each row so L2 distance reflects cosine similarity.
    embeddings = normalize(embeddings, norm='l2', axis=1)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    # The vectors are cast to float32 before being handed to FAISS.
    index.add(embeddings.astype('float32'))

    return model, index

def semantic_search(query_text, model, index, chunks, metadata, k=5):
    """Find the ``k`` chunks closest to ``query_text`` and print them as JSON.

    Args:
        query_text: Natural-language query string.
        model: Encoder exposing ``encode(list_of_str, convert_to_tensor=False)``.
        index: Index exposing ``search(vectors, k)`` that returns
            ``(distances, ids)`` arrays, as a FAISS index does.
        chunks: Chunk texts, positionally aligned with the indexed vectors.
        metadata: Per-chunk metadata dicts, parallel to ``chunks``.
        k: Number of neighbours to request from the index.

    Returns:
        A list of result dicts, best match first. Each is a copy of the chunk's
        metadata extended with ``"rank"`` (1-based), ``"relevance_score"`` (the
        distance reported by the index; lower means closer) and
        ``"chunk_text"``. The same list is printed as indented JSON.
    """
    # Embed and normalise the query the same way the indexed chunks were.
    query_embedding = model.encode([query_text], convert_to_tensor=False)
    query_embedding = normalize(query_embedding, norm='l2', axis=1)

    # Search the index; row 0 holds the hits for our single query.
    distances, indices = index.search(query_embedding.astype('float32'), k)

    results = []
    for rank, (score, idx) in enumerate(zip(distances[0], indices[0]), start=1):
        idx = int(idx)
        # FAISS pads with id -1 when fewer than k vectors are available.
        # A negative id must be rejected explicitly: Python would otherwise
        # treat it as "last element" and return a bogus hit.
        if idx < 0 or idx >= len(chunks):
            continue

        # Copy so the caller's metadata dicts are not mutated.
        result_meta = metadata[idx].copy()

        # The score is a distance: lower score = closer / more relevant.
        result_meta.update({
            "rank": rank,
            "relevance_score": float(score),
            "chunk_text": chunks[idx]
        })
        results.append(result_meta)

    print(json.dumps(results, indent=2))

    return results

def main_search():
    """Command-line entry point: parse arguments, index the docs, run a search.

    Recognised options are ``-query`` (required search text) and ``-k``
    (number of results, default 3). Unrecognised arguments are ignored rather
    than rejected. The documentation is loaded and indexed from scratch on
    every call, and the search results are printed by ``semantic_search``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-query', type=str, required=True, help='The query to search for.')
    cli.add_argument('-k', type=int, default=3, help='The number of top results to retrieve.')

    # parse_known_args returns (namespace, leftovers); only the namespace is used.
    options = cli.parse_known_args()[0]

    doc_chunks, doc_metadata = load_and_chunk_docs()
    encoder, vector_index = build_index(doc_chunks)

    semantic_search(
        options.query,
        encoder,
        vector_index,
        doc_chunks,
        doc_metadata,
        k=options.k,
    )

if __name__ == "__main__":
    import sys
    # Simulate a command-line call by overwriting sys.argv before parsing; equivalent to:
    #   python semantic_search.py -query "How do I fetch tweets with expansions?" -k 5
    sys.argv = ['semantic_search.py', '-query', 'How do I fetch tweets with expansions?', '-k', '5']
    main_search()

[
  {
    "source": "postman-twitter-api/README.md",
    "heading": "Document Start",
    "rank": 1,
    "relevance_score": 1.055870532989502,
    "chunk_text": "Document Title: README.md\nThis is a Postman Collection for the Twitter API v2 endpoints.\n\nRefer to the main [Twitter API documentation](https://developer.twitter.com/en/docs) for more details.\n\nIf you have an API-related question, you can also discuss in the developer [community forum](https://twittercommunity.com)."
  },
  {
    "source": "postman-twitter-api/README.md",
    "heading": "Manual install",
    "chunk_index": 0,
    "rank": 2,
    "relevance_score": 1.1232539415359497,
    "chunk_text": "Section: Manual install\nYou can also download this Collection from a GitHub repo here: https://github.com/twitterdev/postman-twitter-api"
  },
  {
    "source": "postman-twitter-api/README.md",
    "heading": "Quick install",
    "chunk_index": 0,
    "rank": 3,
    "relevance_score": 1.1481720209121704,
    "chunk_text": "