In [None]:
'''
Build a hybrid matching system that combines rule-based (dictionary/ontology) and
LLM-based (embeddings, semantic search) approaches, scales effectively, and delivers high-quality matches.
'''

In [None]:
#Implementation Plan
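#Install Required Libraries:

#pip install spacy sentence-transformers faiss-cpu flask
#python -m spacy download en_core_web_sm   (the spaCy model loaded by spacy.load in the code cell)
#Optional: pandas for structured data handling and gunicorn for deployment
#(numpy is required, but is installed as a dependency of the packages above).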
'''
Structure:

Preprocessing: Normalize inputs.
Rule-Based Matching: Use dictionaries or regex for exact matches.
LLM-Based Matching: Use Sentence-BERT for semantic search.
Integration: Combine scores for hybrid results.
API Interface: Flask for query submission and results retrieval.
'''

In [None]:
import spacy
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
from flask import Flask, request, jsonify

# --- Shared components, built once when this cell/module is executed ---
nlp = spacy.load("en_core_web_sm")                         # lemmas + stop-word flags
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence encoder
app = Flask(__name__)

# Example rule dictionary: category -> terms that trigger it
RULE_BASED_DICT = {
    "education": [
        "school",
        "university",
        "teaching",
    ],
    "artificial intelligence": [
        "AI",
        "machine learning",
        "deep learning",
    ],
}

# Toy corpus for semantic search; ids are assigned 1..N in listing order
DATASET = [
    {"id": doc_id, "text": doc_text}
    for doc_id, doc_text in enumerate(
        (
            "AI tools for teaching and learning.",
            "Best practices in university-level education.",
            "Machine learning models for beginners.",
        ),
        start=1,
    )
]

# Embed every corpus text (array output rather than torch tensors)
DATASET_TEXTS = [record["text"] for record in DATASET]
DATASET_EMBEDDINGS = embedding_model.encode(
    DATASET_TEXTS,
    convert_to_tensor=False,
)

# Flat L2 FAISS index sized to the embedding width, filled with the corpus vectors
embedding_dim = DATASET_EMBEDDINGS.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.array(DATASET_EMBEDDINGS))

# Preprocessing function
def preprocess(text):
    """Normalise ``text`` for matching.

    The text is lowercased, run through the spaCy pipeline, stripped of
    stop-word tokens, and the lemmas of the remaining tokens are joined
    with single spaces.

    Args:
        text: Raw query string.

    Returns:
        Space-separated lemmas of the non-stop-word tokens.
    """
    lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Rule-based matching
def rule_based_match(query, rules=None):
    """Return the rule categories whose trigger terms occur in ``query``.

    Matching is case-insensitive and operates on whole whitespace-separated
    tokens, so multi-word terms such as "machine learning" are matched as
    phrases and short terms such as "AI" do not fire inside longer words.
    (Callers lowercase the query before it gets here, so a case-sensitive
    comparison would never match a term like "AI".)

    Args:
        query: Query text, typically the output of ``preprocess``.
        rules: Optional mapping of category -> list of trigger terms.
            Defaults to the module-level ``RULE_BASED_DICT``.

    Returns:
        dict mapping each matched category to the list of its terms (in
        their original spelling) found in the query. Categories without a
        hit are omitted.
    """
    if rules is None:
        rules = RULE_BASED_DICT

    # Collapse the query to single-space-separated lowercase tokens and pad
    # it with spaces, so that " term " containment is a whole-token match.
    padded_query = " " + " ".join(query.lower().split()) + " "

    def term_in_query(term):
        normalized = " ".join(term.lower().split())
        # Blank terms carry no signal; never report them as matches.
        return bool(normalized) and f" {normalized} " in padded_query

    matched_rules = {}
    for key, terms in rules.items():
        hits = [term for term in terms if term_in_query(term)]
        if hits:
            matched_rules[key] = hits
    return matched_rules

# LLM-based matching
def llm_based_match(query, top_k=3):
    """Return the dataset entries nearest to ``query`` in embedding space.

    Args:
        query: Query text to embed with the sentence-transformer model.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        List of dicts with ``id``, ``text`` and ``score`` (``1 - distance``
        as a built-in float; higher means closer), in the order returned by
        the FAISS search. Holds fewer than ``top_k`` items when the index
        contains fewer vectors.
    """
    query_embedding = embedding_model.encode(query, convert_to_tensor=False)
    # The index searches a 2-D batch of float32 vectors, hence the wrapping list.
    query_batch = np.asarray([query_embedding], dtype="float32")
    distances, indices = index.search(query_batch, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads the result with index -1 when it finds fewer than
        # top_k neighbours; DATASET[-1] would silently return the last entry.
        if idx < 0:
            continue
        entry = DATASET[int(idx)]
        results.append({
            "id": entry["id"],
            "text": entry["text"],
            # numpy.float32 is not JSON-serialisable by jsonify; cast it.
            # NOTE(review): IndexFlatL2 reports squared L2 distance, so this
            # score can drop below 0 — confirm the intended score scale.
            "score": float(1 - dist),
        })
    return results

# Hybrid matching
def hybrid_match(query, rule_weight=0.6, llm_weight=0.4, top_k=3):
    """Score a query with both the rule-based and the embedding-based matcher.

    Args:
        query: Raw query text; it is normalised with ``preprocess`` first.
        rule_weight: Weight applied to the number of matched rule categories.
        llm_weight: Weight applied to the mean score of the semantic matches.
        top_k: Number of semantic matches to request.

    Returns:
        dict with the preprocessed ``query``, the ``rule_matches`` mapping,
        the ``llm_matches`` list and the ``combined_score`` (built-in float).
    """
    query = preprocess(query)

    # Rule-based part: one weight unit per matched category (example scoring).
    rule_matches = rule_based_match(query)
    rule_score = len(rule_matches) * rule_weight

    # Embedding-based part: weighted mean of the per-match scores.
    llm_matches = llm_based_match(query, top_k)
    if llm_matches:
        mean_llm_score = sum(match["score"] for match in llm_matches) / len(llm_matches)
    else:
        # No neighbours returned: avoid dividing by zero.
        mean_llm_score = 0.0
    llm_score = mean_llm_score * llm_weight

    # The per-match scores may be numpy scalars; cast so the total is a
    # plain float that JSON encoders accept.
    combined_score = float(rule_score + llm_score)
    return {
        "query": query,
        "rule_matches": rule_matches,
        "llm_matches": llm_matches,
        "combined_score": combined_score,
    }

# API Endpoint
@app.route("/match", methods=["POST"])
def match():
    """Handle ``POST /match``: run hybrid matching on the submitted query.

    Expects a JSON object body with a non-empty string ``query`` field.

    Returns:
        The ``hybrid_match`` result as JSON, or a 400 response with an
        ``error`` message when the query is missing, blank or not a string.
    """
    # silent=True yields None instead of raising when the body is missing
    # or is not valid JSON, so every bad request gets the same 400 shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Covers absent bodies as well as JSON arrays/scalars/null, which
        # have no .get() and would otherwise surface as a 500.
        return jsonify({"error": "Query is required"}), 400

    query = data.get("query", "")
    if not query:
        return jsonify({"error": "Query is required"}), 400
    if not isinstance(query, str):
        # preprocess() calls str methods on the query.
        return jsonify({"error": "Query must be a string"}), 400
    if not query.strip():
        return jsonify({"error": "Query is required"}), 400

    results = hybrid_match(query)
    return jsonify(results)

# Run the app
if __name__ == "__main__":
    import os

    # Debug mode enables the interactive Werkzeug debugger, which lets
    # anyone who can reach the server run code on it. Keep it off unless
    # explicitly requested with FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG") == "1"
    # The reloader re-executes the program in a child process, which would
    # load the models a second time; keep a single process.
    app.run(debug=debug_enabled, use_reloader=False)


In [None]:
#How It Works
'''
Preprocessing:
Lowercases the query, lemmatizes it, and removes stopwords for cleaner inputs.

Rule-Based Matching:
Uses a predefined dictionary to detect exact term matches in the query.

LLM-Based Matching:
Converts the query and dataset into embeddings using SentenceTransformer.
Performs semantic search with FAISS.

Hybrid Scoring:
Combines the rule-based and LLM-based scores with weights for flexibility.

API Endpoint:
Exposes a /match endpoint for users to submit queries and receive results.
'''

In [None]:
#Example Usage
#Start the server (from a shell, with the code cell above saved as app.py):
#  python app.py

In [None]:
#Send a query from a shell (using Postman, curl, or similar):
#  curl -X POST http://127.0.0.1:5000/match -H "Content-Type: application/json" -d '{"query": "AI software for education"}'

In [None]:
#Example Response (illustrative shape only; the actual matches and scores depend on the rules and the model):
{
  "query": "ai software education",
  "rule_matches": {
    "artificial intelligence": ["AI"],
    "education": ["education"]
  },
  "llm_matches": [
    {"id": 1, "text": "AI tools for teaching and learning.", "score": 0.92},
    {"id": 2, "text": "Best practices in university-level education.", "score": 0.87},
    {"id": 3, "text": "Machine learning models for beginners.", "score": 0.80}
  ],
  "combined_score": 0.83
}

In [None]:
'''
Scalability Enhancements
Use Gunicorn or uWSGI for production deployment.
Store embeddings and rules in a database (PostgreSQL, MongoDB).
Scale beyond a single in-memory FAISS index with a distributed vector database such as Milvus or Weaviate.
Fine-tune the embedding model for domain-specific data.
'''