In [12]:
import math
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple

from nltk import word_tokenize
from sentence_transformers import SentenceTransformer
import re
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt_tab to /Users/talha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
# Toy corpus for the hybrid-search demo: ten one-line documents about
# travel, sports, and AI. Position in this list is the document's identity
# everywhere else in the notebook.
DOCS = [
    "Cheap flights to New York from Dubai. Find the best airfare deals.",
    "Looking for affordable airfare to NYC? Compare ticket prices and airlines.",
    "The football world cup draws millions of fans every four years.",
    "Soccer analytics with xG models and player tracking data.",
    "Intro to artificial intelligence: machine learning and neural networks.",
    "AI applications in travel: dynamic pricing and flight delay prediction.",
    "Visit Istanbul for history, food, and Bosphorus cruises.",
    "New York City travel guide: subway tips, museums, and pizza spots.",
    "Basketball playoffs: New York Knicks advance to conference finals.",
    "Air travel tips: baggage rules, layover strategies, and airport lounges."
]

# Zero-padded labels (D00, D01, ...) aligned with DOCS by position.
DOC_IDS = [f"D{idx:02d}" for idx, _ in enumerate(DOCS)]
# English stop words that Tokenize filters out. The NLTK "stopwords" corpus is
# a separate download from the "punkt_tab" tokenizer models, so fetch it on
# demand instead of failing with LookupError on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def Tokenize(text: str) -> List[str]:
    """Turn raw text into a list of lower-case, stop-word-free tokens.

    The text is lower-cased, every run of characters outside ``[a-z0-9]`` is
    replaced by a single space, the result is split with ``word_tokenize``,
    and tokens found in ``stop_words`` are discarded. Token order is preserved.
    """
    cleaned = re.sub(r"[^a-z0-9]+", " ", text.lower())
    kept: List[str] = []
    for token in word_tokenize(cleaned):
        if token.lower() not in stop_words:
            kept.append(token)
    return kept

# Pre-tokenize the corpus once; tokens[i] holds the tokens of DOCS[i].
tokens = [Tokenize(doc) for doc in DOCS]

In [14]:
"""ChatGPT written code - you can use it verbaitm as it is not directly related to the topic.

"""
# -----------------------------
# Vocabulary & statistics
# -----------------------------
def BuildVocabulary(docs_tokens: List[List[str]]) -> Tuple[Dict[str, int], List[int], Dict[str, int]]:
    """Assign integer ids to terms and count how many documents contain each.

    Args:
        docs_tokens: One token list per document.

    Returns:
        A tuple ``(vocab, df_list, df)`` where ``vocab`` maps each term to an
        id in order of first appearance, ``df_list[i]`` is the document
        frequency of the term with id ``i``, and ``df`` maps each term to
        that same document frequency.
    """
    term_ids: Dict[str, int] = {}
    doc_freq: Dict[str, int] = {}
    for doc in docs_tokens:
        # dict.fromkeys de-duplicates while keeping first-occurrence order,
        # so each term is counted at most once per document.
        for term in dict.fromkeys(doc):
            term_ids.setdefault(term, len(term_ids))
            doc_freq[term] = doc_freq.get(term, 0) + 1
    # Ids are handed out as 0..n-1 in insertion order, so iterating the
    # vocabulary yields terms in id order.
    return term_ids, [doc_freq[term] for term in term_ids], doc_freq

# Corpus-level statistics shared by every BM25 query.
VOCAB, DF_LIST, DF_DICT = BuildVocabulary(tokens)
N_DOCS = len(DOCS)
AVGDL = sum(map(len, tokens)) / N_DOCS  # mean token count per document

# Sparse term-frequency table: DOC_TF[i] maps each term of document i to
# the number of times it occurs there.
DOC_TF = []
for doc_terms in tokens:
    counts = {}
    for term in doc_terms:
        counts[term] = counts.get(term, 0) + 1
    DOC_TF.append(counts)

# -----------------------------
# BM25 (Okapi) implementation
# -----------------------------
def BM25Okapi(query: str, k1=1.5, b=0.75) -> np.ndarray:
    """Score every document in the corpus against ``query`` with Okapi BM25.

    Args:
        query: Raw query text; it is tokenized with ``Tokenize``.
        k1: Term-frequency saturation parameter.
        b: Strength of document-length normalization (0 disables it).

    Returns:
        A float array of length ``N_DOCS``; entry ``i`` is the BM25 score of
        document ``i``. Documents sharing no term with the query score 0.
    """
    query_terms = Tokenize(query)

    def smoothed_idf(term: str) -> float:
        # The +1.0 inside the log keeps idf non-negative even for terms that
        # appear in most documents of a tiny corpus.
        doc_freq = DF_DICT.get(term, 0)
        return math.log((N_DOCS - doc_freq + 0.5) / (doc_freq + 0.5) + 1.0)

    term_idf = {term: smoothed_idf(term) for term in query_terms}

    scores = np.zeros(N_DOCS, dtype=float)
    for doc_idx, term_counts in enumerate(DOC_TF):
        doc_len = sum(term_counts.values())
        length_norm = (1 - b) + b * (doc_len / AVGDL)
        total = 0.0
        # A term repeated in the query contributes once per repetition.
        for term in query_terms:
            count = term_counts.get(term, 0)
            if count != 0:
                total += term_idf[term] * ( (count * (k1 + 1)) / (count + k1 * length_norm) )
        scores[doc_idx] = total
    return scores

In [15]:
# Load the sentence-embedding model once; the same instance encodes both the
# corpus and incoming queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the whole corpus up front, one row per entry of DOCS.
embeddingVectors = embedder.encode(
    DOCS,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

def QueryEmbedding(query: str) -> np.ndarray:
    """Encode one query string with the shared ``embedder``.

    The query is sent as a one-element batch with the same
    ``convert_to_numpy`` / ``normalize_embeddings`` flags used for the
    corpus, and the single resulting row is returned.
    """
    batch = embedder.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return batch[0]

def CosineSimilarity(q_vec: np.ndarray, doc_matrix: np.ndarray) -> np.ndarray:
    """Return the dot product of ``q_vec`` with every row of ``doc_matrix``.

    No normalization happens here: the result equals cosine similarity only
    when the query vector and each document row already have unit length.
    """
    similarities = np.matmul(doc_matrix, q_vec)
    return similarities

  return forward_call(*args, **kwargs)


In [22]:
# Tolerance below which the score range counts as zero (all scores equal).
β = 1e-12

def NormalizeScore(x: np.ndarray) -> np.ndarray:
    """Min-max scale scores into the range [0, 1].

    Args:
        x: One-dimensional array (or array-like) of scores.

    Returns:
        An array of the same shape where the smallest input maps to 0 and the
        largest to 1. If the input is empty, or all values are equal to within
        ``β``, an all-zero array is returned so the scores add nothing to a
        weighted sum. Integer and boolean inputs are promoted to float so the
        output dtype does not depend on which branch is taken; floating-point
        inputs keep their dtype.
    """
    scores = np.asarray(x)
    if scores.dtype.kind in "biu":
        scores = scores.astype(float)
    # min()/max() raise on empty arrays, so answer that case up front.
    if scores.size == 0:
        return np.zeros_like(scores)
    mn, mx = scores.min(), scores.max()
    span = mx - mn
    if span < β:
        return np.zeros_like(scores)
    return (scores - mn) / span

def HybridSearch(query: str, α: float = 0.5) -> pd.DataFrame:
    """Rank the corpus by a weighted mix of BM25 and embedding similarity.

    Both score vectors are passed through ``NormalizeScore`` before being
    blended as ``α * BM25 + (1 - α) * similarity``.

    Args:
        query: Raw query text.
        α: Weight of the BM25 component; the similarity component gets
            ``1 - α``.

    Returns:
        A DataFrame with columns ``id``, ``document``, ``BM25``,
        ``Cos Similarity`` (both normalized) and the hybrid score, sorted by
        hybrid score in descending order with a fresh 0..n-1 index.
    """
    lexical = NormalizeScore(BM25Okapi(query))
    query_vec = QueryEmbedding(query)
    semantic = NormalizeScore(CosineSimilarity(query_vec, embeddingVectors))

    # The column label carries the α used, so build it once and reuse it.
    hybrid_col = f"Hybrid (alpha={α:.2f})"
    table = pd.DataFrame({
        "id": DOC_IDS,
        "document": DOCS,
        "BM25": lexical,
        "Cos Similarity": semantic,
        hybrid_col: α * lexical + (1 - α) * semantic,
    })
    ranked = table.sort_values(by=hybrid_col, ascending=False)
    return ranked.reset_index(drop=True)

In [23]:
# Example query: α=0.3 weights BM25 at 30% and embedding similarity at 70%.
resultsDF = HybridSearch(query="cheap flights to New York", α=0.3)

# Bare last expression so the notebook renders the ranked table.
resultsDF

  return forward_call(*args, **kwargs)


Unnamed: 0,id,document,BM25,Cos Similarity,Hybrid (alpha=0.30)
0,D00,Cheap flights to New York from Dubai. Find the...,1.0,0.951896,0.966327
1,D01,Looking for affordable airfare to NYC? Compare...,0.0,1.0,0.7
2,D07,"New York City travel guide: subway tips, museu...",0.346635,0.709671,0.60076
3,D05,AI applications in travel: dynamic pricing and...,0.0,0.544978,0.381485
4,D08,Basketball playoffs: New York Knicks advance t...,0.385365,0.363443,0.370019
5,D09,"Air travel tips: baggage rules, layover strate...",0.0,0.451535,0.316074
6,D06,"Visit Istanbul for history, food, and Bosphoru...",0.0,0.29584,0.207088
7,D03,Soccer analytics with xG models and player tra...,0.0,0.070316,0.049221
8,D02,The football world cup draws millions of fans ...,0.0,0.05969,0.041783
9,D04,Intro to artificial intelligence: machine lear...,0.0,0.0,0.0
