In [1]:
import os, re, json, copy
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

# Seed NumPy's and PyTorch's global random generators so that later random draws
# (e.g. the np.random.choice sampling of evaluation queries) repeat on a fresh run.
np.random.seed(7)
torch.manual_seed(7)

<torch._C.Generator at 0x2365f67e2d0>

In [2]:
# Relative locations of the synthetic dataset and of the saved model artifacts.
DATA_PATH = "../data/nexkey_synthetic_dataset_v1"
CKPT_PATH = "../models/checkpoints"

# Load the three CSV tables used by the rest of the notebook.
queries = pd.read_csv(f"{DATA_PATH}/queries.csv")
properties = pd.read_csv(f"{DATA_PATH}/properties.csv")
interactions = pd.read_csv(f"{DATA_PATH}/interactions.csv")

# Quick sanity check: (rows, columns) of each table.
print(queries.shape, properties.shape, interactions.shape)

(30000, 16) (15000, 27) (480000, 4)


In [3]:
def property_to_text(row):
    """Render one property record as a single-line text description.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            deal_type, property_type, city, state, beds, baths, sqft,
            purchase_price, arv, entry_fee, estimated_monthly_payment,
            condition and occupancy.

    Returns:
        str: Four sentences (location, size, financials, status) joined by
        single spaces. beds, sqft, purchase_price, arv and entry_fee are
        passed through int(), which truncates towards zero; the other fields
        are formatted as-is.
    """
    location = f"{row['deal_type']} {row['property_type']} in {row['city']} {row['state']}."
    size = f"{int(row['beds'])} bed {row['baths']} bath, {int(row['sqft'])} sqft."
    financials = (
        f"Purchase {int(row['purchase_price'])}, ARV {int(row['arv'])}, "
        f"Entry {int(row['entry_fee'])}, Payment {row['estimated_monthly_payment']}."
    )
    status = f"Condition {row['condition']}, Occupancy {row['occupancy']}."
    return " ".join([location, size, financials, status])

# Render every property row to text once and keep it as a "deal_text" column;
# the cross-encoder reranking step reads this column for each candidate.
properties["deal_text"] = properties.apply(property_to_text, axis=1)

In [4]:
# Shuffle the unique query IDs with a dedicated RandomState (seed 7) so the split
# does not depend on the state of NumPy's global RNG.
rng = np.random.RandomState(7)
all_qids = queries["query_id"].unique()
rng.shuffle(all_qids)

# 80% / 10% / 10% split by query ID: the three slices are disjoint, so all
# interactions of a given query fall into exactly one split.
# NOTE(review): presumably this reproduces the split used when the models were
# trained — confirm against the training notebooks. train_qids and val_qids are
# not used again in the cells shown here.
n = len(all_qids)
train_qids = set(all_qids[:int(0.80*n)])
val_qids   = set(all_qids[int(0.80*n):int(0.90*n)])
test_qids  = set(all_qids[int(0.90*n):])

# Keep only the interactions whose query belongs to the held-out test split.
test_int = interactions[interactions["query_id"].isin(test_qids)].copy()
print("test_int:", test_int.shape)

test_int: (48000, 4)


In [5]:
# Ground truth for the test split: an interaction counts as relevant when its
# relevance grade is 2 or higher.
is_relevant = test_int["relevance"] >= 2
relevant_pairs = test_int[is_relevant]

# Map each query ID to the set of property IDs judged relevant for it.
pids_by_query = relevant_pairs.groupby("query_id")["property_id"]
gt_test = pids_by_query.apply(set).to_dict()

# Only queries with at least one relevant property appear as keys.
test_query_ids = list(gt_test)
print("Test queries with >=1 relevant:", len(test_query_ids))

Test queries with >=1 relevant: 3000


In [6]:
def recall_at_k(ranked_pids, relevant_set, k):
    """Return 1.0 if any of the first k ranked items is relevant, else 0.0.

    Despite the name, this is a per-query hit indicator (success@k), not the
    fraction of relevant items recovered.

    Args:
        ranked_pids: Ranked sequence of property IDs, best first.
        relevant_set: Set of property IDs considered relevant for the query.
        k: Number of top-ranked items to inspect.

    Returns:
        float: 1.0 on a hit within the top k, otherwise 0.0.
    """
    top_k = ranked_pids[:k]
    if any(pid in relevant_set for pid in top_k):
        return 1.0
    return 0.0

def dcg_at_k(ranked_pids, relevant_set, k):
    """Discounted cumulative gain over the top k results with binary gains.

    Args:
        ranked_pids: Ranked sequence of property IDs, best first.
        relevant_set: Set of property IDs considered relevant for the query.
        k: Number of top-ranked items to score.

    Returns:
        Sum over the first k positions of gain / log2(position + 1), where the
        position is 1-based and the gain is 1.0 for a relevant item, else 0.0.
    """
    total = 0.0
    for offset, pid in enumerate(ranked_pids[:k]):
        gain = float(pid in relevant_set)
        # offset is 0-based, so offset + 2 equals (1-based position) + 1.
        total += gain / np.log2(offset + 2)
    return total

def ndcg_at_k(ranked_pids, relevant_set, k):
    """Normalised DCG at k with binary relevance.

    The ideal ranking places min(len(relevant_set), k) relevant items at the
    top; the achieved DCG is divided by the DCG of that ideal ranking.

    Args:
        ranked_pids: Ranked sequence of property IDs, best first.
        relevant_set: Set of property IDs considered relevant for the query.
        k: Cutoff rank.

    Returns:
        dcg / ideal_dcg, or 0.0 when the ideal DCG is not positive (for
        example when relevant_set is empty).
    """
    achieved = dcg_at_k(ranked_pids, relevant_set, k)
    n_ideal = min(len(relevant_set), k)
    # Position p (1-based) is discounted by log2(p + 1); iterate over p + 1 directly.
    ideal = sum(1.0 / np.log2(shifted) for shifted in range(2, n_ideal + 2))
    if ideal > 0:
        return achieved / ideal
    return 0.0

In [7]:
# Token -> id vocabulary of the dual encoder.
with open(f"{CKPT_PATH}/dual_vocab_v1.json", "r") as f:
    dual_vocab = json.load(f)

# Stored property (deal) embeddings, L2-normalised row-wise so that a dot product
# with a unit-length query vector is a cosine similarity. The epsilon avoids
# dividing by zero for an all-zero row.
deal_vecs = np.load(f"{CKPT_PATH}/deal_vecs_v1.npy")
deal_vecs = deal_vecs / (np.linalg.norm(deal_vecs, axis=1, keepdims=True) + 1e-9)

# Cross-encoder vocabulary (saved from notebook 08). If the file is missing, fall
# back to a copy of the dual vocabulary with a "<SEP>" token appended at the next
# free index.
# NOTE(review): the fallback only works if the cross-encoder checkpoint was trained
# with exactly this token -> id mapping — confirm before relying on it.
cross_vocab_path = f"{CKPT_PATH}/cross_vocab_v1.json"
if os.path.exists(cross_vocab_path):
    with open(cross_vocab_path, "r") as f:
        cross_vocab = json.load(f)
else:
    cross_vocab = copy.deepcopy(dual_vocab)
    if "<SEP>" not in cross_vocab:
        cross_vocab["<SEP>"] = len(cross_vocab)

# Special-token ids for the dual-encoder vocabulary.
PAD_ID_DUAL = dual_vocab["<PAD>"]
UNK_ID_DUAL = dual_vocab["<UNK>"]

# Special-token ids for the cross-encoder vocabulary.
PAD_ID = cross_vocab["<PAD>"]
UNK_ID = cross_vocab["<UNK>"]
SEP_ID = cross_vocab["<SEP>"]

print("dual_vocab:", len(dual_vocab))
print("cross_vocab:", len(cross_vocab))
print("deal_vecs:", deal_vecs.shape)

dual_vocab: 18145
cross_vocab: 18146
deal_vecs: (15000, 128)


In [8]:
def tokenize(text: str):
    """Split text into lowercase alphanumeric tokens.

    The input is converted with str() and lowercased; every maximal run of
    the characters a-z and 0-9 becomes one token, and everything else acts
    as a separator.

    Args:
        text: Value to tokenize (non-string values are stringified first).

    Returns:
        list[str]: Tokens in order of appearance; empty if none are found.
    """
    lowered = str(text).lower()
    return [match.group() for match in re.finditer(r"[a-z0-9]+", lowered)]

def encode_text_dual(text: str, max_len: int = 48):
    """Encode text as a fixed-length id vector for the dual encoder.

    Tokens are looked up in dual_vocab (unknown tokens map to UNK_ID_DUAL),
    truncated to max_len and right-padded with PAD_ID_DUAL.

    Args:
        text: Text to encode.
        max_len: Length of the returned vector.

    Returns:
        np.ndarray: int64 array of token ids.
    """
    kept_tokens = tokenize(text)[:max_len]
    ids = [dual_vocab.get(tok, UNK_ID_DUAL) for tok in kept_tokens]
    # A non-positive repeat count yields an empty list, so full-length inputs get no padding.
    padding = [PAD_ID_DUAL] * (max_len - len(ids))
    return np.array(ids + padding, dtype=np.int64)

def encode_pair_cross(query_text: str, deal_text: str, max_len: int = 96):
    """Encode a (query, deal) pair as one fixed-length id vector.

    Layout: query ids, SEP_ID, deal ids, then PAD_ID up to max_len. The query
    may use at most int(max_len * 0.45) positions; the deal gets the
    remainder minus one slot for the separator. Unknown tokens map to UNK_ID.

    Args:
        query_text: Query text.
        deal_text: Property description text.
        max_len: Length of the returned vector.

    Returns:
        np.ndarray: int64 array of token ids, clipped into
        [0, len(cross_vocab) - 1] so no id falls outside the vocabulary range.
    """
    query_budget = int(max_len * 0.45)
    deal_budget = max_len - query_budget - 1

    def _ids(text, budget):
        # Look up every token, then keep only the first `budget` ids.
        return [cross_vocab.get(tok, UNK_ID) for tok in tokenize(text)][:budget]

    sequence = _ids(query_text, query_budget) + [SEP_ID] + _ids(deal_text, deal_budget)
    # A non-positive repeat count yields an empty list, so nothing is added at full length.
    sequence += [PAD_ID] * (max_len - len(sequence))

    encoded = np.array(sequence, dtype=np.int64)
    return np.clip(encoded, 0, len(cross_vocab) - 1)

In [9]:
class TextEncoder(nn.Module):
    """Bag-of-embeddings encoder: the mean of the non-padding token embeddings.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding (and output) dimensionality.
        pad_id: Token id treated as padding and excluded from the mean.
    """

    def __init__(self, vocab_size, emb_dim=128, pad_id=0):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)

    def forward(self, token_ids):
        """Mean-pool token embeddings along dim 1, ignoring padding positions.

        Args:
            token_ids: Integer tensor of token ids; dim 1 is the token axis.

        Returns:
            Tensor with the token axis reduced away. Rows made up only of
            padding come out as zeros because the divisor is clamped to 1.
        """
        keep = token_ids.ne(self.pad_id).unsqueeze(-1).float()
        masked = self.emb(token_ids) * keep
        n_real = keep.sum(dim=1).clamp(min=1.0)
        return masked.sum(dim=1) / n_real

class DualEncoder(nn.Module):
    """Two-tower container holding separate query and deal text encoders.

    Both towers are TextEncoder instances built with the same hyper-parameters
    but with independent weights. No forward() is defined; callers use the
    towers directly (e.g. `model.query_encoder(ids)`).

    Args:
        vocab_size: Number of rows in each tower's embedding table.
        emb_dim: Embedding dimensionality of each tower.
        pad_id: Padding token id shared by both towers.
    """

    def __init__(self, vocab_size, emb_dim=128, pad_id=0):
        super().__init__()
        tower_kwargs = dict(emb_dim=emb_dim, pad_id=pad_id)
        self.query_encoder = TextEncoder(vocab_size, **tower_kwargs)
        self.deal_encoder = TextEncoder(vocab_size, **tower_kwargs)

# Rebuild the dual encoder with the vocabulary size and padding id it is loaded with,
# restore its weights on CPU, and switch it to inference mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source; newer PyTorch releases accept weights_only=True — confirm the installed
# version supports it before switching.
dual = DualEncoder(vocab_size=len(dual_vocab), emb_dim=128, pad_id=PAD_ID_DUAL)
dual.load_state_dict(torch.load(f"{CKPT_PATH}/dual_encoder_v1.pt", map_location="cpu"))
dual.eval()

class CrossEncoder(nn.Module):
    """Mean-pooled embedding classifier over a joint (query, deal) token sequence.

    Non-padding token embeddings are averaged and passed through a two-layer
    MLP that emits 4 logits per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding dimensionality.
        hidden: Width of the MLP hidden layer.
        pad_id: Token id treated as padding and excluded from the mean.
    """

    def __init__(self, vocab_size, emb_dim=128, hidden=128, pad_id=0):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        # Layer order fixes the state_dict keys (mlp.0.*, mlp.2.*) that the checkpoint uses.
        layers = [nn.Linear(emb_dim, hidden), nn.ReLU(), nn.Linear(hidden, 4)]
        self.mlp = nn.Sequential(*layers)

    def forward(self, token_ids):
        """Return one row of 4 logits for each token sequence.

        Args:
            token_ids: Integer tensor of token ids; dim 1 is the token axis.

        Returns:
            Tensor of logits with last dimension 4.
        """
        keep = token_ids.ne(self.pad_id).unsqueeze(-1).float()
        token_sum = (self.emb(token_ids) * keep).sum(dim=1)
        # Clamp the divisor so all-padding rows do not divide by zero.
        pooled = token_sum / keep.sum(dim=1).clamp(min=1.0)
        return self.mlp(pooled)

# Rebuild the cross encoder sized to the cross vocabulary, restore its weights on CPU,
# and switch it to inference mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source; newer PyTorch releases accept weights_only=True — confirm the installed
# version supports it before switching.
cross = CrossEncoder(vocab_size=len(cross_vocab), emb_dim=128, hidden=128, pad_id=PAD_ID)
cross.load_state_dict(torch.load(f"{CKPT_PATH}/cross_encoder_best.pt", map_location="cpu"))
cross.eval()

print("Loaded dual + cross ✅")

Loaded dual + cross ✅


In [10]:
# Fixed token-sequence lengths passed to the encoders at inference time; they equal the
# default max_len values of encode_text_dual (48) and encode_pair_cross (96).
# NOTE(review): presumably these must match the lengths used during training — confirm.
MAX_LEN_DUAL = 48
MAX_LEN_CROSS = 96

def retrieve_top_n(prompt: str, top_n: int = 50):
    """Return the row positions of the top_n deal vectors most similar to the prompt.

    The prompt is encoded with the dual encoder's query tower, L2-normalised,
    and scored against every row of deal_vecs by dot product.

    Args:
        prompt: Free-text query.
        top_n: Number of candidates to return.

    Returns:
        np.ndarray: Row positions into deal_vecs, highest score first.
    """
    token_row = encode_text_dual(prompt, MAX_LEN_DUAL)
    query_batch = torch.tensor(token_row, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        query_vec = dual.query_encoder(query_batch).cpu().numpy()
    # Epsilon guards against a zero-norm query vector.
    query_vec = query_vec / (np.linalg.norm(query_vec, axis=1, keepdims=True) + 1e-9)

    scores = (deal_vecs @ query_vec.T).squeeze(1)
    # Negate so that argsort's ascending order puts the best score first.
    return np.argsort(-scores)[:top_n]

def rerank_top_k(prompt: str, top_n: int = 50, top_k: int = 5):
    """Retrieve top_n candidates with the dual encoder, rerank with the cross encoder.

    Each candidate is scored by its expected relevance grade: the softmax over
    the cross-encoder logits, weighted by the grade indices 0..C-1, where C is
    the number of logits the model emits.

    Args:
        prompt: Free-text query.
        top_n: Number of candidates fetched from the retrieval stage.
        top_k: Number of reranked results to return.

    Returns:
        list: property_id values of the best top_k candidates, best first.
        Empty when the retrieval stage yields no candidates (e.g. top_n=0).
    """
    idx = retrieve_top_n(prompt, top_n=top_n)
    if len(idx) == 0:
        # np.stack raises on an empty list, so short-circuit explicitly.
        return []

    # Select the text column once instead of materialising a full row per candidate.
    # Positional indexing assumes `properties` rows line up with `deal_vecs` rows.
    deal_texts = properties["deal_text"].iloc[idx]
    pair_ids = np.stack(
        [encode_pair_cross(prompt, text, MAX_LEN_CROSS) for text in deal_texts]
    )

    with torch.no_grad():
        logits = cross(torch.tensor(pair_ids, dtype=torch.long))
        probs = torch.softmax(logits, dim=1).cpu().numpy()

    # Grade weights follow the width of the classifier head (0, 1, 2, 3 for 4 logits).
    grades = np.arange(probs.shape[1], dtype=np.float32)
    expected_rel = (probs * grades).sum(axis=1)

    best = np.argsort(-expected_rel)[:top_k]
    return properties["property_id"].iloc[idx[best]].tolist()

In [11]:
Ks = [1, 5, 10]
NDCG_K = 5
MAX_K = max(Ks)      # deepest cutoff; both ranked lists must be at least this long
RERANK_POOL = 50     # number of retrieved candidates handed to the cross encoder

# Every metric cutoff must fit inside the ranked lists produced below, otherwise
# the metric would silently be computed on a truncated list.
assert NDCG_K <= MAX_K <= RERANK_POOL

# sample for speed; increase later
EVAL_N = min(1000, len(test_query_ids))
eval_qids = np.random.choice(test_query_ids, size=EVAL_N, replace=False)

# Build the query_id -> query_text lookup once instead of scanning `queries`
# for every evaluated query. drop_duplicates keeps the first row per id.
query_text_by_id = queries.drop_duplicates("query_id").set_index("query_id")["query_text"]

rec_dual = {k: [] for k in Ks}
rec_rerank = {k: [] for k in Ks}
ndcg_dual = []
ndcg_rerank = []

for qid in eval_qids:
    prompt = query_text_by_id.loc[qid]
    relevant = gt_test[qid]

    # dual-only ranking = top MAX_K retrieved property IDs
    idx_dual = retrieve_top_n(prompt, top_n=MAX_K)
    ranked_dual_pids = properties.iloc[idx_dual]["property_id"].tolist()

    # rerank = best MAX_K of the RERANK_POOL retrieved candidates
    ranked_rerank_pids = rerank_top_k(prompt, top_n=RERANK_POOL, top_k=MAX_K)

    for k in Ks:
        rec_dual[k].append(recall_at_k(ranked_dual_pids, relevant, k))
        rec_rerank[k].append(recall_at_k(ranked_rerank_pids, relevant, k))

    ndcg_dual.append(ndcg_at_k(ranked_dual_pids, relevant, NDCG_K))
    ndcg_rerank.append(ndcg_at_k(ranked_rerank_pids, relevant, NDCG_K))

print("Done ✅")

Done ✅


In [12]:
def mean(x):
    """Arithmetic mean of x as a Python float; 0.0 for an empty sequence.

    Args:
        x: Sized collection of numbers.

    Returns:
        float: np.mean(x) converted to float, or 0.0 when len(x) is 0.
    """
    if len(x) == 0:
        return 0.0
    return float(np.mean(x))

# One (label, per-k hit lists, per-query NDCG list) entry per evaluated system.
summary_inputs = [
    ("Dual only (retrieval)", rec_dual, ndcg_dual),
    ("Dual + Cross (rerank)", rec_rerank, ndcg_rerank),
]

# Average every metric over the evaluated queries; one table row per system.
rows = [
    {
        "Model": label,
        "Recall@1": mean(recalls[1]),
        "Recall@5": mean(recalls[5]),
        "Recall@10": mean(recalls[10]),
        "NDCG@5": mean(ndcgs),
    }
    for label, recalls, ndcgs in summary_inputs
]

report = pd.DataFrame(rows)
report

Unnamed: 0,Model,Recall@1,Recall@5,Recall@10,NDCG@5
0,Dual only (retrieval),0.012,0.063,0.132,0.01292
1,Dual + Cross (rerank),0.022,0.077,0.148,0.01753
