In [1]:
import os, re, json
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# Root folder of the synthetic dataset; the three CSVs below are read from it.
DATA_PATH = "../data/nexkey_synthetic_dataset_v1"

# queries: query_id -> query_text; properties: one listing per row;
# interactions: (query_id, property_id, relevance) rows used as training labels.
queries = pd.read_csv(f"{DATA_PATH}/queries.csv")
properties = pd.read_csv(f"{DATA_PATH}/properties.csv")
interactions = pd.read_csv(f"{DATA_PATH}/interactions.csv")

# Load vocab from dual-encoder step (important: consistent encoding)
with open("../models/checkpoints/dual_vocab_v1.json", "r") as f:
    vocab = json.load(f)

# Precomputed deal embedding matrix saved by the dual-encoder step.
# NOTE(review): later cells use row indices of `deal_vecs` to index `properties`
# positionally, so row i is presumably the embedding of properties.iloc[i] — confirm.
deal_vecs = np.load("../models/checkpoints/deal_vecs_v1.npy")

print("Loaded data + vocab + deal embeddings ✅")
print("deal_vecs:", deal_vecs.shape)

Loaded data + vocab + deal embeddings ✅
deal_vecs: (15000, 128)


In [3]:
def property_to_text(row):
    """Render one listing as a single descriptive sentence block.

    `row` only needs to support ``row[key]`` lookups for the listing fields used
    below. Beds, sqft, purchase price, ARV and entry fee are truncated with
    ``int()``; baths and the monthly payment are inserted unchanged.
    """
    headline = f"{row['deal_type']} {row['property_type']} in {row['city']} {row['state']}. "
    layout = f"{int(row['beds'])} bed {row['baths']} bath, {int(row['sqft'])} sqft. "
    pricing = (
        f"Purchase {int(row['purchase_price'])}, ARV {int(row['arv'])}, "
        f"Entry {int(row['entry_fee'])}, Payment {row['estimated_monthly_payment']}. "
    )
    status = f"Condition {row['condition']}, Occupancy {row['occupancy']}."
    return headline + layout + pricing + status

# Build the text form of every listing once; later cells read this column for the
# property_id -> text map and for the cross-encoder inputs at rerank time.
properties["deal_text"] = properties.apply(property_to_text, axis=1)

In [4]:
import copy
import re

def tokenize(text):
    """Lower-case `text` and return its runs of ASCII letters/digits, in order.

    Everything else (spaces, punctuation, decimal points) acts as a separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

# --- IMPORTANT: keep the dual vocab EXACTLY as saved ---
# The dual encoder is rebuilt later with vocab_size=len(dual_vocab), so this mapping
# is only aliased here, never modified.
dual_vocab = vocab  # this is loaded from dual_vocab_v1.json
PAD_ID_DUAL = dual_vocab["<PAD>"]  # id used to right-pad encoded sequences
UNK_ID_DUAL = dual_vocab["<UNK>"]  # id substituted for tokens missing from the vocab

def encode_text_dual(text, max_len=48):
    """Encode `text` with the dual-encoder vocab as a fixed-length int64 array.

    Tokens missing from `dual_vocab` map to UNK_ID_DUAL. Sequences longer than
    `max_len` are cut off; shorter ones are right-padded with PAD_ID_DUAL.
    """
    kept_tokens = tokenize(text)[:max_len]
    encoded = [dual_vocab.get(tok, UNK_ID_DUAL) for tok in kept_tokens]
    # A non-positive repeat count yields an empty list, so full-length inputs get no padding.
    padding = [PAD_ID_DUAL] * (max_len - len(encoded))
    return np.array(encoded + padding, dtype=np.int64)

# --- Create a separate vocab for cross encoder that includes <SEP> ---
# Work on a copy so that adding <SEP> cannot change dual_vocab, whose length sizes the
# dual encoder's embedding table.
cross_vocab = copy.deepcopy(dual_vocab)
if "<SEP>" not in cross_vocab:
    # NOTE(review): assumes the existing ids are exactly 0..len-1, so len(cross_vocab)
    # is the next unused id — confirm against how dual_vocab_v1.json was built.
    cross_vocab["<SEP>"] = len(cross_vocab)

# Special-token ids used by encode_pair_cross and by the CrossEncoder's padding mask.
PAD_ID = cross_vocab["<PAD>"]
UNK_ID = cross_vocab["<UNK>"]
SEP_ID = cross_vocab["<SEP>"]

def encode_pair_cross(query_text, deal_text, max_len=96):
    """Encode a (query, deal) pair as ``query ids + <SEP> + deal ids`` of length `max_len`.

    The query may use at most ``int(max_len * 0.45)`` positions, one position is
    reserved for SEP_ID, and the deal text gets the rest. Unknown tokens map to
    UNK_ID and the tail is filled with PAD_ID. Returns an int64 numpy array.
    """
    query_budget = int(max_len * 0.45)
    deal_budget = max_len - query_budget - 1  # one slot is reserved for <SEP>

    def to_ids(text, budget):
        # Encode the whole text first, then truncate to the side's budget.
        return [cross_vocab.get(tok, UNK_ID) for tok in tokenize(text)][:budget]

    pair = to_ids(query_text, query_budget) + [SEP_ID] + to_ids(deal_text, deal_budget)
    pair.extend([PAD_ID] * (max_len - len(pair)))

    # Final clamp keeps every id inside the embedding table's index range (safety).
    return np.clip(np.array(pair, dtype=np.int64), 0, len(cross_vocab) - 1)

In [5]:
# Keep labels 0–3 as a 4-class classification problem
# Cap the request at the table size: DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling without replacement). With >= 200000 interactions the
# sample is the same as before (same n, same seed).
TRAIN_SAMPLE_SIZE = 200000  # increase later if you want
train_df = interactions.sample(
    n=min(TRAIN_SAMPLE_SIZE, len(interactions)), random_state=7
).copy()
print(train_df["relevance"].value_counts())

# id -> text lookups used by the training dataset.
query_text_map = queries.set_index("query_id")["query_text"].to_dict()
deal_text_map = properties.set_index("property_id")["deal_text"].to_dict()

relevance
0    99888
2    37625
1    37436
3    25051
Name: count, dtype: int64


In [6]:
class CrossEncoderDataset(Dataset):
    """Map-style dataset yielding (token_ids, relevance_label) pairs for the cross encoder.

    Args:
        df: frame with `query_id`, `property_id` and `relevance` columns; its index is
            reset so samples are addressed by position 0..len-1.
        query_text_map: mapping query_id -> query text.
        deal_text_map: mapping property_id -> deal text.
        max_len: sequence length passed to `encode_pair_cross`.

    The three columns are copied out to arrays once at construction time, so
    `__getitem__` does not build a pandas row object for every sample. The arrays are
    a snapshot: changes made to `self.df` afterwards are not reflected.
    """

    def __init__(self, df, query_text_map, deal_text_map, max_len=96):
        self.df = df.reset_index(drop=True)
        self.qmap = query_text_map
        self.dmap = deal_text_map
        self.max_len = max_len

        # Per-sample row access via DataFrame.loc is far slower than array indexing.
        self._query_ids = self.df["query_id"].to_numpy()
        self._property_ids = self.df["property_id"].to_numpy()
        self._labels = self.df["relevance"].to_numpy()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return (LongTensor of token ids, scalar LongTensor label) for sample `idx`.

        Raises KeyError if the sample's query_id / property_id is missing from the maps.
        """
        qid = int(self._query_ids[idx])
        pid = int(self._property_ids[idx])
        y = int(self._labels[idx])

        q_text = self.qmap[qid]
        d_text = self.dmap[pid]

        x = encode_pair_cross(q_text, d_text, max_len=self.max_len)
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# Wrap the sampled interactions; shuffle=True draws a new batch order on every pass.
dataset = CrossEncoderDataset(train_df, query_text_map, deal_text_map, max_len=96)
loader = DataLoader(dataset, batch_size=256, shuffle=True)

In [7]:
class CrossEncoder(nn.Module):
    """Mean-pooled embedding classifier over a joint query/deal token sequence.

    Token embeddings are averaged over non-padding positions and fed to a two-layer
    MLP that emits one logit per relevance class.

    Args:
        vocab_size: number of rows in the embedding table; every token id must be
            smaller than this.
        emb_dim: embedding width.
        hidden: width of the MLP hidden layer.
        pad_id: token id treated as padding; excluded from the pooled average.
        num_classes: number of output logits. Defaults to 4 (relevance labels 0..3),
            which matches the previously hard-coded output size.

    Submodule names (`emb`, `mlp`) are unchanged, so state dicts saved from the
    4-class version still load.
    """

    def __init__(self, vocab_size, emb_dim=128, hidden=128, pad_id=0, num_classes=4):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, num_classes)
        )

    def forward(self, token_ids):
        """Return logits of shape (batch, num_classes) for integer `token_ids` (batch, length)."""
        x = self.emb(token_ids)  # (B,L,D)
        mask = (token_ids != self.pad_id).float().unsqueeze(-1)
        # clamp keeps an all-padding row from dividing by zero (its pooled vector is 0).
        pooled = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        return self.mlp(pooled)

# IMPORTANT: vocab_size must match the encoder used in encode_pair_cross
# (use len(cross_vocab), not len(dual_vocab): encode_pair_cross emits SEP_ID, which may
# exist only in cross_vocab).
model = CrossEncoder(vocab_size=len(cross_vocab), emb_dim=128, hidden=128, pad_id=PAD_ID)

In [8]:
# --- DEBUG: check token ID ranges coming from the DataLoader ---
# Pull one batch from a fresh iterator over `loader` and inspect it.
batch_X, batch_y = next(iter(loader))
print("Batch X shape:", batch_X.shape)
print("Batch y shape:", batch_y.shape)
print("Min token id:", batch_X.min().item())
print("Max token id:", batch_X.max().item())

print("cross_vocab size:", len(cross_vocab))
print("PAD_ID:", PAD_ID, "UNK_ID:", UNK_ID, "SEP_ID:", SEP_ID)

# Fail early with a readable message if any id would fall outside the embedding table.
# Only this one batch is checked, not the whole dataset.
assert batch_X.max().item() < len(cross_vocab), (
    f"Found token id {batch_X.max().item()} but cross_vocab size is {len(cross_vocab)}.\n"
    "Your dataset encoding vocab and model vocab_size are mismatched."
)

Batch X shape: torch.Size([256, 96])
Batch y shape: torch.Size([256])
Min token id: 0
Max token id: 18145
cross_vocab size: 18146
PAD_ID: 0 UNK_ID: 1 SEP_ID: 18145


In [9]:
# Multi-class cross-entropy over the model's 4 relevance logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 3
model.train()

# NOTE(review): re-running this cell continues training the same `model` instance
# rather than starting again from fresh weights.
for epoch in range(EPOCHS):
    total_loss = 0.0  # running sum of per-batch losses for this epoch

    for X, y in loader:
        logits = model(X)
        loss = criterion(logits, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the average batch loss for the epoch.
    print(f"Epoch {epoch+1}/{EPOCHS} - loss={total_loss/len(loader):.4f}")

Epoch 1/3 - loss=1.2327
Epoch 2/3 - loss=1.2144
Epoch 3/3 - loss=1.1777


In [10]:
# Persist only the weights (state_dict). To reload them, first rebuild CrossEncoder with
# the same vocab_size / emb_dim / hidden, then call load_state_dict.
os.makedirs("../models/checkpoints", exist_ok=True)
torch.save(model.state_dict(), "../models/checkpoints/cross_encoder_v1.pt")
print("Saved cross encoder ✅")

Saved cross encoder ✅


In [11]:
# Normalize deal vectors (safety)
# Row-wise L2 normalisation: retrieve_top_n also normalises the query vector, so their
# dot products are cosine similarities. The 1e-9 guards against an all-zero row.
deal_vecs = deal_vecs / (np.linalg.norm(deal_vecs, axis=1, keepdims=True) + 1e-9)

import torch
import torch.nn as nn

class TextEncoder(nn.Module):
    """Embed token ids and average the embeddings of the non-padding positions."""

    def __init__(self, vocab_size, emb_dim=128, pad_id=0):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)

    def forward(self, token_ids):
        """Return one pooled vector per row of `token_ids`; all-padding rows give zeros."""
        embedded = self.emb(token_ids)  # (B,L,D)
        keep = token_ids.ne(self.pad_id).unsqueeze(-1).float()
        # At least 1 in the denominator so an all-padding row does not divide by zero.
        token_count = keep.sum(dim=1).clamp(min=1.0)
        return (embedded * keep).sum(dim=1) / token_count

class DualEncoder(nn.Module):
    """Container holding two separate TextEncoder towers: one for queries, one for deals.

    It defines no forward pass of its own; callers use the towers directly.
    """

    def __init__(self, vocab_size, emb_dim=128, pad_id=0):
        super().__init__()
        tower_args = (vocab_size, emb_dim)
        # The attribute names are the state-dict prefixes, so they must stay as they are
        # for saved checkpoints to load.
        self.query_encoder = TextEncoder(*tower_args, pad_id=pad_id)
        self.deal_encoder = TextEncoder(*tower_args, pad_id=pad_id)

# Rebuild the dual encoder with the saved vocab size, then restore its trained weights.
dual = DualEncoder(vocab_size=len(dual_vocab), emb_dim=128, pad_id=PAD_ID_DUAL)

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source; consider weights_only=True if the installed PyTorch version supports it.
state = torch.load("../models/checkpoints/dual_encoder_v1.pt", map_location="cpu")
# strict=True: raise if the checkpoint's keys do not match this module's parameters.
dual.load_state_dict(state, strict=True)
dual.eval()

# Token length used when encoding prompts for retrieval (passed to encode_text_dual).
MAX_LEN_DUAL = 48

def retrieve_top_n(prompt: str, top_n: int = 50):
    """Return the `top_n` deals most similar to `prompt` under the dual encoder.

    Returns:
        (indices, similarities): row indices into `deal_vecs`, ordered by decreasing
        similarity, and the matching similarity scores.
    """
    token_row = encode_text_dual(prompt, max_len=MAX_LEN_DUAL)
    q_ids = torch.tensor(token_row, dtype=torch.long).unsqueeze(0)  # batch of one
    with torch.no_grad():
        q_vec = dual.query_encoder(q_ids).cpu().numpy()

    # Unit-normalise the query; the epsilon keeps a zero vector from dividing by zero.
    q_norm = np.linalg.norm(q_vec, axis=1, keepdims=True) + 1e-9
    unit_q = q_vec / q_norm

    scores = (deal_vecs @ unit_q.T).squeeze(1)
    best = np.argsort(-scores)[:top_n]
    return best, scores[best]

# Reached only if the strict state-dict load earlier in this cell did not raise.
print("Dual encoder loaded successfully ✅")

Dual encoder loaded successfully ✅


In [12]:
# Put the cross encoder in inference mode before it is used for reranking.
model.eval()
# Same max_len the training dataset was built with (96).
MAX_LEN_CROSS = 96

def rerank_with_cross(prompt: str, top_n: int = 50, top_k: int = 5):
    """Retrieve `top_n` candidate deals for `prompt`, rerank them, and return the best `top_k`.

    Candidates come from `retrieve_top_n`. Each (prompt, deal_text) pair is scored by
    the cross encoder, and the score used for ordering is the expected relevance
    sum(class_index * probability) over the model's output classes.

    Returns:
        DataFrame with the selected listing columns plus `retrieval_sim` (dual-encoder
        similarity) and `rerank_score` (expected relevance), best first. It is empty
        when no candidates were retrieved.
    """
    cols = ["property_id","deal_type","city","state","beds","baths","sqft",
            "purchase_price","arv","entry_fee","estimated_monthly_payment",
            "retrieval_sim","rerank_score"]

    idx, sims = retrieve_top_n(prompt, top_n=top_n)

    if len(idx) == 0:
        # No candidates: np.stack below would raise on an empty list.
        return properties.iloc[0:0].assign(
            retrieval_sim=np.empty(0), rerank_score=np.empty(0)
        )[cols]

    # Build cross-encoder inputs. One positional lookup on the text column avoids
    # materialising a full row object per candidate.
    deal_texts = properties["deal_text"].iloc[idx]
    batch = [encode_pair_cross(prompt, d_text, max_len=MAX_LEN_CROSS) for d_text in deal_texts]
    X = torch.tensor(np.stack(batch), dtype=torch.long)

    with torch.no_grad():
        logits = model(X)                  # (N, num_classes)
        probs = torch.softmax(logits, dim=1).cpu().numpy()

    # Expected relevance = sum(class * prob). The class values follow the model's output
    # width (0..3 for the 4-class model) instead of being hard-coded.
    class_values = np.arange(probs.shape[1], dtype=np.float32)
    expected_rel = (probs * class_values).sum(axis=1)

    # Sort by expected relevance (descending). `order` indexes the candidate list, so it
    # selects from both `idx` and `sims`.
    order = np.argsort(-expected_rel)[:top_k]
    final_idx = idx[order]

    out = properties.iloc[final_idx].copy()
    out["retrieval_sim"] = sims[order]
    out["rerank_score"] = expected_rel[order]

    return out[cols]

In [13]:
# Smoke test: one free-text prompt through retrieval (top 50) and cross-encoder rerank (top 5).
prompt = "Looking for 3 bed deals under 350k, entry under 20k, payment under 2500 in Phoenix AZ"
rerank_with_cross(prompt, top_n=50, top_k=5)

Unnamed: 0,property_id,deal_type,city,state,beds,baths,sqft,purchase_price,arv,entry_fee,estimated_monthly_payment,retrieval_sim,rerank_score
34,35,DSCR Carryback,Tempe,AZ,3,2.5,1195,227163.0,254263.0,12881.0,1630.65,0.220366,2.0529
4783,4784,Seller Finance,Fairview,MO,5,3.5,2770,533648.0,836136.0,5737.0,4283.82,0.271495,1.781545
14470,14471,Seller Finance,Madison,OK,4,3.0,2835,195159.0,313156.0,8917.0,1866.68,0.22632,1.586047
8762,8763,Seller Finance,Jonesboro,GA,3,3.0,2076,136956.0,214401.0,27338.0,1032.35,0.209417,1.392262
10895,10896,Seller Finance,Memphis,TN,2,2.5,1724,225272.0,350105.0,20315.0,1765.02,0.210427,1.380082


In [None]:
def format_deals(df):
    """Render each row of `df` as one numbered, pipe-separated summary line.

    Numbering is the 1-based position in `df`, not the frame's index. Lines are joined
    with newlines; an empty frame gives an empty string.
    """
    return "\n".join(
        f"{i+1}) {row['deal_type']} | {row['beds']}bd/{row['baths']}ba | {row['city']}, {row['state']} | "
        f"Buy ${int(row['purchase_price']):,} | ARV ${int(row['arv']):,} | Entry ${int(row['entry_fee']):,} | "
        f"Pay ${float(row['estimated_monthly_payment']):,.0f} | Score {row['rerank_score']:.2f}"
        for i, (_, row) in enumerate(df.iterrows())
    )

# Minimal chat loop: read a prompt, rerank deals for it, print the top 5.
# Type "quit" or "exit" (any case) to stop.
while True:
    try:
        user = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel: leave the loop cleanly instead of
        # surfacing a traceback.
        print("\nBot: Bye!")
        break

    if user.lower() in ["quit", "exit"]:
        print("Bot: Bye!")
        break

    if not user:
        # A blank prompt has no tokens to retrieve with; its results would be arbitrary
        # yet presented as matches.
        print("Bot: Please describe the deal you're looking for (or type 'quit').")
        continue

    deals = rerank_with_cross(user, top_n=50, top_k=5)
    print("\nBot: Here are the top 5 deals that match your criteria:\n")
    print(format_deals(deals))