In [1]:
import os, re, json, copy
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

In [2]:
# Relative locations of the synthetic dataset and of the saved model artifacts
# (vocabularies, precomputed deal vectors, encoder checkpoints).
DATA_PATH = "../data/nexkey_synthetic_dataset_v1"
CKPT_PATH = "../models/checkpoints"

# Raw tables. `properties` is the catalogue that retrieval/reranking index into;
# `queries` is loaded here but is not referenced by the visible cells of this notebook.
queries = pd.read_csv(f"{DATA_PATH}/queries.csv")
properties = pd.read_csv(f"{DATA_PATH}/properties.csv")

# Build deal_text (must match training)
def property_to_text(row):
    """Render one property record as the flat text description fed to the encoders.

    `row` is anything indexable by column name (a DataFrame row or a dict).
    Beds, sqft, purchase price, ARV and entry fee are truncated to integers;
    baths and the monthly payment are inserted as-is.
    """
    headline = f"{row['deal_type']} {row['property_type']} in {row['city']} {row['state']}. "
    layout = f"{int(row['beds'])} bed {row['baths']} bath, {int(row['sqft'])} sqft. "
    pricing = f"Purchase {int(row['purchase_price'])}, ARV {int(row['arv'])}, "
    terms = f"Entry {int(row['entry_fee'])}, Payment {row['estimated_monthly_payment']}. "
    status = f"Condition {row['condition']}, Occupancy {row['occupancy']}."
    return headline + layout + pricing + terms + status

# One text description per property, built with the same formatter as above.
properties["deal_text"] = properties.apply(property_to_text, axis=1)

# Load dual vocab + deal vecs
# Explicit UTF-8 so decoding the JSON does not depend on the platform default encoding.
with open(f"{CKPT_PATH}/dual_vocab_v1.json", "r", encoding="utf-8") as f:
    dual_vocab = json.load(f)

deal_vecs = np.load(f"{CKPT_PATH}/deal_vecs_v1.npy")

# Retrieval uses row numbers of `deal_vecs` as positional indices into `properties`,
# so a stale or mismatched .npy file would silently return the wrong deals. Fail fast instead.
assert deal_vecs.ndim == 2 and deal_vecs.shape[0] == len(properties), (
    f"deal_vecs shape {deal_vecs.shape} does not line up with {len(properties)} properties"
)

# L2-normalise each row (epsilon guards all-zero rows) so dot products are cosine similarities.
deal_vecs = deal_vecs / (np.linalg.norm(deal_vecs, axis=1, keepdims=True) + 1e-9)

print("Loaded ✅")
print("properties:", properties.shape)
print("deal_vecs :", deal_vecs.shape)
print("dual_vocab:", len(dual_vocab))

Loaded ✅
properties: (15000, 28)
deal_vecs : (15000, 128)
dual_vocab: 18145


In [3]:
# Prefer the cross-encoder vocabulary saved by notebook 08 when it is on disk.
cross_vocab_path = f"{CKPT_PATH}/cross_vocab_v1.json"

if not os.path.exists(cross_vocab_path):
    # No saved file: start from a copy of the dual vocab and append <SEP>
    # at the next free id unless it is already present.
    cross_vocab = copy.deepcopy(dual_vocab)
    cross_vocab.setdefault("<SEP>", len(cross_vocab))
else:
    with open(cross_vocab_path, "r") as f:
        cross_vocab = json.load(f)

# Special-token ids for the dual-encoder vocabulary.
PAD_ID_DUAL = dual_vocab["<PAD>"]
UNK_ID_DUAL = dual_vocab["<UNK>"]

# Special-token ids for the cross-encoder vocabulary.
PAD_ID = cross_vocab["<PAD>"]
UNK_ID = cross_vocab["<UNK>"]
SEP_ID = cross_vocab["<SEP>"]

print("cross_vocab:", len(cross_vocab), "| PAD/UNK/SEP:", PAD_ID, UNK_ID, SEP_ID)

cross_vocab: 18146 | PAD/UNK/SEP: 0 1 18145


In [4]:
def tokenize(text: str):
    """Lower-case `text` (coerced with str()) and return its runs of a-z / 0-9 as a list."""
    lowered = str(text).lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

# Dual encoder query encoding (must match dual training exactly)
def encode_text_dual(text: str, max_len: int = 48):
    """Convert `text` into a fixed-length int64 id array for the dual encoder.

    Tokens missing from `dual_vocab` map to UNK_ID_DUAL; the sequence is cut to
    `max_len` tokens and right-padded with PAD_ID_DUAL when shorter.
    """
    token_ids = [dual_vocab.get(tok, UNK_ID_DUAL) for tok in tokenize(text)[:max_len]]
    shortfall = max_len - len(token_ids)
    token_ids.extend([PAD_ID_DUAL] * max(shortfall, 0))
    return np.asarray(token_ids, dtype=np.int64)

# Cross encoder pair encoding (must match cross training)
def encode_pair_cross(query_text: str, deal_text: str, max_len: int = 96):
    """Encode a (query, deal) pair as `query ids + <SEP> + deal ids`, padded to `max_len`.

    The query may use at most int(0.45 * max_len) positions; the deal gets the
    remainder minus one slot for the separator. Unknown tokens map to UNK_ID.
    Returns an int64 array whose values are clipped into [0, len(cross_vocab) - 1].
    """
    def _to_ids(text):
        return [cross_vocab.get(tok, UNK_ID) for tok in tokenize(text)]

    query_budget = int(max_len * 0.45)
    deal_budget = max_len - query_budget - 1

    query_part = _to_ids(query_text)[:query_budget]
    deal_part = _to_ids(deal_text)[:deal_budget]

    sequence = query_part + [SEP_ID] + deal_part
    sequence += [PAD_ID] * max(max_len - len(sequence), 0)

    # Keep every id inside the vocabulary range.
    highest_id = len(cross_vocab) - 1
    return np.clip(np.array(sequence, dtype=np.int64), 0, highest_id)

In [5]:
class TextEncoder(nn.Module):
    """Mean-pooled embedding encoder: averages the embeddings of non-pad tokens."""

    def __init__(self, vocab_size, emb_dim=128, pad_id=0):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=pad_id,
        )

    def forward(self, token_ids):
        """Return one vector per row of `token_ids` (pooling runs over dim 1)."""
        # 1.0 where the token is real, 0.0 where it is padding; trailing axis added for broadcasting.
        keep = token_ids.ne(self.pad_id).float().unsqueeze(-1)
        embedded = self.emb(token_ids)
        total = (embedded * keep).sum(dim=1)
        # Clamp so an all-padding row divides by 1 instead of 0.
        count = keep.sum(dim=1).clamp(min=1.0)
        return total / count

class DualEncoder(nn.Module):
    """Container for two independent TextEncoder towers: one for queries, one for deals.

    No forward() is defined here; callers invoke `query_encoder` / `deal_encoder` directly.
    """

    def __init__(self, vocab_size, emb_dim=128, pad_id=0):
        super().__init__()
        tower_kwargs = {"emb_dim": emb_dim, "pad_id": pad_id}
        self.query_encoder = TextEncoder(vocab_size, **tower_kwargs)
        self.deal_encoder = TextEncoder(vocab_size, **tower_kwargs)

# Build the dual encoder sized to the dual vocabulary, then restore its saved weights on CPU.
dual = DualEncoder(vocab_size=len(dual_vocab), emb_dim=128, pad_id=PAD_ID_DUAL)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source
# (consider weights_only=True if the installed torch version supports it — TODO confirm).
dual.load_state_dict(torch.load(f"{CKPT_PATH}/dual_encoder_v1.pt", map_location="cpu"))
# Put the module in evaluation mode; this notebook only runs inference.
dual.eval()

print("Dual encoder loaded ✅")

Dual encoder loaded ✅


In [6]:
class CrossEncoder(nn.Module):
    """Pair scorer: mean-pools the non-pad token embeddings and maps them to 4 logits."""

    def __init__(self, vocab_size, emb_dim=128, hidden=128, pad_id=0):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=pad_id,
        )
        # Two-layer head; the last layer emits one logit per class (4 classes).
        self.mlp = nn.Sequential(
            nn.Linear(in_features=emb_dim, out_features=hidden),
            nn.ReLU(),
            nn.Linear(in_features=hidden, out_features=4),
        )

    def forward(self, token_ids):
        """Return a row of 4 logits for each row of `token_ids`."""
        keep = token_ids.ne(self.pad_id).float().unsqueeze(-1)
        embedded = self.emb(token_ids)
        total = (embedded * keep).sum(dim=1)
        # Clamp so an all-padding row divides by 1 instead of 0.
        count = keep.sum(dim=1).clamp(min=1.0)
        pooled = total / count
        return self.mlp(pooled)

# Build the cross encoder sized to the cross vocabulary, then restore its saved weights on CPU.
cross = CrossEncoder(vocab_size=len(cross_vocab), emb_dim=128, hidden=128, pad_id=PAD_ID)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source
# (consider weights_only=True if the installed torch version supports it — TODO confirm).
cross.load_state_dict(torch.load(f"{CKPT_PATH}/cross_encoder_best.pt", map_location="cpu"))
# Put the module in evaluation mode; this notebook only runs inference.
cross.eval()

print("Cross encoder loaded ✅")

Cross encoder loaded ✅


In [7]:
# Sequence lengths handed to encode_text_dual (retrieval) and encode_pair_cross (reranking).
# They equal those functions' default max_len values.
MAX_LEN_DUAL = 48
MAX_LEN_CROSS = 96

def retrieve_top_n(prompt: str, top_n: int = 50):
    """Return `(indices, similarities)` of the `top_n` rows of `deal_vecs` closest to `prompt`.

    The prompt is embedded with the dual encoder's query tower, L2-normalised,
    and scored against every row of `deal_vecs` by dot product. Results are
    ordered from most to least similar.
    """
    token_row = encode_text_dual(prompt, max_len=MAX_LEN_DUAL)
    query_batch = torch.tensor(token_row, dtype=torch.long).unsqueeze(0)  # batch of one

    with torch.no_grad():
        embedding = dual.query_encoder(query_batch).cpu().numpy()

    # Normalise the query; epsilon avoids division by zero for an all-zero embedding.
    norm = np.linalg.norm(embedding, axis=1, keepdims=True)
    unit_query = embedding / (norm + 1e-9)

    scores = (deal_vecs @ unit_query.T).squeeze(1)
    best = np.argsort(-scores)[:top_n]
    return best, scores[best]

def rerank_with_cross(prompt: str, top_n: int = 50, top_k: int = 5):
    """Retrieve `top_n` candidates with the dual encoder and rerank them with the cross encoder.

    Each candidate is scored by its expected relevance grade: the cross
    encoder's softmax probabilities weighted by the class index (0, 1, 2, ...).
    Ties keep their retrieval order.

    Returns a DataFrame of at most `top_k` rows with the display columns plus
    `retrieval_sim` and `rerank_score`. When retrieval yields no candidates an
    empty frame with the same columns is returned instead of raising.
    """
    cols = [
        "property_id","deal_type","city","state","beds","baths","sqft",
        "purchase_price","arv","entry_fee","estimated_monthly_payment",
        "retrieval_sim","rerank_score"
    ]

    idx, sims = retrieve_top_n(prompt, top_n=top_n)

    # np.stack cannot handle an empty candidate list; return an empty result so
    # callers can take their "no deals found" path.
    if len(idx) == 0:
        empty = properties.iloc[[]].copy()
        empty["retrieval_sim"] = np.empty(0, dtype=np.float32)
        empty["rerank_score"] = np.empty(0, dtype=np.float32)
        return empty[cols]

    # Build cross inputs: read the text column once rather than materialising a full row per candidate.
    deal_texts = properties["deal_text"].iloc[idx]
    batch = [encode_pair_cross(prompt, d_text, max_len=MAX_LEN_CROSS) for d_text in deal_texts]
    X = torch.tensor(np.stack(batch), dtype=torch.long)

    with torch.no_grad():
        logits = cross(X)
        probs = torch.softmax(logits, dim=1).cpu().numpy()

    # Expected grade = sum_k k * p(k); grade values follow the number of output classes.
    grades = np.arange(probs.shape[1], dtype=np.float32)
    expected_rel = (probs * grades).sum(axis=1)

    # Stable sort keeps retrieval order among equal scores, making results deterministic.
    order = np.argsort(-expected_rel, kind="stable")[:top_k]
    final_idx = idx[order]

    out = properties.iloc[final_idx].copy()
    # `sims` is already aligned with `idx`, so `order` indexes both consistently.
    out["retrieval_sim"] = sims[order]
    out["rerank_score"] = expected_rel[order]

    return out[cols]

In [8]:
def detect_missing_criteria(text: str):
    """Return the list of search criteria that `text` does not appear to specify.

    Checks, in this order, for a location (a known state name or abbreviation),
    a bedroom count ("3 bed", "3+ bed") and a price cap ("under 350k",
    "max $300,000", "<= 250k", or any "$"). Each missing criterion contributes
    one human-readable hint string; an empty list means nothing is missing.
    """
    t = str(text).lower()

    has_beds = re.search(r"\d+\s*\+?\s*bed", t) is not None

    # A price needs at least one digit after the keyword, and "under"/"max" must
    # start a word so that e.g. "thunder 5" or "under, say," do not count.
    price_pattern = r"(?:\b(?:under|max)|<=)\s*\$?\s*\d[\d\.,]*[km]?"
    has_price = re.search(price_pattern, t) is not None or "$" in t

    # Match locations as whole words: plain substring checks made "ca" fire on
    # "cash", "ga" on "garage", "fl" on "flow", "az" on "amazing", and so on.
    location_terms = [
        "az","arizona","tx","texas","fl","florida","ga","georgia","nc","north carolina",
        "sc","south carolina","tn","tennessee","ca","california"
    ]
    location_pattern = r"\b(?:" + "|".join(re.escape(term) for term in location_terms) + r")\b"
    has_location = re.search(location_pattern, t) is not None

    missing = []
    if not has_location: missing.append("location (city/state)")
    if not has_beds: missing.append("bedrooms (e.g., 3 bed)")
    if not has_price: missing.append("max purchase price (e.g., under 350k)")
    return missing

def format_deals(df):
    """Format each row of `df` as one numbered, pipe-separated summary line.

    Lines are numbered from 1 in row order and joined with newlines; an empty
    frame yields an empty string.
    """
    def _summarise(position, deal):
        fields = [
            f"{position+1}) {deal['deal_type']}",
            f"{deal['beds']}bd/{deal['baths']}ba",
            f"{deal['city']}, {deal['state']}",
            f"Buy ${int(deal['purchase_price']):,}",
            f"ARV ${int(deal['arv']):,}",
            f"Entry ${int(deal['entry_fee']):,}",
            f"Pay ${float(deal['estimated_monthly_payment']):,.0f}",
            f"Score {deal['rerank_score']:.2f}",
        ]
        return " | ".join(fields)

    renumbered = df.reset_index(drop=True)
    return "\n".join(_summarise(position, deal) for position, deal in renumbered.iterrows())

def chat_reply(user_message: str):
    """Produce the bot's reply to one user message.

    If two or more criteria are missing, ask the user for them. Otherwise run
    retrieval + reranking and return the formatted deals, or a list of
    suggestions for loosening the search when nothing comes back.
    """
    gaps = detect_missing_criteria(user_message)

    # Too vague to search on: ask for the missing pieces instead.
    too_vague = len(gaps) >= 2
    if too_vague:
        clarification = [
            "To find the best deals, I need a bit more detail.\n",
            f"Can you share: {', '.join(gaps)}?\n",
            "Example: “3 bed in AZ under 350k, entry under 20k, payment under 2500”",
        ]
        return "".join(clarification)

    # Specific enough: retrieve candidates and rerank them.
    deals = rerank_with_cross(user_message, top_n=50, top_k=5)

    found_any = deals is not None and len(deals) != 0
    if found_any:
        return "Here are the top 5 deals I found:\n\n" + format_deals(deals)

    suggestions = [
        "I couldn’t find any deals that match those criteria.\n",
        "Try loosening one constraint:\n",
        "- increase max price (e.g., +50k)\n",
        "- increase entry limit (e.g., +10k)\n",
        "- allow higher payment (e.g., +300/month)\n",
        "- broaden location (state instead of city)",
    ]
    return "".join(suggestions)

In [None]:
print("NexKey DealMatch Chatbot (type 'quit' to exit)")

# Simple read-reply loop; runs until the user types quit/exit or input ends.
while True:
    try:
        msg = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted while waiting for input:
        # leave the loop cleanly instead of ending the cell with a traceback.
        print("\nBot: Bye!")
        break

    if msg.lower() in ["quit", "exit"]:
        print("Bot: Bye!")
        break

    print("\nBot:", chat_reply(msg))

NexKey DealMatch Chatbot (type 'quit' to exit)
