In [1]:
import os
import re
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
DATA_PATH = "../data/nexkey_synthetic_dataset_v1"
queries = pd.read_csv(f"{DATA_PATH}/queries.csv")
properties = pd.read_csv(f"{DATA_PATH}/properties.csv")
interactions = pd.read_csv(f"{DATA_PATH}/interactions.csv")

print(queries.shape, properties.shape, interactions.shape)

(30000, 16) (15000, 27) (480000, 4)


In [3]:
def property_to_text(row):
    return (
        f"{row['deal_type']} {row['property_type']} in {row['city']} {row['state']}. "
        f"{int(row['beds'])} bed {row['baths']} bath, {int(row['sqft'])} sqft. "
        f"Purchase {int(row['purchase_price'])}, ARV {int(row['arv'])}, "
        f"Entry {int(row['entry_fee'])}, Payment {row['estimated_monthly_payment']}. "
        f"Condition {row['condition']}, Occupancy {row['occupancy']}."
    )

properties["deal_text"] = properties.apply(property_to_text, axis=1)
properties[["property_id", "deal_text"]].head()

Unnamed: 0,property_id,deal_text
0,1,Subto Single Family in Raleigh NC. 4 bed 1.0 b...
1,2,Hybrid Single Family in Sacramento CA. 5 bed 1...
2,3,"Cash Condo in Charleston SC. 4 bed 2.5 bath, 3..."
3,4,Subto Manufactured in Greenville AL. 4 bed 2.0...
4,5,Seller Finance Single Family in Fairview AL. 3...


In [4]:
# Use only strong positives
pos = interactions[interactions["relevance"] >= 2][["query_id", "property_id"]].copy()

# For speed while learning, sample a subset (increase later)
pos = pos.sample(120000, random_state=7)

# Build a quick lookup: query_id -> set of positive property_ids
pos_map = pos.groupby("query_id")["property_id"].apply(set).to_dict()

all_property_ids = properties["property_id"].values

In [5]:
def tokenize(text):
    # lowercase + keep words/numbers
    return re.findall(r"[a-z0-9]+", text.lower())

# Build vocab from a sample for speed
VOCAB_SIZE = 30000
word_counts = {}

for t in pd.concat([queries["query_text"].sample(5000, random_state=7),
                    properties["deal_text"].sample(5000, random_state=7)]):
    for w in tokenize(t):
        word_counts[w] = word_counts.get(w, 0) + 1

# Keep most common words
vocab = {"<PAD>": 0, "<UNK>": 1}
for w, _ in sorted(word_counts.items(), key=lambda x: -x[1])[:VOCAB_SIZE-2]:
    vocab[w] = len(vocab)

def encode(text, max_len=48):
    tokens = tokenize(text)
    ids = [vocab.get(w, vocab["<UNK>"]) for w in tokens][:max_len]
    if len(ids) < max_len:
        ids += [vocab["<PAD>"]] * (max_len - len(ids))
    return np.array(ids, dtype=np.int64)

print("Vocab size:", len(vocab))

Vocab size: 18145


In [6]:
class PairDataset(Dataset):
    def __init__(self, pos_df, queries_df, properties_df, pos_map, all_property_ids, max_len=48):
        self.pos_df = pos_df.reset_index(drop=True)
        self.queries = queries_df.set_index("query_id")
        self.props = properties_df.set_index("property_id")
        self.pos_map = pos_map
        self.all_pids = all_property_ids
        self.max_len = max_len

    def __len__(self):
        return len(self.pos_df)

    def __getitem__(self, idx):
        qid = int(self.pos_df.loc[idx, "query_id"])
        pos_pid = int(self.pos_df.loc[idx, "property_id"])

        q_text = self.queries.loc[qid, "query_text"]
        pos_text = self.props.loc[pos_pid, "deal_text"]

        # Sample a negative property not in positives
        positives = self.pos_map.get(qid, set())
        neg_pid = int(np.random.choice(self.all_pids))
        while neg_pid in positives:
            neg_pid = int(np.random.choice(self.all_pids))
        neg_text = self.props.loc[neg_pid, "deal_text"]

        q_ids = encode(q_text, self.max_len)
        pos_ids = encode(pos_text, self.max_len)
        neg_ids = encode(neg_text, self.max_len)

        return (
            torch.tensor(q_ids),
            torch.tensor(pos_ids),
            torch.tensor(neg_ids),
        )

dataset = PairDataset(pos, queries, properties, pos_map, all_property_ids)
loader = DataLoader(dataset, batch_size=256, shuffle=True)

In [7]:
class TextEncoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=128):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

    def forward(self, token_ids):
        # token_ids: (B, L)
        x = self.emb(token_ids)  # (B, L, D)

        # mean pooling ignoring PAD
        mask = (token_ids != 0).float().unsqueeze(-1)  # (B, L, 1)
        summed = (x * mask).sum(dim=1)                 # (B, D)
        denom = mask.sum(dim=1).clamp(min=1.0)         # (B, 1)
        return summed / denom

class DualEncoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=128):
        super().__init__()
        self.query_encoder = TextEncoder(vocab_size, emb_dim)
        self.deal_encoder = TextEncoder(vocab_size, emb_dim)

    def forward(self, q, pos, neg):
        q_vec = self.query_encoder(q)
        pos_vec = self.deal_encoder(pos)
        neg_vec = self.deal_encoder(neg)
        return q_vec, pos_vec, neg_vec

model = DualEncoder(vocab_size=len(vocab), emb_dim=128)

In [8]:
def cosine_sim(a, b):
    a = nn.functional.normalize(a, dim=1)
    b = nn.functional.normalize(b, dim=1)
    return (a * b).sum(dim=1)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 3
model.train()

for epoch in range(EPOCHS):
    total_loss = 0.0

    for q_ids, pos_ids, neg_ids in loader:
        q_vec, pos_vec, neg_vec = model(q_ids, pos_ids, neg_ids)

        pos_sim = cosine_sim(q_vec, pos_vec)
        neg_sim = cosine_sim(q_vec, neg_vec)

        # Margin ranking loss: pos should be > neg by margin
        loss = torch.relu(0.2 - (pos_sim - neg_sim)).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, loss={total_loss/len(loader):.4f}")

Epoch 1/3, loss=0.1756
Epoch 2/3, loss=0.1266
Epoch 3/3, loss=0.0782


In [9]:
import json

os.makedirs("../models/checkpoints", exist_ok=True)

# Save model weights
torch.save(model.state_dict(), "../models/checkpoints/dual_encoder_v1.pt")

# Save vocab so encoding is consistent later
with open("../models/checkpoints/dual_vocab_v1.json", "w") as f:
    json.dump(vocab, f)

print("Saved model + vocab ✅")

Saved model + vocab ✅


In [10]:
model.eval()

DualEncoder(
  (query_encoder): TextEncoder(
    (emb): Embedding(18145, 128, padding_idx=0)
  )
  (deal_encoder): TextEncoder(
    (emb): Embedding(18145, 128, padding_idx=0)
  )
)

In [11]:
MAX_LEN = 48

# Encode all deal texts into token IDs
deal_token_ids = np.stack([encode(t, MAX_LEN) for t in properties["deal_text"].values])
deal_token_ids = torch.tensor(deal_token_ids, dtype=torch.long)

print("Deal token matrix:", deal_token_ids.shape)

with torch.no_grad():
    deal_vecs = model.deal_encoder(deal_token_ids).cpu().numpy()

# Normalize for cosine similarity
deal_vecs = deal_vecs / (np.linalg.norm(deal_vecs, axis=1, keepdims=True) + 1e-9)

print("Deal embeddings:", deal_vecs.shape)

Deal token matrix: torch.Size([15000, 48])
Deal embeddings: (15000, 128)


In [12]:
np.save("../models/checkpoints/deal_vecs_v1.npy", deal_vecs)
properties[["property_id"]].to_csv("../models/checkpoints/deal_ids_v1.csv", index=False)

print("Saved deal embeddings ✅")

Saved deal embeddings ✅


In [13]:
def retrieve_top_k(prompt: str, top_k: int = 5):
    # 1) Encode query
    q_ids = torch.tensor(encode(prompt, MAX_LEN), dtype=torch.long).unsqueeze(0)  # (1, L)

    # 2) Embed query
    with torch.no_grad():
        q_vec = model.query_encoder(q_ids).cpu().numpy()  # (1, D)

    # 3) Normalize
    q_vec = q_vec / (np.linalg.norm(q_vec, axis=1, keepdims=True) + 1e-9)

    # 4) Cosine similarity = dot product of normalized vectors
    sims = (deal_vecs @ q_vec.T).squeeze(1)  # (num_deals,)

    # 5) Get top indices
    top_idx = np.argsort(-sims)[:top_k]

    # 6) Return top deals with similarity score
    result = properties.iloc[top_idx].copy()
    result["similarity"] = sims[top_idx]
    cols = ["property_id", "deal_type", "city", "state", "beds", "baths", "sqft",
            "purchase_price", "arv", "entry_fee", "estimated_monthly_payment", "similarity"]
    return result[cols]

In [14]:
user_prompt = "Looking for 3 bed deals under 350k, entry under 20k, payment under 2500 in Phoenix AZ"
retrieve_top_k(user_prompt, top_k=5)

Unnamed: 0,property_id,deal_type,city,state,beds,baths,sqft,purchase_price,arv,entry_fee,estimated_monthly_payment,similarity
4783,4784,Seller Finance,Fairview,MO,5,3.5,2770,533648.0,836136.0,5737.0,4283.82,0.271495
11510,11511,Subto,Tempe,AZ,2,4.0,2332,213698.0,375438.0,6652.0,2180.93,0.270117
14910,14911,Seller Finance,Greenville,UT,3,2.0,2304,327540.0,556610.0,23087.0,2764.35,0.268693
5098,5099,Wrap,Greenville,OK,2,2.5,2393,244459.0,398621.0,12619.0,1627.17,0.255297
8711,8712,Subto,Tempe,AZ,4,2.0,2378,443778.0,532760.0,8586.0,4416.39,0.255227


In [15]:
# Build ground truth: for each query, which property_ids are relevant (>=2)
gt = interactions[interactions["relevance"] >= 2].groupby("query_id")["property_id"].apply(set).to_dict()

# Sample queries for evaluation (speed)
eval_query_ids = list(gt.keys())
eval_query_ids = np.random.choice(eval_query_ids, size=500, replace=False)

print("Eval queries:", len(eval_query_ids))

Eval queries: 500


In [16]:
def recall_at_k(recommended_pids, relevant_pids, k):
    rec_k = set(recommended_pids[:k])
    return 1.0 if len(rec_k.intersection(relevant_pids)) > 0 else 0.0

In [17]:
Ks = [1, 5, 10]
recalls = {k: [] for k in Ks}

for qid in eval_query_ids:
    prompt = queries.loc[queries["query_id"] == qid, "query_text"].iloc[0]
    relevant = gt[qid]

    # retrieve top 10
    rec_df = retrieve_top_k(prompt, top_k=10)
    rec_pids = rec_df["property_id"].tolist()

    for k in Ks:
        recalls[k].append(recall_at_k(rec_pids, relevant, k))

for k in Ks:
    print(f"Recall@{k}: {np.mean(recalls[k]):.4f}")

Recall@1: 0.0180
Recall@5: 0.0880
Recall@10: 0.1580
