In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
import torch
import os

In [2]:
# Load cleaned dataset
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed types.
# NOTE(review): the recorded output of this cell echoes the read_csv line the
# way a warning does (presumably DtypeWarning) - confirm; passing an explicit
# dtype= mapping for the affected columns would be stricter still.
df = pd.read_csv(
    '/kaggle/input/last-clean-dataset-re-ranking/cleaned_data.csv',
    low_memory=False,
)

  df = pd.read_csv('/kaggle/input/last-clean-dataset-re-ranking/cleaned_data.csv')


In [3]:
# -------------------- STRUCTURED NUMERIC FEATURES --------------------
print("Computing structured numeric features...")

# Month and day-of-month are turned into angles (periods 12 and 31) so each
# one can be represented by a sin/cos pair.
month_angle = 2 * np.pi * df['listing_month'] / 12
day_angle = 2 * np.pi * df['listing_day'] / 31

structured_features = pd.DataFrame({
    # Cyclic encoding for month and day
    'listing_month_sin': np.sin(month_angle),
    'listing_month_cos': np.cos(month_angle),
    'listing_day_sin': np.sin(day_angle),
    'listing_day_cos': np.cos(day_angle),
    # Multiple pricing option flag stored as an integer
    'has_multiple_pricing_options': df['has_multiple_pricing_options'].astype(int),
    # log(1 + price)
    'price_normalized': np.log1p(df['unified_price']),
})

# Numeric columns that are standardised with StandardScaler
num_cols_to_scale = [
    'price_per_sqm',
    'description_length', 'description_word_count', 'description_sentence_count',
    'title_length', 'title_word_count',
    'total_amenities',
    'location_latitude', 'location_longitude',
    'construction_year',
    'building_size',
]
scaler = StandardScaler()
numeric_block = df[num_cols_to_scale]
structured_features[num_cols_to_scale] = scaler.fit(numeric_block).transform(numeric_block)

# One-hot columns for property type, appended after the numeric features
property_type_ohe = pd.get_dummies(df['property_type'], prefix='type')
structured_features = pd.concat([structured_features, property_type_ohe], axis=1)

# Persist the assembled feature table
structured_features.to_parquet('structured_features.parquet')

Computing structured numeric features...


In [4]:
# -------------------- TF-IDF FEATURES --------------------
print("Computing TF-IDF features...")
tfidf = TfidfVectorizer(max_features=500)

# Keep exactly one TF-IDF row per row of df. Dropping rows with a missing
# description would make the saved matrix shorter than df, so its row i would
# no longer describe listing i. A missing description is treated as an empty
# document instead, which yields an all-zero row.
df_tfidf = df.assign(description_clean=df['description_clean'].fillna(''))
tfidf_matrix = tfidf.fit_transform(df_tfidf['description_clean'])  # DO NOT call toarray()
assert tfidf_matrix.shape[0] == len(df), "TF-IDF rows must stay aligned with df"

sparse.save_npz('tfidf_features.npz', tfidf_matrix)  # Save directly sparse

Computing TF-IDF features...


In [5]:
# -------------------- BERT EMBEDDINGS --------------------
print("Computing BERT embeddings...")

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

# Switch to inference mode; as the last expression of the cell this also
# displays the model structure.
model.eval()

Computing BERT embeddings...


config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

2025-05-11 22:43:05.595928: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747003385.898598      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747003385.976355      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(100000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [6]:
# Directory that receives the per-batch embedding files. It is created when
# missing and left as-is when it already exists.
save_folder = "bert_batches"
os.makedirs(name=save_folder, exist_ok=True)

In [7]:
# Function to compute and save embeddings in batches
def compute_and_save_batches(texts, batch_size=64):
    """Embed ``texts`` batch by batch and save each batch as its own .npy file.

    Uses the notebook-level ``tokenizer``, ``model`` and ``save_folder``.
    Batch ``k`` (0-based, in input order) is written to
    ``<save_folder>/bert_batch_<k>.npy`` and holds the hidden state of the
    first token of every text in that batch.

    Args:
        texts: Iterable of texts (pandas Series, numpy array, list, ...).
            Missing values (None/NaN) are embedded as the empty string and
            other non-string values are converted with ``str`` so that every
            input keeps its row position.
        batch_size: Number of texts per forward pass / per saved file.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # The tokenizer needs a plain list of str; normalising here also lets the
    # function accept inputs that have no .tolist() method.
    text_list = ["" if pd.isna(text) else str(text) for text in texts]

    # Run on whatever device the model currently lives on. The tokenizer
    # produces CPU tensors, so without this the forward pass fails as soon as
    # the model has been moved to a GPU.
    device = next(model.parameters()).device

    with torch.no_grad():
        for batch_id, start in enumerate(range(0, len(text_list), batch_size)):
            batch_texts = text_list[start:start + batch_size]

            # Tokenize the batch (padded to the longest text, cut at 64 tokens)
            inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=64)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            # Forward pass through the model
            outputs = model(**inputs)

            # Extract [CLS] embeddings (position 0 of the last hidden state)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            # Save each batch as a separate .npy file
            np.save(os.path.join(save_folder, f"bert_batch_{batch_id}.npy"), cls_embeddings)

# Embed the cleaned description column, writing one file per 64 texts
description_texts = df['description_clean']
compute_and_save_batches(description_texts, batch_size=64)

In [8]:
# -------------------- MERGE ALL BATCHES --------------------
import glob


def _batch_index(path):
    """Return the integer batch id encoded in a ``bert_batch_<id>.npy`` path."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return int(stem.rsplit("_", 1)[1])


# Order the files by their numeric batch id. A plain string sort would place
# bert_batch_10.npy before bert_batch_2.npy, so the stacked rows would no
# longer follow the order in which the texts were embedded.
batch_files = sorted(
    glob.glob(os.path.join(save_folder, "bert_batch_*.npy")),
    key=_batch_index,
)
if not batch_files:
    raise FileNotFoundError(f"No bert_batch_*.npy files found in {save_folder!r}")

# Read all saved batch files
all_batches = [np.load(file) for file in batch_files]

# Stack all batches vertically
final_embeddings = np.vstack(all_batches)
print("Final shape:", final_embeddings.shape)

# Save the final merged embeddings
np.save("bert_embeddings_full.npy", final_embeddings)

Final shape: (714827, 768)
