# Import required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# DATA PREPROCESSING 

In [3]:
# Load Data
data = pd.read_csv('/kaggle/input/usa-real-estate-dataset/realtor-data.zip.csv')
data.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,


In [4]:
data.shape

(2226382, 12)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   brokered_by     float64
 1   status          object 
 2   price           float64
 3   bed             float64
 4   bath            float64
 5   acre_lot        float64
 6   street          float64
 7   city            object 
 8   state           object 
 9   zip_code        float64
 10  house_size      float64
 11  prev_sold_date  object 
dtypes: float64(8), object(4)
memory usage: 203.8+ MB


In [6]:
data.isnull().sum()

brokered_by         4533
status                 0
price               1541
bed               481317
bath              511771
acre_lot          325589
street             10866
city                1407
state                  8
zip_code             299
house_size        568484
prev_sold_date    734297
dtype: int64

# Preprocess Data

In [7]:
# Example data point
x = data.iloc[0]
print(f"brokeredby: {x['brokered_by']}. status: {x['status']}. Beds: {x['bed']}. Baths: {x['bath']}. acrelots: {x['acre_lot']}\n"
      f"streets: {x['street']}. cities: {x['city']}. states{x['state']}. Size: {x['house_size']} sqft. Zip: {x['zip_code']}\n"
      f"prev_sold_date: {x['prev_sold_date']}. Price:")

brokeredby: 103378.0. status: for_sale. Beds: 3.0. Baths: 2.0. acrelots: 0.12
streets: 1962661.0. cities: Adjuntas. statesPuerto Rico. Size: 920.0 sqft. Zip: 601.0
prev_sold_date: nan. Price:


In [8]:
from datasets import Dataset

# Preprocessing convert dataset to text 
df = data.dropna(subset=["price", "brokered_by", "status", "bed", "bath", "acre_lot", "street", "city", "state", 
                       "house_size", "zip_code", "prev_sold_date"])
df["text"] = df.apply(
    lambda x: f"brokeredby: {x['brokered_by']}. status: {x['status']}. Beds: {x['bed']}. Baths: {x['bath']}\n"
              f"acrelots: {x['acre_lot']}. streets: {x['street']}. cities: {x['city']}. states{x['state']}\n"
              f"Size: {x['house_size']} sqft. Zip: {x['zip_code']}. prev_sold_date: {x['prev_sold_date']}. Price:", axis = 1)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df[["text", "price"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df.apply(


In [12]:
# ex
print("\nDataset sample:", dataset[0])


Dataset sample: {'text': 'brokeredby: 92147.0. status: for_sale. Beds: 7.0. Baths: 3.0\nacrelots: 0.09. streets: 1842706.0. cities: Dorado. statesPuerto Rico\nSize: 1192.0 sqft. Zip: 949.0. prev_sold_date: 2019-06-28. Price:', 'price': 110000.0, '__index_level_0__': 502}


# MODEL SETUP

# Configure LoRA

In [15]:
# Install required packages || Lora
!pip install torch transformers peft accelerate datasets



In [16]:
# unsloth
!pip install unsloth



In [None]:
import torch
from unsloth import FastLanguageModel

In [20]:
# Load 4-bit quantized base model
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit",
    load_in_4bit=True,  # 4-bit quantization for memory efficiency
    device_map="auto",  # Automatic GPU/CPU allocation
)

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.1.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [21]:
# Config LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank (tradeoff: higher = more adaptable but larger)
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],     # Layers to adapt
    lora_alpha = 32,          # Scaling factor
    lora_dropout = 0.05,      # Regularization
    bias = "none",            # No bias terms
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.4.7 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# TRAINING CONFIGURATION

In [27]:
from transformers import TrainingArguments, Trainer

train_args = TrainingArguments(
    per_device_train_batch_size = 10000,    # Adjusted for GPU memory
    gradient_accumulation_steps = 4,        # Effective batch size = 8 * 4 = 32
    num_train_epochs = 3,
    learning_rate = 2e-5,                   # Optimal for fine-tuning
    fp16 = True,                            # Mixed precision training
    logging_steps = 10,                     # Log every 10 steps
    output_dir = "outputs",                 # Save checkpoints
)

In [None]:
# Tokenize function
def tokenize(x):
    return tokenizer(
        x["text"],
        truncation = True,        # Cut long sequences
        max_length = 512)         # Standard LLM input size

# Apply tokenization in batches
dataset = dataset.map(
    tokenize,
    batched = True,
    batch_size = 10000)           # Process 1000 samples at once

# Train
trainer = Trainer(model,
                  train_dataset = dataset,
                  args = train_args)


In [None]:
# Start fine-tuning
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,084,909 | Num Epochs = 3 | Total steps = 81
O^O/ \_/ \    Batch size per device = 10,000 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (10000 x 4 x 1) = 40,000
 "-____-"     Trainable parameters = 13,631,488/8,000,000,000 (0.17% trained)


<IPython.core.display.Javascript object>

# MODEL SAVING

In [None]:
model.save_pretrained_merged(
    "realtor-llm",
    tokenizer,
    save_method = "merged_16bit")       # Combine LoRA with base model

# INFERENCE SETUP

In [None]:
from vllm import LLM, SamplingParams

llm = LLM(
    model = "realtor-llm",
    quantization="awq")      # 4-bit inference optimization

def predict_price(listing_text):
    """Predict price for a property listing"""
    sampling_params = SamplingParams(
        temperature = 0.1,   #Low for deterministic output
        max_tokens = 50)     # Enough for price prediction
    outputs = llm.generate([listing_text], sampling_params)
    return outputs[0].text.split("Price:")[-1].strip()