In [1]:
import requests
import csv
import time

In [2]:
# Replace this with your actual RapidAPI key
RAPIDAPI_KEY = "5ebe3ec596msh101df8d21507439p194480jsnd2f591cb4f99"

headers = {
    "content-type": "application/json",
    "X-RapidAPI-Key": RAPIDAPI_KEY,
    "X-RapidAPI-Host": "us-realtor.p.rapidapi.com"
}

In [9]:
def fetch_property_list(city, state_code, area_type, limit, offset):
    params = {
        "location": city,
        "state_code": state_code,
        "area_type": area_type,
        "limit": str(limit),
        "offset": str(offset)
    }
    response = requests.get("https://us-realtor.p.rapidapi.com/api/v1/property/list", headers=headers, params=params)
    
    try:
        data = response.json()
        listings = data.get("data", {}).get("home_search", {}).get("results", [])
        return listings
    except Exception as e:
        print("❌ Error fetching listings:", e)
        return []

In [10]:
def fetch_property_details(property_id, listing_id):
    try:
        url = f"https://us-realtor.p.rapidapi.com/api/v1/property/data-sources?property_id={property_id}&listing_id={listing_id}"
        response = requests.get(url, headers=headers)
        data = response.json().get("data", {}).get("home", {})
        description = data.get("description", {}).get("text", "")
        return description
    except Exception as e:
        print(f"⚠️ Error fetching details for {property_id}: {e}")
        return ""

In [11]:
def gather_data(city, state_code, area_type, output_csv, limit_per_page=50, max_pages=1000):
    all_listings = []
    for page in range(max_pages):
        offset = page * limit_per_page
        print(f"📄 Fetching page {page+1} (offset: {offset}) for {city}...")
        listings = fetch_property_list(city, state_code, area_type, limit_per_page, offset)

        if not listings:
            print("🚫 No more listings found.")
            break

        all_listings.extend(listings)
        time.sleep(1)  # Be nice to the API

    print(f"✅ Total listings fetched: {len(all_listings)}")
    save_to_csv(all_listings, output_csv)


In [12]:
def save_to_csv(listings, output_csv):
    with open(output_csv, mode='w', newline='', encoding='utf-8') as f:
        fieldnames = [
            "property_id", "listing_id", "price", "beds", "baths", "sqft", "address", 
            "city", "state", "postal_code", "description"
        ]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for listing in listings:
            try:
                prop_id = listing.get("property_id")
                listing_id = listing.get("listing_id")
                price = listing.get("list_price")
                desc_info = listing.get("description") or {}
                beds = desc_info.get("beds")
                baths = desc_info.get("baths")
                sqft = desc_info.get("sqft")

                loc_info = listing.get("location") or {}
                addr = loc_info.get("address") or {}
                address = f"{addr.get('line', '')}, {addr.get('city', '')}, {addr.get('state_code', '')} {addr.get('postal_code', '')}"
                city = addr.get("city")
                state = addr.get("state_code")
                postal_code = addr.get("postal_code")

                description = fetch_property_details(prop_id, listing_id)

                writer.writerow({
                    "property_id": prop_id,
                    "listing_id": listing_id,
                    "price": price,
                    "beds": beds,
                    "baths": baths,
                    "sqft": sqft,
                    "address": address,
                    "city": city,
                    "state": state,
                    "postal_code": postal_code,
                    "description": description
                })

                time.sleep(0.5)
            except Exception as e:
                print(f"⚠️ Skipped a listing due to error: {e}")

    print(f"💾 Data saved to {output_csv}")

In [13]:
# 🚀 Start the data collection for New Jersey
gather_data(
    city="New Jersey",
    state_code="NJ",
    area_type="state",
    output_csv="nj_property_listings.csv",
    max_pages=200,  # Adjust this number as per your needs
    limit_per_page=50  # Listings per page
)

📄 Fetching page 1 (offset: 0) for New Jersey...
📄 Fetching page 2 (offset: 50) for New Jersey...
📄 Fetching page 3 (offset: 100) for New Jersey...
📄 Fetching page 4 (offset: 150) for New Jersey...
📄 Fetching page 5 (offset: 200) for New Jersey...
📄 Fetching page 6 (offset: 250) for New Jersey...
📄 Fetching page 7 (offset: 300) for New Jersey...
📄 Fetching page 8 (offset: 350) for New Jersey...
📄 Fetching page 9 (offset: 400) for New Jersey...
📄 Fetching page 10 (offset: 450) for New Jersey...
📄 Fetching page 11 (offset: 500) for New Jersey...
📄 Fetching page 12 (offset: 550) for New Jersey...
📄 Fetching page 13 (offset: 600) for New Jersey...
📄 Fetching page 14 (offset: 650) for New Jersey...
📄 Fetching page 15 (offset: 700) for New Jersey...
📄 Fetching page 16 (offset: 750) for New Jersey...
📄 Fetching page 17 (offset: 800) for New Jersey...
📄 Fetching page 18 (offset: 850) for New Jersey...
📄 Fetching page 19 (offset: 900) for New Jersey...
📄 Fetching page 20 (offset: 950) for New Je

In [146]:
import pandas as pd

In [147]:
df = pd.read_csv("nj_property_listings.csv")

## 2. Data Preprocessing

### Structured Data Processing

In [148]:
df.head()

Unnamed: 0,property_id,listing_id,price,beds,baths,sqft,address,city,state,postal_code,description
0,9905345213,2969965000.0,6310000,9.0,5.0,,"210 24th Ave, Seaside Park, NJ 08752",Seaside Park,NJ,8752,A rare opportunity is yours to own a fabulous ...
1,5806262520,2978911000.0,9900000,7.0,11.0,6500.0,"767 East Ave, Bay Head, NJ 08742",Bay Head,NJ,8742,"The residence at 767 East Ave, Bay Head combin..."
2,9752787852,2960833000.0,1350000,3.0,4.0,3347.0,"142 Eagle Way, Eatontown, NJ 07724",Eatontown,NJ,7724,The Ridge at Suneagles Condominium offers 60 l...
3,9970508100,2978278000.0,1450000,3.0,4.0,3347.0,"174 Eagle Way, Eatontown, NJ 07724",Eatontown,NJ,7724,The Ridge at Suneagles Condominium offers 60 l...
4,9071170670,2970837000.0,1250000,3.0,4.0,3347.0,"120 Eagle Way, Eatontown, NJ 07724",Eatontown,NJ,7724,The Ridge at Suneagles Condominium offers 60 l...


In [149]:
df.shape

(10000, 11)

In [150]:
df.isnull().sum()

property_id       0
listing_id       23
price             0
beds             71
baths            64
sqft           3292
address           0
city              1
state             0
postal_code       0
description     122
dtype: int64

1. Handling Missing Values

In [151]:
# Drop rows where 'description' is missing (NaN)
df = df.dropna(subset=['description'])


In [152]:
df.isnull().sum()

property_id       0
listing_id        0
price             0
beds             70
baths            41
sqft           3236
address           0
city              1
state             0
postal_code       0
description       0
dtype: int64

In [153]:
# Fill numeric columns with median
df['beds'].fillna(df['beds'].median(), inplace=True)
df['baths'].fillna(df['baths'].median(), inplace=True)
df['sqft'].fillna(df['sqft'].median(), inplace=True)

# Fill 'city' with mode
df['city'].fillna(df['city'].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['beds'].fillna(df['beds'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['baths'].fillna(df['baths'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

In [154]:
# Check if all missing values are handled
print(df.isnull().sum())

property_id    0
listing_id     0
price          0
beds           0
baths          0
sqft           0
address        0
city           0
state          0
postal_code    0
description    0
dtype: int64


In [155]:
df.shape

(9878, 11)

2.1 Normalize Numerical Features

In [156]:
#Columns to normalize: price, beds, baths, sqft

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical_cols = ['price', 'beds', 'baths', 'sqft']

df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [157]:
df.head()

Unnamed: 0,property_id,listing_id,price,beds,baths,sqft,address,city,state,postal_code,description
0,9905345213,2969965000.0,0.22411,0.346154,0.025,0.044713,"210 24th Ave, Seaside Park, NJ 08752",Seaside Park,NJ,8752,A rare opportunity is yours to own a fabulous ...
1,5806262520,2978911000.0,0.352531,0.269231,0.055,0.141427,"767 East Ave, Bay Head, NJ 08742",Bay Head,NJ,8742,"The residence at 767 East Ave, Bay Head combin..."
2,9752787852,2960833000.0,0.046682,0.115385,0.02,0.070511,"142 Eagle Way, Eatontown, NJ 07724",Eatontown,NJ,7724,The Ridge at Suneagles Condominium offers 60 l...
3,9970508100,2978278000.0,0.050259,0.115385,0.02,0.070511,"174 Eagle Way, Eatontown, NJ 07724",Eatontown,NJ,7724,The Ridge at Suneagles Condominium offers 60 l...
4,9071170670,2970837000.0,0.043105,0.115385,0.02,0.070511,"120 Eagle Way, Eatontown, NJ 07724",Eatontown,NJ,7724,The Ridge at Suneagles Condominium offers 60 l...


3. Encode Categorical Variables

In [158]:
#Columns to encode - city, state and postal code
from sklearn.preprocessing import LabelEncoder

le_city = LabelEncoder()
df['city'] = le_city.fit_transform(df['city'])

le_state = LabelEncoder()
df['state'] = le_state.fit_transform(df['state'])

le_zip = LabelEncoder()
df['postal_code'] = le_zip.fit_transform(df['postal_code'])

In [159]:
df.head()

Unnamed: 0,property_id,listing_id,price,beds,baths,sqft,address,city,state,postal_code,description
0,9905345213,2969965000.0,0.22411,0.346154,0.025,0.044713,"210 24th Ave, Seaside Park, NJ 08752",505,0,468,A rare opportunity is yours to own a fabulous ...
1,5806262520,2978911000.0,0.352531,0.269231,0.055,0.141427,"767 East Ave, Bay Head, NJ 08742",24,0,465,"The residence at 767 East Ave, Bay Head combin..."
2,9752787852,2960833000.0,0.046682,0.115385,0.02,0.070511,"142 Eagle Way, Eatontown, NJ 07724",143,0,193,The Ridge at Suneagles Condominium offers 60 l...
3,9970508100,2978278000.0,0.050259,0.115385,0.02,0.070511,"174 Eagle Way, Eatontown, NJ 07724",143,0,193,The Ridge at Suneagles Condominium offers 60 l...
4,9071170670,2970837000.0,0.043105,0.115385,0.02,0.070511,"120 Eagle Way, Eatontown, NJ 07724",143,0,193,The Ridge at Suneagles Condominium offers 60 l...


### Text Data Cleaning

In [160]:
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

In [161]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ 
    elif pos_tag.startswith('V'):
        return wordnet.VERB 
    elif pos_tag.startswith('N'):
        return wordnet.NOUN 
    elif pos_tag.startswith('R'):
        return wordnet.ADV 
    else:
        return wordnet.NOUN 

In [162]:
def tokenize(text, lowercase=True, remove_stopwords=False, normalize=None,
             stopwords=stopwords.words('english')):

    if isinstance(text, str):
        text = [text]   

    big_list_of_tokens = []

    for doc in text:
        if lowercase:
            doc = doc.lower()

        tokens = word_tokenize(doc)
        
        tokens = [w for word in tokens for w in word.strip(string.punctuation).replace("'", ' ').split(' ')]
        
        if remove_stopwords:
            tokens = [word for word in tokens if word not in stopwords]

        big_list_of_tokens.extend(tokens)
    
    if normalize == 'lemmatize':
        tagged_tokens= nltk.pos_tag(big_list_of_tokens)
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmatized_words = [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged_tokens]
        big_list_of_tokens = lemmatized_words 
    elif normalize == 'stem':
        porter_stemmer = PorterStemmer()
        stemmed_words = [porter_stemmer.stem(word) for word in big_list_of_tokens]
        big_list_of_tokens = stemmed_words

    return ' '.join(big_list_of_tokens)

In [163]:
df['description_clean'] = df['description'].apply(lambda x: tokenize(x, remove_stopwords=True , normalize = 'lemmatize'))

In [164]:
df.head()

Unnamed: 0,property_id,listing_id,price,beds,baths,sqft,address,city,state,postal_code,description,description_clean
0,9905345213,2969965000.0,0.22411,0.346154,0.025,0.044713,"210 24th Ave, Seaside Park, NJ 08752",505,0,468,A rare opportunity is yours to own a fabulous ...,rare opportunity fabulous brand-new constructi...
1,5806262520,2978911000.0,0.352531,0.269231,0.055,0.141427,"767 East Ave, Bay Head, NJ 08742",24,0,465,"The residence at 767 East Ave, Bay Head combin...",residence 767 east ave bay head combine elega...
2,9752787852,2960833000.0,0.046682,0.115385,0.02,0.070511,"142 Eagle Way, Eatontown, NJ 07724",143,0,193,The Ridge at Suneagles Condominium offers 60 l...,ridge suneagles condominium offer 60 luxury to...
3,9970508100,2978278000.0,0.050259,0.115385,0.02,0.070511,"174 Eagle Way, Eatontown, NJ 07724",143,0,193,The Ridge at Suneagles Condominium offers 60 l...,ridge suneagles condominium offer 60 luxury to...
4,9071170670,2970837000.0,0.043105,0.115385,0.02,0.070511,"120 Eagle Way, Eatontown, NJ 07724",143,0,193,The Ridge at Suneagles Condominium offers 60 l...,ridge suneagles condominium offer 60 luxury to...


In [165]:
# Show a few original and cleaned descriptions side by side
df[['description', 'description_clean']].sample(5, random_state=1)


Unnamed: 0,description,description_clean
7378,"Welcome to 41 Linden Lane, a quintessential cl...",welcome 41 linden lane quintessential classic...
2779,Warning: you're going to love this. As soon as...,warn go love soon enter stun eight bedroom p...
8768,Welcome to the highly sought-after Parkview Cr...,welcome highly sought-after parkview crossing ...
8216,"Stunning New Construction Home in Newfield, NJ...",stun new construction home newfield nj welco...
8975,Welcome to this charming rancher situated on 1...,welcome charm rancher situate 1.01 acre corner...


##  Feature Engineering

3.2 From Textual Data (NLP Pipeline)

A. TF-IDF Vectorization

In [166]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['description_clean'])

# Convert to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())


In [167]:
tfidf_df

Unnamed: 0,000,10,100,11,12,15,18,1st,20,200,...,wow,wrap,yard,year,yet,york,youll,youre,zero,zone
0,0.060101,0.0,0.155178,0.0,0.0,0.0,0.0,0.063112,0.0,0.080013,...,0.0,0.0,0.042363,0.000000,0.0,0.0,0.0,0.0,0.0,0.057092
1,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.068028,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9873,0.108909,0.0,0.140599,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
9874,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
9875,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
9876,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000


In [168]:
import numpy as np

tfidf_scores = tfidf_df.sum().sort_values(ascending=False)
print(tfidf_scores.head(10))


room       808.834305
home       796.978392
floor      558.301099
bedroom    555.698280
new        506.455304
space      480.610543
bath       479.848286
offer      455.657698
feature    424.080177
full       404.683092
dtype: float64


In [169]:
unwanted_chars = ['&', '#', '%', '@', '!', '?']

unwanted_words = [word for word in tfidf_df.columns if any(char in word for char in unwanted_chars)]

print("Unwanted words are:", unwanted_words)


Unwanted words are: []


B. Named Entity Recognition (NER) (using SpaCy)

In [170]:
import spacy
nlp = spacy.load("en_core_web_sm")


In [171]:
def extract_named_entities(text):
    doc = nlp(text)
    return [ent.text.lower() for ent in doc.ents if ent.label_ in ['FAC', 'ORG', 'GPE', 'LOC']]

df['named_entities'] = df['description'].apply(lambda x: extract_named_entities(x))



Entity Label	Description	Relevance to Property Price
FAC	Facilities (e.g., "Statue of Liberty", "Brooklyn Bridge")	Famous landmarks or public facilities often impact nearby property values.
ORG	Organizations (e.g., "Starbucks", "Google")	Presence of businesses, universities, or companies can signal desirability or economic activity.
GPE	Geopolitical Entities (e.g., "New York", "California")	Most important—location, city, or state directly affects price.
LOC	Non-GPE Locations (e.g., "the beach", "downtown")	Descriptive locations often used in listings, affect buyer perception and price.

rest like PERSON, DATE, TIME, MONEY, etc. apperas to be not relevant for our project.

In [172]:
import re

def clean_entities(entities):
    return [ent for ent in entities if re.match(r'^[a-zA-Z\s\.-]+$', ent) and len(ent.strip()) > 1]

df['named_entities'] = df['named_entities'].apply(clean_entities)


In [173]:
# Convert to binary feature if common entities are found
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
ner_df = pd.DataFrame(mlb.fit_transform(df['named_entities']), columns=mlb.classes_)


In [174]:
from spacy import displacy

description_ner = nlp(df['description'].iloc[0])
displacy.render(description_ner, style="ent", jupyter=True)


In [175]:
print(ner_df.shape)
print(ner_df.columns.tolist()[:10])  # Show first 10 columns


(9878, 8978)
['a blue ribbon school', 'a center island', 'a club house', 'a comfort height microwave', 'a cul de sac', 'a cul de sac sitting on meticulously landscape grounds which', 'a de filter', 'a fabulous outdoor option aside', 'a fitness center', 'a flood zone']


C. Sentiment Analysis (VADER)

In [36]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to C:\Users\Ankush
[nltk_data]     Ahuja\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [37]:
sid = SentimentIntensityAnalyzer()

df['sentiment_score'] = df['description'].apply(lambda x: sid.polarity_scores(x)['compound'])


In [38]:
df[['description', 'sentiment_score']].head(10)


Unnamed: 0,description,sentiment_score
0,A rare opportunity is yours to own a fabulous ...,0.9965
1,"The residence at 767 East Ave, Bay Head combin...",0.9944
2,The Ridge at Suneagles Condominium offers 60 l...,0.9457
3,The Ridge at Suneagles Condominium offers 60 l...,0.9457
4,The Ridge at Suneagles Condominium offers 60 l...,0.9457
5,The Ridge at Suneagles Condominium offers 60 l...,0.9457
6,The Ridge at Suneagles Condominium offers 60 l...,0.9457
7,HERE IS THE ONE YOU'VE BEEN WAITING FOR! This ...,0.9966
8,The Pinnacle of Luxury Living on the Manasquan...,0.994
9,"WELCOME TO THIS STUNNING, CUSTOM-BUILT 4, 056 ...",0.9981


In [39]:
i = 0  # change index to explore different rows
print("Description:", df['description'][i])
print("Sentiment score:", df['sentiment_score'][i])


Description: A rare opportunity is yours to own a fabulous brand-new construction luxury home in South Seaside Park with an adjacent guest cottage, overlooking Island Beach State Park! At your oasis, you'll enjoy spectacular views of 3, 000 acres of pristine natural coastline as your daily pleasure. As you enter thru the circa 1840 mahogany gates into the courtyard walking on the beautiful custom porcelain pavers, you can sit down and relax while the 1st course of food is being made on the grill and pizza oven at the bar...If you're ready for the weekend happy hour just continue thru the yard to the fire pit area, sit and enjoy a movie on the screen with the hi-def projector. Of course the main bar is open to serve that much needed cocktail after a long work week This exceptional compound comprised of 2 separately deeded lots sits on ample 100'x100' property with a privacy hedge and offers private access to 24th Ave beach. Upscale amenities were thoughtfully incorporated to optimize yo

In [40]:
def label_sentiment(score):
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_label'] = df['sentiment_score'].apply(label_sentiment)
print(df['sentiment_label'].value_counts())


sentiment_label
positive    9816
negative      36
neutral       26
Name: count, dtype: int64


Step 4: Model Development

Combine All Features

In [181]:
structured_features = df[['beds', 'baths', 'sqft', 'city', 'state', 'postal_code']]


In [182]:
X_matrix = pd.concat([tfidf_df.reset_index(drop=True), ner_df.reset_index(drop=True)], axis=1)


In [183]:
# Combine all features except sentiment_score
final_features = pd.concat([structured_features.reset_index(drop=True),
                            X_matrix.reset_index(drop=True)], axis=1)


In [184]:
X = final_features

In [185]:
#Target variable
y = df['price']

Phase 1: Baseline Regression Models

1. Random Forest Model without Description Data

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [47]:
X_train, X_test, y_train, y_test = train_test_split( structured_features, y, test_size=0.2, random_state=42)

In [48]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [49]:
# Predictions
y_pred = rf_model.predict(X_test)

In [50]:
# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


In [51]:
# Print results
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

Mean Squared Error (MSE): 0.0009
Root Mean Squared Error (RMSE): 0.0298
Mean Absolute Error (MAE): 0.0108
R-squared (R²): 0.5341


2. Random Forest Regressor

In [52]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split( X, y, test_size=0.2, random_state=42)

In [53]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_rf, y_train_rf)

y_pred_rf = rf.predict(X_test_rf)

In [54]:
# Evaluation metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

In [55]:
# Print results
print("Random Forest RMSE:", rmse_rf)
print("Random Forest MSE:", mse_rf)
print("Random Forest MAE:", mae_rf)
print("Random Forest R² Score:", r2_rf)

Random Forest RMSE: 0.024339732315889723
Random Forest MSE: 0.0005924225692091665
Random Forest MAE: 0.009336500971236207
Random Forest R² Score: 0.6892936799936495


Phase 2: Deep Learning & NLP-Aware Models

3. XGBoost (handles dense + sparse features well)

In [186]:
import xgboost as xgb
from xgboost import XGBRegressor

In [121]:
# Ensure y is a Series
if isinstance(y, pd.DataFrame):
    y = y.iloc[:, 0]

In [122]:
# Ensure all X values are numeric 
X = X.select_dtypes(include=[np.number])  # drops non-numeric columns


In [123]:
# Convert to NumPy arrays
X_np = X.values
y_np = y.values

In [199]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=42)


In [200]:
# Train XGBoost
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)


In [201]:
# Predict
y_pred_xgb = xgb.predict(X_test)

In [202]:
# Evaluation
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

In [203]:
# Output
print("XGBoost RMSE:", rmse_xgb)
print("XGBoost MSE:", mse_xgb)
print("XGBoost MAE:", mae_xgb)
print("XGBoost R² Score:", r2_xgb)

XGBoost RMSE: 0.021810871634427086
XGBoost MSE: 0.0004757141214534561
XGBoost MAE: 0.00978513802676195
XGBoost R² Score: 0.7505034552461977


4. Keras Model with DistilBERT + Structured Features

In [65]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch




In [66]:
# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
bert_model.eval();  # inference mode

In [67]:
#Step 2: Encode Descriptions with BERT
def get_bert_cls_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token

In [68]:
bert_embeddings = df['description_clean'].apply(get_bert_cls_embedding)
X_bert = np.vstack(bert_embeddings.to_numpy())


In [69]:
#Step 3: Prepare Structured Features and Target
X_structured = final_features.to_numpy()  # already encoded
y = df['price'].to_numpy()


In [70]:
#Step 4: Combine BERT + Structured Features
X_train_bert, X_test_bert, X_train_struct, X_test_struct, y_train_bert, y_test_bert = train_test_split(
    X_bert, X_structured, y, test_size=0.2, random_state=42
)

In [71]:
#Step 5: Build the Keras Model
from keras.models import Model
from keras.layers import Input, Dense, Concatenate, Dropout

# Inputs
bert_input = Input(shape=(768,))
struct_input = Input(shape=(X_structured.shape[1],))




In [72]:
# BERT path
bert_branch = Dense(256, activation='relu')(bert_input)
bert_branch = Dropout(0.3)(bert_branch)

In [73]:
# Structured path
struct_branch = Dense(64, activation='relu')(struct_input)


In [74]:
# Combine both
combined = Concatenate()([bert_branch, struct_branch])
x = Dense(128, activation='relu')(combined)
x = Dropout(0.3)(x)
output = Dense(1)(x)

In [75]:
model = Model(inputs=[bert_input, struct_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 768)]                0         []                            
                                                                                                  
 dense (Dense)               (None, 256)                  196864    ['input_1[0][0]']             
                                                                                                  
 input_2 (InputLayer)        [(None, 9984)]               0         []                            
                                                                                                  
 dropout (Dropout)           (None, 256)                  0         ['dense[0][0]']               
                                                                                             

In [76]:
#Step 6: Train the Model
model.fit([X_train_bert, X_train_struct], y_train_bert,
          validation_data=([X_test_bert, X_test_struct], y_test_bert),
          epochs=10, batch_size=16)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x16b82c09050>

In [77]:
#Predictions
y_pred_bert = model.predict([X_test_bert, X_test_struct])




In [78]:
bert_mse = mean_squared_error(y_test_bert, y_pred_bert)
bert_mae = mean_absolute_error(y_test_bert, y_pred_bert)
bert_rmse = np.sqrt(mean_squared_error(y_test_bert, y_pred_bert))
bert_r2 = r2_score(y_test_bert, y_pred_bert)

print(f"MAE: {bert_mae:.2f}")
print(f"MSE:", bert_mse)
print(f"RMSE: {bert_rmse:.2f}")
print(f"R² Score: {bert_r2:.4f}")

MAE: 0.02
MSE: 0.0013924203214238221
RMSE: 0.04
R² Score: 0.2697
