Set up the environment

In [None]:
!pip install --upgrade pip
!pip install category_encoders




In [None]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab only).
# The recorded output shows that a repeated call just reports "already mounted".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# 🏠 Mount Google Drive (if data is stored there)
# Same call as the previous cell; when the drive is already mounted it only
# prints a notice (see the output recorded below this cell).

from google.colab import drive
drive.mount('/content/drive')


# 🧩 Install missing library (run only once)
# Invoking pip through sys.executable installs into the interpreter that runs this kernel.

import sys
!{sys.executable} -m pip install -q category_encoders


# 📦 Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Encoding
import category_encoders as ce


# 📂 Load Dataset
#
# The CSV is read from the mounted Google Drive. If the file was uploaded
# directly into the Colab session instead, point DATASET_PATH at
# "/content/Real Estate Data V21.csv".
DATASET_PATH = "/content/drive/MyDrive/Real Estate Data V21.csv"
df = pd.read_csv(DATASET_PATH)

# Show the first five rows as the cell output.
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


Data Cleaning and Preprocessing

In [None]:
import category_encoders as ce

# NOTE(review): no cell above this one creates X or y, so on a fresh kernel this
# cell only runs after the feature-selection cell further down has been executed.

# Split first, so the target encoder never sees the prices of the held-out rows.
# (Fitting it on all of X and y before splitting leaks test-set targets into the
# encoded features.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Work on copies so the column assignments below cannot write through to X.
X_train, X_test = X_train.copy(), X_test.copy()

# Identify categorical columns
categorical_cols = X.select_dtypes(include=["object"]).columns

# Encode categorical columns using TargetEncoder: learn the mapping from the
# training rows only, then apply that same mapping to the test rows.
if len(categorical_cols) > 0:
    encoder = ce.TargetEncoder(cols=categorical_cols)
    X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])


  df[col] = df[col].replace({"Yes": 1, "No": 0}).fillna(0).astype(float)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Total_Area"].fillna(df["Total_Area"].median(), inplace=True)


 Data cleaned successfully!
Shape: (14528, 9)


Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",19900000.0,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4.0,1.0
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,22500000.0,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6.0,1.0
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",10000000.0,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3.0,0.0
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,33300000.0,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5.0,1.0
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",4800000.0,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3.0,1.0


FEATURE SELECTION, SPLIT & ENCODING

In [None]:
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder

# Build the feature list: four core columns plus whichever of the optional
# amenity columns this dataset actually contains.
feature_cols = ["Total_Area", "Baths", "Balcony", "Location"]
extra_cols = []
for c in ["Swimming Pool", "Gym", "Parking", "Furnished"]:
    if c in df.columns:
        extra_cols.append(c)
feature_cols.extend(extra_cols)

# Feature matrix (an independent copy of the selected columns) and target.
X = df[feature_cols].copy()
y = df["Price"]

# Hold out 20% of the rows before any encoder is fitted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns stored as text must be turned into numbers before modelling.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
print("Categorical columns:", categorical_cols)

# Learn the target encoding from the training rows only, then reuse the fitted
# encoder on the test rows.
encoder = TargetEncoder(cols=categorical_cols)
train_encoded = encoder.fit_transform(X_train[categorical_cols], y_train)
test_encoded = encoder.transform(X_test[categorical_cols])
X_train[categorical_cols] = train_encoded
X_test[categorical_cols] = test_encoded

print("✅ Encoding completed successfully!")

Categorical columns: ['Location']
Encoding completed successfully!


MODEL TRAINING & EVALUATION

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Candidate regressors, keyed by the label shown in the comparison table.
# The tree-based models are seeded so repeated runs give the same numbers.
models = {}
models["Linear Regression"] = LinearRegression()
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(
    n_estimators=300, max_depth=20, random_state=42
)
models["Gradient Boosting"] = GradientBoostingRegressor(
    n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42
)

# Fit each model on the training split and score it on the held-out split.
results = []
for name, model in models.items():
    # fit() returns the estimator itself, so the prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results.append([name, r2, rmse, mae])

# One row per model: R², RMSE and MAE on the test split.
results_df = pd.DataFrame(results, columns=["Model", "R² Score", "RMSE", "MAE"])
print(results_df)


               Model  R² Score          RMSE           MAE
0  Linear Regression -0.064774  1.444713e+07  6.783075e+06
1      Decision Tree -0.194861  1.530423e+07  6.085601e+06
2      Random Forest  0.009940  1.393104e+07  5.510999e+06
3  Gradient Boosting  0.064684  1.354042e+07  5.769203e+06


Training, testing, and comparison

In [None]:
# Identify the text columns that will need target encoding.
#
# The encoder is deliberately NOT fitted in this cell. Fitting a TargetEncoder on
# every row of df with df["Price"] as the target, and overwriting df with the
# result, bakes each row's own price into its features before any train/test
# split exists and destroys the raw strings needed to encode new listings.
# The modelling cell that follows performs the encoding itself.
categorical_cols = df.select_dtypes(include=["object"]).columns


In [None]:
# Imports
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# --- Assume df is already cleaned and loaded here ---

# 1. Define features and target
# Price_per_SQFT is excluded together with the target: the data preview shows it
# is Price divided by Total_Area, so keeping it hands the models the answer.
y = df["Price"]
X = df.drop(columns=["Price", "Price_per_SQFT"], errors="ignore")

# 2. Train-test split — done before any encoding so the held-out prices are
#    never seen by the target encoder.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Copies, so the column assignments below cannot write through to X.
X_train, X_test = X_train.copy(), X_test.copy()

# 3. Encode categorical features: fit on the training rows, apply to the test rows.
#    If df was already encoded by an earlier cell there are no text columns left
#    and this step is skipped.
categorical_cols = X.select_dtypes(include=["object"]).columns
if len(categorical_cols) > 0:
    split_encoder = ce.TargetEncoder(cols=categorical_cols)
    X_train[categorical_cols] = split_encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test[categorical_cols] = split_encoder.transform(X_test[categorical_cols])

# 4. Define models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
}

# 5. Train, test and compare
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    results.append([name, r2, rmse, mae])

# 6. Show results
results_df = pd.DataFrame(results, columns=["Model", "R² Score", "RMSE", "MAE"])
print(results_df)

# 7. Prepare the full feature matrix for the final-model cell, which trains on all
#    rows: here the encoder may legitimately be fitted on the whole dataset.
#    It is fitted on the text columns only, so it can later transform a frame
#    that contains just those columns.
if len(categorical_cols) > 0:
    encoder = ce.TargetEncoder(cols=categorical_cols)
    X[categorical_cols] = encoder.fit_transform(X[categorical_cols], y)


               Model  R² Score           RMSE           MAE
0  Linear Regression  0.999990   43161.833052   6620.863736
1      Decision Tree  0.999863  163746.419451  12660.013765
2      Random Forest  0.999243  385279.078807  21458.661331
3  Gradient Boosting  0.999968   79067.953020  14247.247585


SAVE BEST MODEL + ENCODER

In [None]:
import joblib
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor

# 1 Pick best model — you can change this later if another model performs better
best_model = RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42)

# 2 Make sure the training matrix is numeric.
raw_text_cols = X.select_dtypes(include=["object"]).columns.tolist()
if raw_text_cols:
    # X still holds raw strings: fit the encoder here, on exactly the data the
    # final model is trained on, and keep X itself untouched.
    encoder = ce.TargetEncoder(cols=raw_text_cols)
    X_final = X.copy()
    X_final[raw_text_cols] = encoder.fit_transform(X_final[raw_text_cols], y)
else:
    # X was already encoded by an earlier cell.
    # NOTE(review): `encoder` is then presumably the TargetEncoder that produced
    # those numbers — confirm it is the one fitted when the data was encoded.
    X_final = X

# 3 Train best model on the FULL dataset (after cleaning + encoding) and save it
best_model.fit(X_final, y)
joblib.dump(best_model, "price_model.pkl")
print("✅ Model trained and saved as price_model.pkl")

# 4 Save the encoder that maps raw strings to the numbers the model was trained
#   on. A fresh encoder must not be refitted on df[categorical_cols] here: once df
#   has been overwritten with encoded values, such an encoder would learn a
#   mapping from numbers and could not encode the raw text of a new listing.
joblib.dump(encoder, "encoder.pkl")
print("✅ Encoder trained and saved as encoder.pkl")

✅ Model trained and saved as price_model.pkl
✅ Encoder trained and saved as encoder.pkl


Load and Predict

In [None]:
import pandas as pd
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
import joblib
import re

# Load dataset
df = pd.read_csv("Real Estate Data V21.csv", on_bad_lines='skip', engine='python')


# --- 1. Convert Yes/No to 1/0 safely ---
# A per-value lookup leaves every entry that is not "Yes"/"No" untouched and
# avoids the downcasting FutureWarning that Series.replace emits on text columns.
yes_no = {"Yes": 1, "No": 0}
for col in ["Balcony", "Baths"]:
    if col in df.columns:
        recoded = df[col].map(lambda v: yes_no.get(v, v))
        # Anything that still is not a number becomes NaN, then 0.
        df[col] = pd.to_numeric(recoded, errors="coerce").fillna(0)

# --- 2. Ensure Total_Area is numeric ---
if "Total_Area" in df.columns:
    total_area = pd.to_numeric(df["Total_Area"], errors="coerce")
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df["Total_Area"]: the in-place form acts on a temporary object, triggers a
    # FutureWarning and stops updating df in pandas 3.0.
    df["Total_Area"] = total_area.fillna(total_area.median())

# --- 3. Clean Price column robustly ---
def price_to_float(price):
    """Convert a listing price to a plain number of rupees.

    Strings such as "₹1.99 Cr", "₹48.0 L" or "1,50,000" are parsed: the rupee
    sign and thousands separators are removed, the first number in the text is
    read, and a directly following "cr" or "l" scales it by 1e7 or 1e5.

    Args:
        price: Raw cell value — a string, an int/float, or anything else.

    Returns:
        A float for strings that contain a number, the value itself for
        int/float input, and None for strings without a number or any other type.
    """
    if isinstance(price, str):
        price = price.replace("₹", "").replace(",", "").strip().lower()
        # The number must contain at least one digit. A pattern that also accepts
        # a lone "." captures stray punctuation (e.g. "call for price.") and
        # then makes float() raise ValueError.
        match = re.search(r"(\d+(?:\.\d*)?|\.\d+)\s*(cr|l)?", price)
        if match is None:
            return None
        value = float(match.group(1))
        unit = match.group(2)
        if unit == "cr":
            return value * 1e7
        if unit == "l":
            return value * 1e5
        return value
    if isinstance(price, (int, float)):
        return price
    return None

# Parse every price and keep only the listings whose price could be parsed.
parsed_prices = df["Price"].apply(price_to_float)
df["Price"] = parsed_prices
df = df[df["Price"].notna()]

# --- 4. Define features and target ---
# Text columns are listed once and reused for both the feature list and the encoder.
categorical_cols = ["Location", "Name", "Description", "Property Title"]
feature_cols = ["Total_Area", "Baths", "Balcony"] + categorical_cols
X = df.loc[:, feature_cols].copy()
y = df["Price"]

# --- 5. Encode categorical features ---
# The encoder is fitted on the text columns only, so it can later transform a
# frame that contains just those columns.
encoder = ce.TargetEncoder(cols=categorical_cols)
encoded_text = encoder.fit_transform(X[categorical_cols], y)
X[categorical_cols] = encoded_text

# --- 6. Train Random Forest model ---
model = RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42)
model.fit(X, y)

# --- 7. Save model and encoder ---
for artefact, filename in ((model, "price_model.pkl"), (encoder, "encoder.pkl")):
    joblib.dump(artefact, filename)

print("Model and encoder trained and saved successfully!")


  df[col] = df[col].replace({"Yes": 1, "No": 0})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Total_Area"].fillna(df["Total_Area"].median(), inplace=True)


Model and encoder trained and saved successfully!


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# NOTE(review): X_test and y_test are not created in this cell; they are whatever
# the most recently executed split cell left in the kernel.

# Restrict the held-out rows to the columns the model was trained on.
X_test_model = X_test.loc[:, feature_cols].copy()

# Replace the text columns with their target-encoded values.
X_test_model.loc[:, categorical_cols] = encoder.transform(X_test_model[categorical_cols])

# Predict prices for the held-out rows.
y_test_pred = model.predict(X_test_model)

# Score the predictions; RMSE is the square root of the mean squared error.
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = mean_squared_error(y_test, y_test_pred) ** 0.5

for label, value in (
    ("Test R²:", r2_test),
    ("Test MAE:", mae_test),
    ("Test RMSE:", rmse_test),
):
    print(label, value)


Test R²: -0.0007561638222575162
Test MAE: 7729434.618031658
Test RMSE: 14006094.367440464


In [None]:
import pandas as pd
import joblib
import category_encoders as ce

# Restore the artefacts written by the training cell.
model = joblib.load("price_model.pkl")
encoder = joblib.load("encoder.pkl")

# Two example listings, given column by column; row i of every list belongs to
# property i+1.
new_data = pd.DataFrame({
    "Total_Area": [1500, 2000],
    "Baths": [2, 3],
    "Balcony": [1, 2],
    "Location": ["Rampura, Bangalore", "MG Road, Bangalore"],
    "Name": ["Apartment A", "Apartment B"],
    "Description": ["2BHK near park", "3BHK with gym"],
    "Property Title": ["Luxury Apartment", "Premium Apartment"],
})

# Turn the text columns into numbers with the saved encoder.
categorical_cols = ["Location", "Name", "Description", "Property Title"]
encoded_text = encoder.transform(new_data[categorical_cols])
new_data[categorical_cols] = encoded_text

# Predict prices
predicted_prices = model.predict(new_data)

# Report one line per property, numbered from 1.
for i, price in enumerate(predicted_prices):
    print(f"Property {i+1} predicted price: ₹{price:,.2f}")


Property 1 predicted price: ₹10,699,666.67
Property 2 predicted price: ₹10,699,666.67


Evaluate Model Performance

In [None]:
import joblib

# Write the model and encoder currently held in the kernel to disk.
# The last call's return value (the list of written file names) is shown as the cell output.
joblib.dump(model, "price_model.pkl")
joblib.dump(encoder, "encoder.pkl")


['encoder.pkl']

In [None]:
import os
# List the entries of the current working directory to confirm the .pkl files were written.
os.listdir()


['.config',
 'price_model.pkl',
 'drive',
 'encoder.pkl',
 'Real Estate Data V21.csv',
 'sample_data']

In [None]:
# Convert the Yes/No flags to numbers.
# Coercion must come before the float cast: casting first raises ValueError on
# any entry that is neither "Yes"/"No" nor numeric, so errors="coerce" never
# gets a chance to turn it into NaN.
yes_no = {"Yes": 1, "No": 0}
for col in ["Balcony", "Baths"]:
    if col in df.columns:
        # Per-value lookup: entries other than "Yes"/"No" pass through unchanged.
        recoded = df[col].map(lambda v: yes_no.get(v, v))
        df[col] = pd.to_numeric(recoded, errors="coerce").fillna(0).astype(float)


In [None]:
# Ensure Total_Area is numeric and fill missing with median
if "Total_Area" in df.columns:
    total_area = pd.to_numeric(df["Total_Area"], errors="coerce")
    # Take the median of the coerced values: the raw column may still contain
    # text, on which a median cannot be computed.
    df["Total_Area"] = total_area.fillna(total_area.median())

# Clean Price column robustly
import re

def price_to_float(price):
    """Convert a listing price to a plain number of rupees.

    Strings such as "₹1.99 Cr", "₹48.0 L" or "1,50,000" are parsed: the rupee
    sign and thousands separators are removed, the first number in the text is
    read, and a directly following "cr" or "l" scales it by 10,000,000 or 100,000.

    Args:
        price: Raw cell value — a string, an int/float, or anything else.

    Returns:
        A float for strings that contain a number, the value itself for
        int/float input, and None for strings without a number or any other type.
    """
    if isinstance(price, str):
        price = price.replace("₹", "").replace(",", "").strip().lower()
        # The number must contain at least one digit. A pattern that also accepts
        # a lone "." captures stray punctuation (e.g. "call for price.") and
        # then makes float() raise ValueError.
        match = re.search(r"(\d+(?:\.\d*)?|\.\d+)\s*(cr|l)?", price)
        if match is None:
            return None
        value = float(match.group(1))
        unit = match.group(2)
        if unit == "cr":
            return value * 10000000
        if unit == "l":
            return value * 100000
        return value
    if isinstance(price, (int, float)):
        return price
    return None

# Parse every price, then keep only the rows whose price could be parsed.
df["Price"] = df["Price"].apply(price_to_float)
df = df[df["Price"].notna()]


In [None]:
#Step 11.5: Train-Test Split (rerun before tuning)
from sklearn.model_selection import train_test_split

# Columns fed to the model, in the order the model will expect them.
feature_cols = [
    "Total_Area",
    "Baths",
    "Balcony",
    "Location",
    "Name",
    "Description",
    "Property Title",
]

# Feature matrix (an independent copy) and target.
X, y = df.loc[:, feature_cols].copy(), df["Price"]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


Data preprocessing and feature setup

In [None]:
# --- Step: Encode Yes/No Columns ---
# Candidate amenity flags; columns that are absent from df are skipped.
binary_cols = ["Swimming Pool", "Gym", "Parking", "Furnished"]  # update with actual Yes/No columns in your dataset

for col in binary_cols:
    if col not in df.columns:
        continue
    # "Yes" -> 1, "No" -> 0, missing -> 0.
    recoded = df[col].replace({"Yes": 1, "No": 0})
    df[col] = recoded.fillna(0)


In [None]:
from sklearn.model_selection import train_test_split

#  Step: Define Feature Columns Safely
# Columns that are always used.
base_cols = [
    "Total_Area", "Baths", "Balcony",
    "Location", "Name", "Description", "Property Title",
]

# Optional Yes/No columns: keep only those present in df.
binary_cols = list(
    filter(lambda name: name in df.columns, ["Swimming Pool", "Gym", "Parking", "Furnished"])
)

# Final feature list: base columns first, optional flags after them.
feature_cols = [*base_cols, *binary_cols]

X = df[feature_cols].copy()
y = df["Price"]

# Train-Test Split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
import category_encoders as ce

# Text columns to be replaced by target-encoded numbers.
categorical_cols = ["Location", "Name", "Description", "Property Title"]
encoder = ce.TargetEncoder(cols=categorical_cols)

# Learn the encoding from the training rows only ...
train_text = X_train[categorical_cols]
X_train[categorical_cols] = encoder.fit_transform(train_text, y_train)

# ... and apply the fitted encoder, unchanged, to the test rows.
test_text = X_test[categorical_cols]
X_test[categorical_cols] = encoder.transform(test_text)


In [None]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib
import re

# --- 1. Load dataset ---
# Relative path: the CSV is read from the kernel's current working directory.
df = pd.read_csv("Real Estate Data V21.csv")

# --- 2. Clean Price column ---
def price_to_float(price):
    """Convert a listing price to a plain number of rupees.

    Strings such as "₹1.99 Cr", "₹48.0 L" or "1,50,000" are parsed: the rupee
    sign and thousands separators are removed, the first number in the text is
    read, and a directly following "cr" or "l" scales it by 10,000,000 or 100,000.

    Args:
        price: Raw cell value — a string, an int/float, or anything else.

    Returns:
        A float for strings that contain a number, the value itself for
        int/float input, and None for strings without a number or any other type.
    """
    if isinstance(price, str):
        price = price.replace("₹", "").replace(",", "").strip().lower()
        # The number must contain at least one digit. A pattern that also accepts
        # a lone "." captures stray punctuation (e.g. "call for price.") and
        # then makes float() raise ValueError.
        match = re.search(r"(\d+(?:\.\d*)?|\.\d+)\s*(cr|l)?", price)
        if match is None:
            return None
        value = float(match.group(1))
        unit = match.group(2)
        if unit == "cr":
            return value * 10000000
        if unit == "l":
            return value * 100000
        return value
    if isinstance(price, (int, float)):
        return price
    return None

df["Price"] = df["Price"].apply(price_to_float)
df = df.dropna(subset=["Price"])

#  3. Convert Yes/No columns to numeric
binary_cols = ["Balcony", "Baths", "Swimming Pool", "Gym", "Parking", "Furnished"]
yes_no = {"Yes": 1, "No": 0}
for col in binary_cols:
    if col in df.columns:
        # Per-value lookup: entries other than "Yes"/"No" pass through unchanged
        # (and no downcasting FutureWarning is raised, unlike Series.replace).
        recoded = df[col].map(lambda v: yes_no.get(v, v))
        # Coerce first and fill afterwards, so values that fail to parse are
        # also replaced by 0 instead of being left as NaN for the model.
        df[col] = pd.to_numeric(recoded, errors="coerce").fillna(0)

#  4. Ensure numeric columns are proper
if "Total_Area" in df.columns:
    total_area = pd.to_numeric(df["Total_Area"], errors="coerce")
    # Median of the coerced values; the raw column may still contain text.
    df["Total_Area"] = total_area.fillna(total_area.median())

# 5. Define features and target
base_cols = ["Total_Area", "Baths", "Balcony", "Location", "Name", "Description", "Property Title"]
existing_binary_cols = [col for col in ["Swimming Pool", "Gym", "Parking", "Furnished"] if col in df.columns]
feature_cols = base_cols + existing_binary_cols

X = df[feature_cols].copy()
y = df["Price"]

#  6. Train-Test Split — before encoding, so the encoder never sees the prices
#     of the held-out rows (fitting it on all rows first inflates the test score).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = X_train.copy(), X_test.copy()

# 7. Encode categorical columns: fit on the training rows, apply to the test rows.
#    The encoder is fitted on the text columns only, so it can later transform a
#    frame that contains just those columns.
categorical_cols = [col for col in ["Location", "Name", "Description", "Property Title"] if col in X.columns]
if categorical_cols:
    encoder = ce.TargetEncoder(cols=categorical_cols)
    X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])

# 8. Train Random Forest
model = RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42)
model.fit(X_train, y_train)

# 9. Predictions and Evaluation
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("R²:", r2, "RMSE:", rmse, "MAE:", mae)

# 10. Save model and encoder
joblib.dump(model, "price_model.pkl")
if categorical_cols:
    joblib.dump(encoder, "encoder.pkl")
print("Model and encoder saved successfully.")


  df[col] = df[col].replace({"Yes": 1, "No": 0}).fillna(0)


R²: 0.9994834236833834 RMSE: 318214.6405145357 MAE: 20468.406744666183
Model and encoder saved successfully.


Turning the overfitted model into a regularized one

In [None]:
import pandas as pd

# Load dataset again
df = pd.read_csv("/content/Real Estate Data V21.csv")

# Define feature columns: drop the target, the free-text columns, and
# Price_per_SQFT. The data preview shows Price_per_SQFT is Price divided by
# Total_Area, so using it as a feature would leak the target into the model.
excluded_cols = {"Price", "Price_per_SQFT", "Name", "Property Title", "Description"}
feature_cols = [col for col in df.columns if col not in excluded_cols]

# Features and target
# NOTE(review): df was just reloaded, so Price is still raw text such as
# "₹1.99 Cr" here — it has to be converted to numbers before a model is fitted.
X = df[feature_cols].copy()
y = df["Price"]


In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
# test_size=0.2 holds out 20% of the rows; random_state=42 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# 3. Encode Yes/No columns
# NOTE(review): X and the train/test split were copied from df before this cell
# runs, so they still contain the original "Yes"/"No" strings — confirm whether
# this step should happen before the features are selected.
binary_cols = ["Balcony","Baths","Swimming Pool","Gym","Parking","Furnished"]
yes_no = {"Yes": 1, "No": 0}
for col in binary_cols:
    if col in df.columns:
        # Per-value lookup instead of Series.replace: no downcasting
        # FutureWarning, and entries other than "Yes"/"No" pass through unchanged.
        recoded = df[col].map(lambda v: yes_no.get(v, v))
        # Coerce before casting so a stray non-numeric string becomes 0 rather
        # than making the float cast raise ValueError.
        df[col] = pd.to_numeric(recoded, errors="coerce").fillna(0).astype(float)


  .replace({"Yes": 1, "No": 0})
