In [None]:
import pandas as pd

# Read the raw listings CSV into `df` and report its (rows, columns) shape.
# NOTE(review): the path is an absolute Colab location; the cell fails with
# FileNotFoundError when the file has not been uploaded to /content first.
csv_path = "/content/Real Estate Data V21.csv"
df = pd.read_csv(csv_path)
print(df.shape)


FileNotFoundError: [Errno 2] No such file or directory: '/content/Real Estate Data V21.csv'

In [None]:
!pip install category_encoders




In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder


# Clean price column
def clean_price(x):
    """Convert a raw price value into a numeric amount in rupees.

    The unit is detected by substring on the upper-cased text: "CR" means
    Crores (x 10,000,000), otherwise "L" means Lakhs (x 100,000), otherwise
    the number is taken as plain rupees.

    Parameters
    ----------
    x : Any
        Raw cell value, e.g. "₹1.5 Cr", "78 L", "Rs. 50 L" or "1,50,000".

    Returns
    -------
    float
        Amount in rupees, or ``np.nan`` when the value is missing or contains
        no number.
    """
    if pd.isna(x):
        return np.nan

    # Thousands separators are dropped so "1,50,000" is read as one number
    # instead of stopping at the first comma.
    text = str(x).strip().upper().replace(",", "")

    # Require at least one digit: a bare "." (as in "RS. 50 L") must not be
    # picked up as the number, which previously crashed float().
    number = re.search(r"\d+(?:\.\d+)?|\.\d+", text)
    if number is None:
        return np.nan
    amount = float(number.group())

    if "CR" in text:  # price quoted in Crores
        return amount * 10000000
    if "L" in text:  # price quoted in Lakhs
        return amount * 100000
    return amount  # price already in rupees

# Extract number of rooms

def extract_rooms(text):
    """Pull a room count out of a free-text property title.

    Patterns are tried in priority order on the lower-cased text: "<n> bhk"
    (decimals allowed, returned as float), then "<n> bed"/"<n> bedroom"
    (returned as int), then "<n> rk" (returned as int). Returns None when
    none of them match.
    """
    lowered = str(text).lower()

    # (regex, converter) pairs; the first pattern that matches wins.
    rules = (
        (r'(\d+(\.\d+)?)\s*bhk', float),
        (r'(\d+)\s*bed(room)?', int),
        (r'(\d+)\s*rk', int),
    )
    for pattern, convert in rules:
        found = re.search(pattern, lowered)
        if found:
            return convert(found.group(1))
    return None

# --- Derive cleaned columns from the raw listing fields ---
# The lower-cased title is stored as its own column and mined for the room count.
titles_lower = df['Property Title'].astype(str).str.lower()
df['property_title'] = titles_lower
df['num_rooms'] = titles_lower.apply(extract_rooms)

# Price text -> rupees -> Lakhs (divide by 100000) to keep the numbers small.
df['Price'] = df['Price'].apply(clean_price) / 100000

# --- Impute missing numeric values with each column's own median ---
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
medians = df[num_cols].median()
for col in num_cols:
    df[col] = df[col].fillna(medians[col])


# --- Split "Location" into its first (Area) and last (City) comma-separated part ---
def _location_part(location, position):
    """Return the stripped comma-separated piece at `position`; missing values pass through."""
    if pd.notna(location):
        return location.split(',')[position].strip()
    return location


df['City'] = df['Location'].apply(_location_part, args=(-1,))
df['Area'] = df['Location'].apply(_location_part, args=(0,))

# --- Target-encode the categorical columns against Price ---
# NOTE(review): the encoder is fitted on the full frame, i.e. before the
# train/test split done in the modelling cells, so test-row prices feed the
# encodings. Confirm whether this leakage is acceptable.
te = TargetEncoder()
for source_col in ('Location', 'Balcony', 'City', 'Area'):
    df[f'{source_col}_encoded'] = te.fit_transform(df[source_col], df['Price'])

# --- Assemble the feature matrix ---
num_features = ['Total_Area', 'Price_per_SQFT', 'Baths', 'num_rooms']
encoded_features = ['Location_encoded', 'Balcony_encoded', 'City_encoded', 'Area_encoded']
features = num_features + encoded_features

# log1p is written back into df, so re-running this cell transforms the
# numeric columns a second time.
df[num_features] = np.log1p(df[num_features])

X = df[features].copy()
y = np.log1p(df['Price'])  # the target is modelled in log1p space

# Standardise only the numeric features; encoded columns are left as they are.
# NOTE(review): the scaler is also fitted before the train/test split.
scaler = StandardScaler().fit(X[num_features])
X[num_features] = scaler.transform(X[num_features])

print(" Preprocessing done. Now X and y are ready for training.")


 Preprocessing done. Now X and y are ready for training.


Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Hold out 20% of the rows for testing and train on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares linear regression on the training split.
lin_reg = LinearRegression().fit(X_train, y_train)

# Predictions for both splits.
y_train_pred = lin_reg.predict(X_train)
y_test_pred = lin_reg.predict(X_test)

# Report R², MAE and RMSE per split. The values are in the units of `y`,
# which the preprocessing cell builds as log1p(Price).
for heading, actual, predicted in (
    ("Linear Regression - Train:", y_train, y_train_pred),
    ("\nLinear Regression - Test:", y_test, y_test_pred),
):
    print(heading)
    print(" R²:", r2_score(actual, predicted))
    print(" MAE:", mean_absolute_error(actual, predicted))
    print(" RMSE:", np.sqrt(mean_squared_error(actual, predicted)))


Linear Regression - Train:
 R²: 0.7389607449433809
 MAE: 0.23812032152697027
 RMSE: 0.4479769284637357

Linear Regression - Test:
 R²: 0.7838239412710939
 MAE: 0.22531496264723874
 RMSE: 0.3988310045135304


Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Same 80/20 split and seed as the linear-regression cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Depth-limited regression tree (max_depth=10) with a fixed seed.
dt = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)


def _print_scores(heading, actual, predicted):
    """Print R², MAE and RMSE for one split under the given heading."""
    print(heading)
    print(" R²:", r2_score(actual, predicted))
    print(" MAE:", mean_absolute_error(actual, predicted))
    print(" RMSE:", np.sqrt(mean_squared_error(actual, predicted)))


_print_scores("Decision Tree - Train:", y_train, y_train_pred)
_print_scores("\nDecision Tree - Test:", y_test, y_test_pred)


Decision Tree - Train:
 R²: 0.9940873180086722
 MAE: 0.046798271715229336
 RMSE: 0.06742097018838518

Decision Tree - Test:
 R²: 0.9712051359346269
 MAE: 0.06603492719750219
 RMSE: 0.1455601958849262


Random Forest and Other Tree Ensembles

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [None]:
#importing library and models
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from category_encoders import TargetEncoder

# Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor,
    GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
)
from xgboost import XGBRegressor

In [None]:
# Reload the raw listings CSV so the pipeline below starts from untouched data.
# NOTE(review): absolute Colab path; the file must be uploaded to /content first.
csv_path = "/content/Real Estate Data V21.csv"
df = pd.read_csv(csv_path)

Data Cleaning Functions
and Room Extraction

In [None]:
# ============================================
# Data Cleaning Functions
# ============================================

def clean_price(x):
    """Convert Price strings (Cr/Lakh/₹) into numeric (₹).

    Unit detection works by substring on the upper-cased text: "CR" means
    Crores (x 10,000,000), otherwise "L" means Lakhs (x 100,000), otherwise
    the number is treated as plain rupees.

    Args:
        x: Raw cell value such as "₹1.5 Cr", "78 L", "Rs. 50 L" or "1,50,000".

    Returns:
        The amount in rupees as a float, or ``np.nan`` if the value is missing
        or holds no number.
    """
    if pd.isna(x):
        return np.nan

    # Strip thousands separators first; otherwise "1,50,000" would be read as 1.
    cleaned = str(x).strip().upper().replace(",", "")

    # The pattern needs a digit, so the dot in "RS." is skipped rather than
    # being handed to float() (which raised ValueError before).
    found = re.search(r"\d+(?:\.\d+)?|\.\d+", cleaned)
    if found is None:
        return np.nan
    value = float(found.group())

    if "CR" in cleaned:
        multiplier = 10000000  # Crores
    elif "L" in cleaned:
        multiplier = 100000  # Lakhs
    else:
        return value  # already rupees
    return value * multiplier

def extract_rooms(text):
    """Extract number of rooms from property title text.

    Checks, in order, for "<n> bhk" (float, decimals allowed), "<n> bed" or
    "<n> bedroom" (int) and "<n> rk" (int) in the lower-cased text. The first
    pattern that matches decides the result; None is returned if none match.
    """
    title = str(text).lower()
    candidates = [
        (re.search(r'(\d+(\.\d+)?)\s*bhk', title), float),
        (re.search(r'(\d+)\s*bed(room)?', title), int),
        (re.search(r'(\d+)\s*rk', title), int),
    ]
    # First successful match in priority order, converted with its own caster.
    return next(
        (cast(hit.group(1)) for hit, cast in candidates if hit),
        None,
    )


In [None]:
# ============================================
# Preprocessing
# ============================================

# Extract rooms from the lower-cased property title
df['property_title'] = df['Property Title'].astype(str).str.lower()
df['num_rooms'] = df['property_title'].apply(extract_rooms)

# Clean price (rupees) and convert to Lakhs
df['Price'] = df['Price'].apply(clean_price) / 100000

# Remove outliers: keep listings priced at most 200 Lakhs. Rows whose Price is
# NaN are dropped as well, because NaN <= 200 evaluates to False.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning or write to a
# temporary slice.
df = df[df['Price'] <= 200].copy()

# Fill missing numeric values with each column's median
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Split Location into City (last comma-separated part) and Area (first part);
# missing locations are passed through unchanged.
df['City'] = df['Location'].apply(lambda x: x.split(',')[-1].strip() if pd.notna(x) else x)
df['Area'] = df['Location'].apply(lambda x: x.split(',')[0].strip() if pd.notna(x) else x)


Target Encoder


In [None]:
# ============================================
# Target Encoding
# ============================================

# Target-encode each categorical column against Price, writing the result to
# "<column>_encoded". One encoder instance is re-fitted per column, so after
# the loop `te` holds the fit for 'Area'.
# NOTE(review): fitting happens before the train/test split made in a later
# cell, so test-row prices influence the encodings. Confirm this is intended.
te = TargetEncoder()
for source_col in ('Location', 'Balcony', 'City', 'Area'):
    df[f'{source_col}_encoded'] = te.fit_transform(df[source_col], df['Price'])


Feature Selection and Scaling

In [None]:
# ============================================
# Features, Log-transform & Scaling
# ============================================

# Raw numeric inputs first, then the target-encoded categoricals.
num_features = ['Total_Area', 'Price_per_SQFT', 'Baths', 'num_rooms']
encoded_features = ['Location_encoded', 'Balcony_encoded', 'City_encoded', 'Area_encoded']
features = num_features + encoded_features

# Log-transform numeric features. The result is written back into df, so
# re-running this cell applies log1p a second time.
df[num_features] = np.log1p(df[num_features])

X = df[features].copy()
y = np.log1p(df['Price'])  # the target is modelled in log1p space

# Standardise only the numeric features; encoded columns are left unscaled.
# NOTE(review): the scaler is fitted before the train/test split.
scaler = StandardScaler().fit(X[num_features])
X[num_features] = scaler.transform(X[num_features])


In [None]:
# ============================================
# Train-Test Split
# ============================================

# 80/20 hold-out split with a fixed seed so every model sees the same rows.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Train-Test Split

Define Models


In [None]:

# ============================================
# Define Models
# ============================================

# Every ensemble uses 200 estimators and the same seed; the single decision
# tree only takes the seed.
ensemble_args = {"n_estimators": 200, "random_state": 42}

# Display name -> unfitted regressor, in the order the models are trained.
models = {
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(**ensemble_args),
    "ExtraTrees": ExtraTreesRegressor(**ensemble_args),
    "GradientBoosting": GradientBoostingRegressor(**ensemble_args),
    "AdaBoost": AdaBoostRegressor(**ensemble_args),
    "Bagging": BaggingRegressor(**ensemble_args),
    "XGBoost": XGBRegressor(objective="reg:squarederror", **ensemble_args),
}


Training and evaluation

In [None]:
# ============================================
# Training & Evaluation
# ============================================

def _regression_scores(actual, predicted):
    """Return (R², MAE, RMSE) for one set of true and predicted values."""
    return (
        r2_score(actual, predicted),
        mean_absolute_error(actual, predicted),
        np.sqrt(mean_squared_error(actual, predicted)),
    )


# Fit every model, then score it on both splits. Targets and predictions are
# mapped back with expm1 (the inverse of the log1p applied to Price) so the
# metrics are computed on the Lakhs scale rather than in log space.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Undo the log1p transform on both the targets and the predictions.
    y_train_true, y_train_pred_true = np.expm1(y_train), np.expm1(y_train_pred)
    y_test_true, y_test_pred_true = np.expm1(y_test), np.expm1(y_test_pred)

    train_r2, train_mae, train_rmse = _regression_scores(y_train_true, y_train_pred_true)
    test_r2, test_mae, test_rmse = _regression_scores(y_test_true, y_test_pred_true)

    # R² is reported as a percentage; the gap is train minus test R².
    results[name] = {
        "Train_R2(%)": round(train_r2 * 100, 2),
        "Test_R2(%)": round(test_r2 * 100, 2),
        "Overfit_Gap(%)": round((train_r2 - test_r2) * 100, 2),
        "Train_MAE (Lakhs)": round(train_mae, 2),
        "Test_MAE (Lakhs)": round(test_mae, 2),
        "Train_RMSE (Lakhs)": round(train_rmse, 2),
        "Test_RMSE (Lakhs)": round(test_rmse, 2),
    }

# One row per model, best test R² first.
results_table = pd.DataFrame(results).T
results_df = results_table.sort_values(by="Test_R2(%)", ascending=False)
print("\n================== Optimized Model Performance ==================")
print(results_df)



                  Train_R2(%)  Test_R2(%)  Overfit_Gap(%)  Train_MAE (Lakhs)  \
RandomForest            99.74       99.24            0.50               0.58   
Bagging                 99.75       99.23            0.52               0.58   
XGBoost                 99.93       99.06            0.87               0.76   
GradientBoosting        99.05       98.97            0.08               2.38   
ExtraTrees             100.00       98.57            1.43               0.00   
DecisionTree           100.00       97.81            2.19               0.00   
AdaBoost                54.79       54.59            0.20              18.81   

                  Test_MAE (Lakhs)  Train_RMSE (Lakhs)  Test_RMSE (Lakhs)  
RandomForest                  1.32                2.24               3.81  
Bagging                       1.33                2.21               3.84  
XGBoost                       2.01                1.16               4.24  
GradientBoosting              2.57                4.31