In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from category_encoders import TargetEncoder   # pip install category-encoders


In [2]:
pip install category_encoders


Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
Installing collected packages: category_encoders
Successfully installed category_encoders-2.8.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
df = pd.read_csv("Real Estate Data.csv")


In [4]:
def price_in_lakhs(val):
    val = val.replace("₹", "").replace(",", "").strip()
    if "Cr" in val:
        return float(re.findall(r"[\d.]+", val)[0]) * 100    # 1 Cr = 100 Lakhs
    if "L" in val:
        return float(re.findall(r"[\d.]+", val)[0])          # already in Lakhs
    return np.nan

df["Price_Lakhs"] = df["Price"].apply(price_in_lakhs)


In [5]:
df[["Area","City"]] = df["Location"].str.rsplit(",", n=1, expand=True)
df["Area"] = df["Area"].str.strip()
df["City"] = df["City"].str.strip()


In [6]:
def extract_bhk(txt):
    match = re.search(r"(\d+)\s*BHK", str(txt))
    return int(match.group(1)) if match else np.nan

df["BHK"] = df["Property Title"].apply(extract_bhk)


In [7]:
num_cols = ["Total_Area", "Price_per_SQFT", "Baths", "BHK"]
for col in num_cols:
    df[f"log_{col}"] = np.log1p(df[col])


In [9]:
# --- Ensure target column has no missing values ---
df = df.dropna(subset=["Price_Lakhs"]).reset_index(drop=True)

# --- Ensure categorical columns have no missing values ---
df["Area"] = df["Area"].fillna("Unknown")
df["City"] = df["City"].fillna("Unknown")

# --- Target (mean) encode using simple groupby mapping ---
for col in ["Area", "City"]:
    mean_map = df.groupby(col)["Price_Lakhs"].mean()
    df[col] = df[col].map(mean_map)



In [10]:
from sklearn.preprocessing import StandardScaler

scale_cols = [f"log_{c}" for c in ["Total_Area", "Price_per_SQFT", "Baths", "BHK"]]
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# ---------- 1️⃣  Select Features & Target ----------
X = df[["Area", "City"] + scale_cols]
y = df["Price_Lakhs"]

# ---------- 2️⃣  Remove rows where target is missing ----------
# (Important if any Price_Lakhs rows were dropped earlier)
mask = y.notna()
X = X.loc[mask]
y = y.loc[mask]

# ---------- 3️⃣  Impute any NaNs in features ----------
imputer = SimpleImputer(strategy="median")   # or "mean"
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# ---------- 4️⃣  Train/Test Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------- 5️⃣  Train Linear Regression ----------
lr = LinearRegression()
lr.fit(X_train, y_train)

# ---------- 6️⃣  Predict ----------
y_pred = lr.predict(X_test)



In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

mse  = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f} Lakhs")
print(f"MAE  : {mae:.2f} Lakhs")
print(f"R²   : {r2:.3f}")


MSE  : 9610.63
RMSE : 98.03 Lakhs
MAE  : 38.86 Lakhs
R²   : 0.800
