In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib


In [4]:
# Read the raw listings CSV. The pure-Python parser is used, and rows it cannot
# parse are skipped instead of raising an error.
df = pd.read_csv(
    "/content/Real Estate Data V21.csv",
    on_bad_lines='skip',
    engine="python",
)


In [6]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,Price_in_Lakhs
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,199.0
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes,225.0
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No,100.0
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes,333.0
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,48.0


In [7]:
# Summarize column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11188 entries, 0 to 11187
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            11188 non-null  object 
 1   Property Title  11188 non-null  object 
 2   Price           11188 non-null  object 
 3   Location        11188 non-null  object 
 4   Total_Area      11188 non-null  int64  
 5   Price_per_SQFT  11188 non-null  float64
 6   Description     11188 non-null  object 
 7   Baths           11188 non-null  int64  
 8   Balcony         11188 non-null  object 
 9   Price_in_Lakhs  11188 non-null  float64
dtypes: float64(2), int64(2), object(6)
memory usage: 874.2+ KB


In [8]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
Name,0
Property Title,0
Price,0
Location,0
Total_Area,0
Price_per_SQFT,0
Description,0
Baths,0
Balcony,0
Price_in_Lakhs,0


In [5]:
def parse_price(price_str):
    """Convert a listing price such as '₹1.99 Cr' or '₹48.0 L' to lakhs.

    Parameters
    ----------
    price_str : str or scalar
        Raw price text. The rupee sign and thousands separators are ignored.

    Returns
    -------
    float
        The price in lakhs, or ``np.nan`` when the input is missing or
        contains no number.

    Notes
    -----
    * Crore units ('Cr', 'crore', 'crores') multiply the number by 100; lakh
      units ('L', 'Lac', 'Lakh', and plurals) return it unchanged. Units are
      matched case-insensitively and only as standalone tokens, so letters
      inside other words are not mistaken for a unit.
    * Without a unit, a number above 10000 is treated as plain rupees and
      divided by 100000; a smaller number is treated as already in lakhs.
    """
    if pd.isna(price_str):
        return np.nan

    text = str(price_str).replace('₹', '').replace(',', '').strip()

    # Require at least one digit so stray punctuation (e.g. the dot in
    # "Rs. 50 L") is never handed to float().
    number = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
    if number is None:
        return np.nan
    val = float(number.group())

    if re.search(r'(?<![a-z])cr(?:ores?)?(?![a-z])', text, flags=re.IGNORECASE):
        return val * 100
    if re.search(r'(?<![a-z])l(?:a(?:kh|c)s?)?(?![a-z])', text, flags=re.IGNORECASE):
        return val
    return val / 100000 if val > 10000 else val

# Recompute Price_in_Lakhs from the raw 'Price' text; this overwrites the
# Price_in_Lakhs column that is already present in the loaded data.
df['Price_in_Lakhs'] = df['Price'].apply(parse_price)

In [9]:
# Numeric conversion
def _coerce_with_median(series):
    """Coerce a column to numeric and fill unparseable entries with the median.

    The median is taken from the coerced values, so it is well defined even
    when the raw column still contains non-numeric text.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    return numeric.fillna(numeric.median())


df['Total_Area'] = _coerce_with_median(df['Total_Area'])
df['Price_per_SQFT'] = _coerce_with_median(df['Price_per_SQFT'])
# Missing or unparseable bath counts fall back to 1.
df['Baths'] = pd.to_numeric(df['Baths'], errors='coerce').fillna(1)

In [10]:
# Trim the tails: keep only rows inside the 1st-99th percentile band of each
# column. Columns are processed one after another, so each band is measured on
# the rows that survived the previous column's filter.
for col in ('Total_Area', 'Price_in_Lakhs', 'Price_per_SQFT'):
    lower, upper = df[col].quantile(0.01), df[col].quantile(0.99)
    df = df[df[col].between(lower, upper)]

In [11]:

# Feature engineering
# Work on an independent copy: `df` is the result of boolean filtering, and
# adding columns to such a slice triggers pandas' SettingWithCopyWarning.
df = df.copy()

df['log_area'] = np.log1p(df['Total_Area'])
# Despite the name, the divisor is the bath count (floored at 1 so the
# division never hits zero).
df['Area_per_Room'] = df['Total_Area'] / np.maximum(df['Baths'], 1)

# Location split
# The first comma-separated token is used as the locality and the last as the
# city. Vectorized .str access propagates missing locations as NaN (instead of
# raising inside a per-row lambda); those are then labelled 'Unknown'.
df['Location_split'] = df['Location'].str.split(',')
df['Locality'] = df['Location_split'].str[0].str.strip().fillna('Unknown')
df['City'] = df['Location_split'].str[-1].str.strip().fillna('Unknown')

# Keep the 30 most frequent localities and bucket the rest as 'Other'.
top_localities = df['Locality'].value_counts().nlargest(30).index.tolist()
df.loc[~df['Locality'].isin(top_localities), 'Locality'] = 'Other'


In [12]:
# Model inputs, grouped by how the preprocessor treats them, and the target.
numeric_features = [
    'Total_Area',
    'Price_per_SQFT',
    'Baths',
    'log_area',
    'Area_per_Room',
]
categorical_features = ['City', 'Locality']

# NOTE(review): Price_per_SQFT is presumably derived from the sale price;
# confirm it is actually known at prediction time (possible target leakage).
y = df['Price_in_Lakhs']
X = df[[*numeric_features, *categorical_features]]

In [13]:
# Preprocessing: numeric columns are passed through untouched; categorical
# columns are one-hot encoded, with categories not seen during fit ignored.
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [14]:
# Random forest pipeline: preprocessing and the regressor are chained so the
# same transformations run at both fit and predict time.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    (
        'model',
        RandomForestRegressor(
            n_estimators=700,
            max_features='sqrt',
            max_depth=13,
            min_samples_leaf=3,
            min_samples_split=5,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
(
    X_train_global,
    X_test_global,
    y_train_global,
    y_test_global,
) = train_test_split(X, y, random_state=42, test_size=0.2)

In [17]:
# Fit the preprocessing + random forest pipeline on the training split only.
pipeline.fit(X_train_global, y_train_global)

In [18]:
# Predict on both splits so train and test metrics can be compared.
y_train_pred, y_test_pred = (
    pipeline.predict(features) for features in (X_train_global, X_test_global)
)


In [19]:
# Model evaluation: R², MAE and RMSE on both splits.
train_r2, test_r2 = (
    r2_score(y_train_global, y_train_pred),
    r2_score(y_test_global, y_test_pred),
)
train_mae, test_mae = (
    mean_absolute_error(y_train_global, y_train_pred),
    mean_absolute_error(y_test_global, y_test_pred),
)
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(y_train_global, y_train_pred)),
    np.sqrt(mean_squared_error(y_test_global, y_test_pred)),
)

# Train/test R² gap, also expressed as a percentage of the train R²
# (guarded against a zero denominator).
overfitting_gap = train_r2 - test_r2
if train_r2 != 0:
    overfitting_pct = (overfitting_gap / train_r2) * 100
else:
    overfitting_pct = 0

print(
    "COMPREHENSIVE MODEL EVALUATION (Random Forest Simplified & Optimized)",
    f"Train samples: {len(X_train_global)}, Test samples: {len(X_test_global)}",
    f"Train R²: {train_r2:.4f} ({train_r2*100:.1f}%)",
    f"Test R²: {test_r2:.4f} ({test_r2*100:.1f}%)",
    f"Overfitting Gap: {overfitting_pct:.2f}%",
    f"Test MAE: ₹{test_mae:.2f}L",
    f"Test RMSE: ₹{test_rmse:.2f}L",
    sep="\n",
)

COMPREHENSIVE MODEL EVALUATION (Random Forest Simplified & Optimized)
Train samples: 8472, Test samples: 2118
Train R²: 0.9141 (91.4%)
Test R²: 0.8975 (89.8%)
Overfitting Gap: 1.82%
Test MAE: ₹13.32L
Test RMSE: ₹25.25L


In [20]:
# Cross-validation
# NOTE: `skf` and `stratify_col` are defined here but are not passed to
# cross_val_score below, which is called with cv=5. Per the scikit-learn docs,
# an integer cv with a non-classifier estimator uses plain (unshuffled) KFold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
try:
    stratify_col = df['City']
except KeyError:
    # Only a missing 'City' column is an expected failure; any other error
    # should surface instead of being silently swallowed.
    stratify_col = None

# Re-assemble the full dataset in the (already shuffled) train-then-test order.
data_aligned = pd.concat([X_train_global, X_test_global])
y_all = pd.concat([y_train_global, y_test_global])

cv_scores = cross_val_score(pipeline, data_aligned, y_all, cv=5, scoring='r2', n_jobs=-1)
print("\nCROSS-VALIDATION ANALYSIS")
print(f"CV R² Scores: {[f'{score:.4f}' for score in cv_scores]}")
print(f"Mean CV R²: {cv_scores.mean():.4f}")
print(f"CV Std Dev: {cv_scores.std():.4f}")

# Label fold-to-fold spread: < 0.02 excellent, < 0.05 good, otherwise moderate.
if cv_scores.std() < 0.02:
    stability = "Excellent"
elif cv_scores.std() < 0.05:
    stability = "Good"
else:
    stability = "Moderate"
print(f"Model Stability: {stability}")


CROSS-VALIDATION ANALYSIS
CV R² Scores: ['0.8866', '0.9083', '0.8866', '0.9004', '0.8975']
Mean CV R²: 0.8959
CV Std Dev: 0.0084
Model Stability: Excellent


In [22]:
# Persist the fitted pipeline together with the list of top localities that
# was used to bucket the remaining localities as 'Other'.
joblib.dump(pipeline, "real_estate_pipeline.pkl")
joblib.dump({'top_localities': top_localities}, "real_estate_pipeline_info.pkl")

print(
    "\nPipeline and auxiliary info saved successfully!",
    "Load later using:",
    "pipeline = joblib.load('real_estate_pipeline.pkl')",
    "top_localities = joblib.load('real_estate_pipeline_info.pkl')['top_localities']",
    sep="\n",
)


Pipeline and auxiliary info saved successfully!
Load later using:
pipeline = joblib.load('real_estate_pipeline.pkl')
top_localities = joblib.load('real_estate_pipeline_info.pkl')['top_localities']
