<a href="https://colab.research.google.com/github/AabidMK/RealtyAI_Infosys_Internship_Aug2025/blob/Pavithra_Kopuru/bagging_model_joblib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib
import os

In [None]:
# Strip leading/trailing whitespace from every column header.
# NOTE(review): `df` is not created by any cell visible above this line — the
# data-loading step appears to be missing, so this cell would fail on a fresh
# kernel (Restart & Run All). Confirm where `df` is supposed to come from.
df.columns = df.columns.str.strip()
# Unit spellings recognised in price strings, mapped to their value in lakhs.
# Longer spellings are listed first so that e.g. "lakh" is matched before the
# bare "l", and "crore" before "cr".
_PRICE_UNITS_IN_LAKHS = (
    ("crores", 100.0),
    ("crore", 100.0),
    ("lakhs", 1.0),
    ("lakh", 1.0),
    ("lacs", 1.0),
    ("lac", 1.0),
    ("cr", 100.0),
    ("l", 1.0),
)


def clean_price(price):
    """Convert a raw price value such as '₹1.99 Cr' or '₹48.0 L' to lakhs.

    The rupee sign and thousands separators are removed, then the first
    recognised unit (crore / lakh, in any of the spellings listed in
    ``_PRICE_UNITS_IN_LAKHS``, matched case-insensitively) is stripped and the
    remaining number is scaled: crore values are multiplied by 100, lakh
    values are returned as-is. A value with no unit is returned as the bare
    number without any scaling.

    Parameters
    ----------
    price : Any
        Raw price; it is converted with ``str()`` before parsing.

    Returns
    -------
    float
        The parsed price, or ``np.nan`` when the text cannot be parsed as a
        number (no exception is raised).
    """
    text = str(price).replace('₹', '').replace(',', '').strip()
    lowered = text.lower()

    multiplier = None
    for unit, unit_in_lakhs in _PRICE_UNITS_IN_LAKHS:
        position = lowered.find(unit)
        if position != -1:
            # Cut the unit out of the text; whatever remains must be the number.
            text = text[:position] + text[position + len(unit):]
            multiplier = unit_in_lakhs
            break

    try:
        value = float(text.strip())
    except ValueError:
        return np.nan
    return value if multiplier is None else value * multiplier
# Name of the numeric target column produced by clean_price.
TARGET = "Price_Lakhs"

# Parse the raw price strings, then discard rows whose price could not be parsed.
df[TARGET] = df['Price'].apply(clean_price)
df.dropna(subset=[TARGET], inplace=True)

# Target vector and feature frame; both price columns are kept out of the features.
y = df[TARGET]
X = df.drop(columns=["Price", TARGET])

# Seeded 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Data Shape:", X_train.shape)
print("Test Data Shape:", X_test.shape)

Training Data Shape: (11620, 8)
Test Data Shape: (2906, 8)


In [None]:
# Preview the first five rows of the working frame.
display(df.head(n=5))

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,Price_Lakhs
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,199.0
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes,225.0
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No,100.0
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes,333.0
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,48.0


In [None]:
# NOTE(review): `pipeline` is built and fitted in cells that appear further down
# the notebook, so this cell only works after those have been run — confirm the
# intended cell order.

# Hold-out predictions from the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Regression metrics on the test split; RMSE is derived from the MSE.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.2f}")

Mean Squared Error: 8596.47
Root Mean Squared Error: 92.72
R-squared Score: 0.70


In [None]:
# Columns fed to the model, grouped by the preprocessing they receive.
# NOTE(review): Price_per_SQFT looks like it is derived from the listing price
# (price / area); if so it leaks the target into the features — confirm before
# relying on the reported scores.
numeric_features = ["Total_Area", "Price_per_SQFT", "Baths"]
categorical_features = ["Location", "Balcony"]

# Numeric columns are standardised.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [None]:
# Bagging ensemble of 100 seeded decision-tree regressors; n_jobs=-1 asks
# scikit-learn to use all available processors.
bagging_model = BaggingRegressor(
    n_estimators=100,
    estimator=DecisionTreeRegressor(random_state=42),
    n_jobs=-1,
    random_state=42,
)

# End-to-end estimator: preprocessing first, then the ensemble.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", bagging_model),
])

In [None]:
# Show which columns reach the pipeline, then train preprocessing and model together.
print("Columns in X_train before fitting pipeline:", X_train.columns, sep="\n")
pipeline.fit(X_train, y_train)

Columns in X_train before fitting pipeline:
Index(['Name', 'Property Title', 'Location', 'Total_Area', 'Price_per_SQFT',
       'Description', 'Baths', 'Balcony'],
      dtype='object')


In [None]:
# Extended evaluation of the fitted pipeline: hold-out metrics, shuffled 5-fold
# cross-validation and percentage-error statistics.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, median_absolute_error

# Upper bounds on the standard deviation of the CV R² scores for each stability label.
CV_STD_EXCELLENT = 0.01
CV_STD_GOOD = 0.05

print("\n COMPREHENSIVE MODEL EVALUATION")
print(f"   Using same train/test split from training:")
print(f"   Train samples: {len(X_train)}, Test samples: {len(X_test)}")

# The preprocessor was already fitted by pipeline.fit(); only transform here so
# that evaluating the model does not re-fit a step of the trained pipeline.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
print(" Transforming data...")
X_train_trans = fitted_preprocessor.transform(X_train)
print(f" Transformed data shape: {X_train_trans.shape}")
print(" Transforming data...")
X_test_trans = fitted_preprocessor.transform(X_test)
print(f" Transformed data shape: {X_test_trans.shape}")

y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
# Difference between train and test R², expressed on a 0-100 scale.
overfit_gap = (train_r2 - test_r2) * 100

mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("    PERFORMANCE METRICS:")
print(f"   Train R²: {train_r2:.4f} ({train_r2*100:.1f}%)")
print(f"   Test R²: {test_r2:.4f} ({test_r2*100:.1f}%)")
print(f"   Overfitting Gap: {overfit_gap:.2f}%")
print(f"   Test MAE: ₹{mae:.2f}L")
print(f"   Test RMSE: ₹{rmse:.2f}L")

# An integer `cv` uses unshuffled KFold for regressors, which makes the folds
# depend on the row order of the data. Shuffle with a fixed seed instead.
# cross_val_score fits clones, so the already fitted `pipeline` is not modified.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=cv_splitter, scoring="r2", n_jobs=-1)
cv_std = cv_scores.std()

# Three-way label so that a large spread between folds is reported as such
# rather than defaulting to a positive verdict.
if cv_std < CV_STD_EXCELLENT:
    stability = "Excellent"
elif cv_std < CV_STD_GOOD:
    stability = "Good"
else:
    stability = "Poor"

print("\n CROSS-VALIDATION ANALYSIS:")
print(f"   CV R² Scores: {[f'{s:.4f}' for s in cv_scores]}")
print(f"   Mean CV R²: {cv_scores.mean():.4f}")
print(f"   CV Std Dev: {cv_std:.4f}")
print(f"   Model Stability: {stability}")

med_abs_error = median_absolute_error(y_test, y_test_pred)

# Percentage errors are undefined for a zero target; exclude such rows so a
# single zero price cannot turn the statistics below into inf/nan.
nonzero_target = y_test != 0
errors = np.abs((y_test - y_test_pred) / y_test)[nonzero_target] * 100  # percentage errors

median_error_pct = np.median(errors)
p90_error = np.percentile(errors, 90)

within_15 = np.mean(errors <= 15) * 100
within_25 = np.mean(errors <= 25) * 100

print("\n ERROR ANALYSIS:")
print(f"   Median Absolute Error: ₹{med_abs_error:.2f}L")
print(f"   Median Error %: {median_error_pct:.1f}%")
print(f"   90th Percentile Error: {p90_error:.1f}%")
print(f"   Predictions within 15%: {within_15:.1f}%")
print(f"   Predictions within 25%: {within_25:.1f}%")


 COMPREHENSIVE MODEL EVALUATION
   Using same train/test split from training:
   Train samples: 11620, Test samples: 2906
 Transforming data...
 Transformed data shape: (11620, 6058)
 Transforming data...
 Transformed data shape: (2906, 6058)
    PERFORMANCE METRICS:
   Train R²: 0.9400 (94.0%)
   Test R²: 0.8626 (86.3%)
   Overfitting Gap: 7.74%
   Test MAE: ₹4.60L
   Test RMSE: ₹62.58L

 CROSS-VALIDATION ANALYSIS:
   CV R² Scores: ['0.9939', '0.6974', '0.4601', '0.6130', '0.6724']
   Mean CV R²: 0.6873
   CV Std Dev: 0.1741
   Model Stability: Good

 ERROR ANALYSIS:
   Median Absolute Error: ₹0.20L
   Median Error %: 0.3%
   90th Percentile Error: 1.8%
   Predictions within 15%: 98.6%
   Predictions within 25%: 99.0%


In [None]:
# Single source of truth for where the fitted pipeline is persisted, so the
# directory that is created, the file that is written and the message that is
# printed cannot drift apart.
MODEL_DIR = "models"
MODEL_PATH = f"{MODEL_DIR}/real_estate_pipeline_bagging.joblib"

os.makedirs(MODEL_DIR, exist_ok=True)
# Persist the whole pipeline (preprocessing + model) as one artifact.
# NOTE: joblib files are pickle-based — only load them from trusted sources.
joblib.dump(pipeline, MODEL_PATH)
print(f"💾 Model saved as: {MODEL_PATH}")

💾 Model saved as: models/real_estate_pipeline_bagging.joblib
