# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Import Dataset

In [2]:
df = pd.read_csv("Real Estate Data V21.csv")

# Basic Data Exploration

In [3]:
print("Shape of dataset:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nData types:\n", df.dtypes)  
print("\nMissing values:\n", df.isnull().sum())
print("\nDuplicate values:",df.duplicated().sum())  #gives exact  duplicate rows
print("\nUnique values:\n",df.nunique())

Shape of dataset: (14528, 9)

Columns: ['Name', 'Property Title', 'Price', 'Location', 'Total_Area', 'Price_per_SQFT', 'Description', 'Baths', 'Balcony']

Data types:
 Name               object
Property Title     object
Price              object
Location           object
Total_Area          int64
Price_per_SQFT    float64
Description        object
Baths               int64
Balcony            object
dtype: object

Missing values:
 Name              0
Property Title    0
Price             0
Location          0
Total_Area        0
Price_per_SQFT    0
Description       0
Baths             0
Balcony           0
dtype: int64

Duplicate values: 8

Unique values:
 Name               9998
Property Title     6507
Price               891
Location           7050
Total_Area         1774
Price_per_SQFT     2094
Description       14490
Baths                 6
Balcony               2
dtype: int64


# Data Cleaning

In [4]:
def data_cleaning(df):

    # 1. Price conversion to Lakhs
    def convert_price_to_lakhs(price_str):
        price_str = str(price_str).replace(",", "").strip()
        match = re.match(r'₹?\s*([\d]+(?:\.\d+)?)\s*(L|Cr|Crore|Crores)?', price_str, re.IGNORECASE)
        if match:
            value = float(match.group(1))
            unit = match.group(2)
            if unit and unit.lower().startswith("c"):   # Crore
                value *= 100
            elif unit and unit.lower().startswith("l"): # Lakh
                value *= 1
            else:  # Assume raw number in Rupees
                value /= 1e5
            return value
        return np.nan

    df["Price_Lakhs"] = df["Price"].apply(convert_price_to_lakhs)

    # 2. Split Location into Locality & City
    df['Location_split'] = df['Location'].str.split(',')
    df['Locality'] = df['Location_split'].apply(lambda x: x[0].strip().lower() if x and len(x) > 0 else '')
    df['City'] = df['Location_split'].apply(lambda x: x[-1].strip().lower() if x and len(x) > 0 else '')
    df = df.drop(columns=["Location_split"])

    # 3. Extract BHK and RK
    def extract_category(title):
        if pd.isna(title):
            return None, None
    
        title = str(title)
    
        # Check for BHK
        bhk_match = re.search(r'(\d+)\s*BHK', title, re.IGNORECASE)
        if bhk_match:
            num = int(bhk_match.group(1))
            return "BHK", f"{num} BHK"
    
        # Check for RK
        rk_match = re.search(r'(\d+)?\s*RK', title, re.IGNORECASE)
        if rk_match:
            num = rk_match.group(1)
            num = int(num) if num else 1   # if just "RK", assume 1 RK
            return "RK", f"{num} RK"
    
        return None, None

    df[["bhk_or_rk", "bhk_category"]] = df["Property Title"].apply(
        lambda x: pd.Series(extract_category(x))
    )

    # 3.1 Extract numeric BHK/RK as integer (new column)
    def extract_num(category):
        if pd.isna(category):
            return np.nan
        match = re.search(r'(\d+)', str(category))
        return int(match.group(1)) if match else np.nan

    df["BHK"] = df["bhk_category"].apply(extract_num)

    # 4. Create binary flags
    df['Balcony_flag'] = df['Balcony'].map({'Yes': 1, 'Y': 1, 'No': 0, 'N': 0}).fillna(0)
    df['BHK_or_RK_flag'] = df['bhk_or_rk'].map({'BHK': 1, 'bhk': 1, 'RK': 0, 'rk': 0}).fillna(0)

    # 5. Convert numeric columns
    df['Total_Area'] = pd.to_numeric(df['Total_Area'], errors='coerce')
    df['Price_per_SQFT'] = pd.to_numeric(df['Price_per_SQFT'], errors='coerce')

    # 6. Keep only Top 30 Localities
    locality_counts = df['Locality'].value_counts()
    top_localities = locality_counts.nlargest(30).index.tolist()
    df.loc[~df['Locality'].isin(top_localities), 'Locality'] = 'Other'

    # 7. Print summary
    print("Basic cleaning completed:")
    print(f"- Dataset shape: {df.shape}")
    print(f"- Valid price records: {df['Price_Lakhs'].notna().sum()}")
    print(f"- Unique cities: {df['City'].nunique()}")
    print(f"- Localities (including 'Other'): {df['Locality'].nunique()}")
    print(f"- Extracted BHK or RK listings: {df['bhk_or_rk'].notna().sum()}")
    print(f"- Extracted numeric BHK/RK values: {df['BHK'].notna().sum()}")

    return df


In [5]:
data_clean = data_cleaning(df)

Basic cleaning completed:
- Dataset shape: (14528, 17)
- Valid price records: 14528
- Unique cities: 8
- Localities (including 'Other'): 31
- Extracted BHK or RK listings: 14483
- Extracted numeric BHK/RK values: 14483


# Outlier Removal

In [6]:
def outlier_removal(df):
    df = df.copy()
    
    print(" Applying outlier removal...")
    
    def remove_outliers_iqr(series, factor=1.5):
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR
        return (series >= lower_bound) & (series <= upper_bound)
    
    initial_count = len(df)
    
    # Apply to key columns
    price_mask = remove_outliers_iqr(df['Price_Lakhs'].dropna())
    area_mask = remove_outliers_iqr(df['Total_Area'].dropna())
    pps_mask = remove_outliers_iqr(df['Price_per_SQFT'].dropna())
    
    # Combine masks
    valid_price_idx = df['Price_Lakhs'].dropna().index
    combined_mask = price_mask & area_mask & pps_mask
    outlier_idx = valid_price_idx[~combined_mask]
    
    df_clean = df.drop(outlier_idx)
    
    removed_count = initial_count - len(df_clean)
    print(f"    Removed {removed_count} outliers ({removed_count/initial_count*100:.1f}%)")
    print(f"    Final shape: {df_clean.shape}")
    
    return df_clean

# Apply proven outlier removal
data_clean = outlier_removal(data_clean)

 Applying outlier removal...
    Removed 2642 outliers (18.2%)
    Final shape: (11886, 17)


# Feature Engineering

In [7]:
def create_extra_features(df):
    df = df.copy()
    
    print(" Creating EXTRA advanced features...")
    
    # --- 1. Statistical transforms ---
    df['log_price'] = np.log1p(df['Price_Lakhs'])
    df['log_area'] = np.log1p(df['Total_Area'].clip(lower=1))
    df['sqrt_area'] = np.sqrt(df['Total_Area'].clip(lower=0))
    df['inv_area'] = 1 / np.maximum(df['Total_Area'], 1)
    
    # --- 2. Ratios & densities ---
    df['Area_per_Bath'] = df['Total_Area'] / np.maximum(df['Baths'], 1)
    df['Baths_per_BHK'] = df['Baths'] / np.maximum(df['BHK'], 1)
    df['Area_per_Room'] = df['Total_Area'] / np.maximum(df['BHK'] + df['Baths'], 1)
    df['log_area_per_room'] = np.log1p(df['Area_per_Room'])
    df['Bath_to_BHK_ratio'] = df['Baths'] / np.maximum(df['BHK'], 1)
    
    # --- 3. Locality/City level features ---
    df['Locality_Median_Price'] = df.groupby('Locality')["Price_Lakhs"].transform("median")
    df['Price_vs_Locality'] = df['Price_Lakhs'] / df['Locality_Median_Price']
    df['City_Median_Price'] = df.groupby('City')["Price_Lakhs"].transform("median")
    df['Price_vs_City'] = df['Price_Lakhs'] / df['City_Median_Price']
    
    # --- 4. Boolean / Flags ---
    df['is_BHK'] = (df['bhk_or_rk'].str.upper() == "BHK").astype(int)
    df['is_RK'] = (df['bhk_or_rk'].str.upper() == "RK").astype(int)
    df['is_Compact'] = (df['Total_Area'] < 500).astype(int)
    df['is_Luxury'] = (df['Total_Area'] > 2000).astype(int)
    df['Is_Premium_Size'] = (df['Total_Area'] > df['Total_Area'].median()).astype(int)
    df['Has_Multiple_Baths'] = (df['Baths'] >= 2).astype(int)
    df['Luxury_Score'] = df['is_Luxury'] + df['Has_Multiple_Baths'] + df['Balcony_flag']
    
    # --- 5. Text-derived features ---
    df['is_apartment'] = df['Property Title'].str.contains("apartment", case=False, na=False).astype(int)
    df['is_villa'] = df['Property Title'].str.contains("villa", case=False, na=False).astype(int)
    df['is_studio'] = df['Property Title'].str.contains("studio", case=False, na=False).astype(int)
    df['is_penthouse'] = df['Property Title'].str.contains("penthouse", case=False, na=False).astype(int)
    
    # --- 6. Interaction features ---
    df['Area_x_Price'] = df['Total_Area'] * df['Price_per_SQFT']
    df['Area_x_BHK'] = df['Total_Area'] * np.maximum(df['BHK'], 1)
    df['Area_x_Baths'] = df['Total_Area'] * np.maximum(df['Baths'], 1)
    df['log_Area_x_BHK'] = np.log1p(df['Area_x_BHK'])
    df['BHK_x_Baths'] = df['BHK'] * df['Baths']
    df['Balcony_x_BHK'] = df['Balcony_flag'] * df['BHK']
    
    # --- 7. Percentile based features ---
    df['Area_percentile'] = df['Total_Area'].rank(pct=True)
    df['Price_percentile'] = df['Price_Lakhs'].rank(pct=True)
    
    # --- 8. NEW: Missing features you need ---
    df['Price_per_Room'] = df['Price_per_SQFT'] * df['Area_per_Room']
    df['Total_Rooms'] = df['BHK'] + df['Baths']                       
    df['Area_Efficiency'] = df['Total_Area'] / np.maximum(df['Total_Rooms'], 1) 
    
    # Property size bins
    def categorize_property_size(area):
        if area < 500: return 'Compact'
        elif area < 1000: return 'Medium'
        elif area < 2000: return 'Large'
        else: return 'Luxury'
    
    df['Property_Size_Category'] = df['Total_Area'].apply(categorize_property_size)
    
    # BHK Category
    df['BHK_Category'] = df['BHK'].astype(str) + " " + df['bhk_or_rk'] 
    
    # --- 9. Combined luxury score ---
    def advanced_luxury_score(row):
        score = 0
        if row['Total_Area'] > 2000: score += 3
        elif row['Total_Area'] > 1200: score += 2
        elif row['Total_Area'] > 800: score += 1
        
        if row['BHK'] >= 4: score += 2
        elif row['BHK'] >= 3: score += 1
        
        if row['Baths'] >= 3: score += 2
        elif row['Baths'] >= 2: score += 1
        
        if row['Balcony_flag'] == 1: score += 1
        if row['is_villa'] or row['is_penthouse']: score += 2
        
        return score
    
    df['Advanced_Luxury_Score'] = df.apply(advanced_luxury_score, axis=1)
    
    print(f" Extra features created. Total columns now: {df.shape[1]}")
    
    return df


In [8]:
data_featured = create_extra_features(data_clean)

 Creating EXTRA advanced features...
 Extra features created. Total columns now: 55


In [9]:
data_featured.head(3)

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,Price_Lakhs,...,BHK_x_Baths,Balcony_x_BHK,Area_percentile,Price_percentile,Price_per_Room,Total_Rooms,Area_Efficiency,Property_Size_Category,BHK_Category,Advanced_Luxury_Score
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,199.0,...,16.0,4.0,0.997055,0.98616,2486137.5,8.0,322.875,Luxury,4.0 BHK,8
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No,100.0,...,9.0,0.0,0.786303,0.836572,1667600.0,6.0,220.0,Large,3.0 BHK,5
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,48.0,...,6.0,2.0,0.503407,0.421925,960000.0,5.0,192.0,Medium,2.0 BHK,4


In [10]:
numeric_features = [
    'log_area',              # log-transformed area
    'Baths',                 # number of bathrooms
    'Balcony_flag',          # balcony present or not
    'BHK_or_RK_flag',        # whether property is BHK or RK
    'log_area_per_room',
    'Bath_to_BHK_ratio',     # more bathrooms per BHK → luxury
    'Total_Rooms',           # BHK + Baths
    'Area_Efficiency',       # area per room       
    'Area_x_Baths',
    'log_Area_x_BHK',
    'Is_Premium_Size',       # large property flag
    'Has_Multiple_Baths',    # 2 or more baths         
    'Advanced_Luxury_Score',
    #'Price_per_Room'          # extended score
]

categorical_features = [
    'City',
    'Locality',
    'Property_Size_Category',
    'BHK_Category'
]

target_column = 'Price_Lakhs'


all_features = numeric_features + categorical_features

final_data = data_featured[all_features + [target_column]].copy()
final_data = final_data.dropna(subset=[target_column])

print(" Feature Selection for Advanced Algorithms:")
print(f"   Numeric features: {len(numeric_features)}")
print(f"   Categorical features: {len(categorical_features)}")
print(f"   Final dataset shape: {final_data.shape}")
print(f"   Missing values: {final_data.isnull().sum().sum()}")
print(f"   Sample size: {len(final_data)} properties")

final_data.head(3)


 Feature Selection for Advanced Algorithms:
   Numeric features: 13
   Categorical features: 4
   Final dataset shape: (11886, 18)
   Missing values: 150
   Sample size: 11886 properties


Unnamed: 0,log_area,Baths,Balcony_flag,BHK_or_RK_flag,log_area_per_room,Bath_to_BHK_ratio,Total_Rooms,Area_Efficiency,Area_x_Baths,log_Area_x_BHK,Is_Premium_Size,Has_Multiple_Baths,Advanced_Luxury_Score,City,Locality,Property_Size_Category,BHK_Category,Price_Lakhs
0,7.857094,4,1,1.0,5.780358,1.0,8.0,322.875,10332,9.243098,1,1,8,chennai,Other,Luxury,4.0 BHK,199.0
2,7.186144,3,0,1.0,5.398163,1.0,6.0,220.0,3960,8.284252,1,1,5,chennai,Other,Large,3.0 BHK,100.0
4,6.867974,3,1,1.0,5.26269,1.5,5.0,192.0,2880,7.560601,1,1,4,chennai,avadi,Medium,2.0 BHK,48.0


In [11]:
X = final_data.drop(columns=[target_column])
y = final_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,  
    random_state=42,
    stratify=X['City']
)

print(f" Train-Test Split:")
print(f"   Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"   Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")
print(f"   Features: {X_train.shape[1]}")


 Train-Test Split:
   Training set: 9508 samples (80.0%)
   Test set: 2378 samples (20.0%)
   Features: 17


In [12]:
# Advanced Algorithm Data Preparation
print(" Preparing Advanced Algorithm Pipeline...")

categorical_features = ['City', 'Locality', 'Property_Size_Category', 'BHK_Category']
label_encoders = {}

# Encode categorical features
for feature in categorical_features:
    le = LabelEncoder()
    combined_values = pd.concat([X_train[feature], X_test[feature]], ignore_index=True)
    le.fit(combined_values.astype(str))
    
    # Encode train and test
    X_train[f'{feature}_encoded'] = le.transform(X_train[feature].astype(str))
    X_test[f'{feature}_encoded'] = le.transform(X_test[feature].astype(str))
    
    label_encoders[feature] = le  # store encoder

# Use only numeric + encoded features
features_to_use = [col for col in X_train.columns if col not in categorical_features]

X_train_processed = X_train[features_to_use].copy()
X_test_processed = X_test[features_to_use].copy()

# Convert any leftover object columns to numeric
for col in X_train_processed.columns:
    if X_train_processed[col].dtype == 'object':
        print(f"Converting {col} to numeric...")
        X_train_processed[col] = pd.to_numeric(X_train_processed[col], errors='coerce')
        X_test_processed[col] = pd.to_numeric(X_test_processed[col], errors='coerce')

# Fill missing values
X_train_processed = X_train_processed.fillna(0)
X_test_processed = X_test_processed.fillna(0)

print(f" Data preprocessing complete:")
print(f"   Final features: {len(features_to_use)}")
print(f"   Training shape: {X_train_processed.shape}")
print(f"   Test shape: {X_test_processed.shape}")

# Check if any object columns remain
object_cols = X_train_processed.select_dtypes(include=['object']).columns.tolist()
if object_cols:
    print("Remaining object columns:", object_cols)
else:
    print(" No object columns remain. Data is ready for ML models.")

 Preparing Advanced Algorithm Pipeline...
 Data preprocessing complete:
   Final features: 17
   Training shape: (9508, 17)
   Test shape: (2378, 17)
 No object columns remain. Data is ready for ML models.


In [13]:


string_cols = X_train_processed.select_dtypes(include=['object']).columns.tolist()

results = {}
    
models = {
    'Random_Forest': RandomForestRegressor(
        n_estimators=12,
        max_depth=2,
        min_samples_split=40,
        min_samples_leaf=20,
        max_features=0.15,
        random_state=42,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1
    ),
    'Extra_Trees': ExtraTreesRegressor(
        n_estimators=15,
        max_depth=2,
        min_samples_split=30,
        min_samples_leaf=15,
        max_features=0.2,
        random_state=42,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1
    ),
    'Gradient_Boosting': GradientBoostingRegressor(
        n_estimators=15,
        max_depth=2,
        min_samples_split=30,
        min_samples_leaf=20,
        learning_rate=0.03,
        subsample=0.5,
        max_features=0.2,
        random_state=42
    ),
    'AdaBoost': AdaBoostRegressor(
        n_estimators=10,
        learning_rate=0.3,
        loss='linear',
        random_state=42
    ),
    'Bagging': BaggingRegressor(
        n_estimators=15,
        max_samples=0.6,
        max_features=0.6,
        random_state=42,
        n_jobs=-1
    ),
    'XGBoost': XGBRegressor(
        n_estimators=15,
        max_depth=2,
        min_child_weight=15,
        learning_rate=0.03,
        subsample=0.5,
        colsample_bytree=0.2,
        reg_alpha=3.0,
        reg_lambda=3.0,
        random_state=42,
        n_jobs=-1
    ),
   'LightGBM': LGBMRegressor(
    n_estimators=15,
    max_depth=2,
    learning_rate=0.03,
    num_leaves=20,
    subsample=0.5,
    colsample_bytree=0.2,
    reg_alpha=3.0,
    reg_lambda=3.0,
    random_state=42,
    n_jobs=-1
)
}
    
for name, model in models.items():
    print(f"\n Training {name}...")
        
    model.fit(X_train_processed, y_train)

    y_train_pred = model.predict(X_train_processed)
    y_test_pred = model.predict(X_test_processed)
        
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        
    overfitting_gap = train_r2 - test_r2
    overfitting_pct = (overfitting_gap / train_r2) * 100
        
    results[name] = {
        'Train_R2': train_r2,
        'Test_R2': test_r2,
        'Train_MAE': train_mae,
        'Test_MAE': test_mae,
        'Train_RMSE': train_rmse,
        'Test_RMSE': test_rmse,
        'Overfitting_Gap_%': overfitting_pct
    }
        
    print(f" {name} Results:")
    print(f"   Train R²: {train_r2:.4f} ({train_r2*100:.1f}%)")
    print(f"   Test R²: {test_r2:.4f} ({test_r2*100:.1f}%)")
    print(f"   Overfitting Gap: {overfitting_pct:.2f}%")
    print(f"   Test MAE: ₹{test_mae:.2f}L")
    print(f"   Test RMSE: ₹{test_rmse:.2f}L")


 Training Random_Forest...
 Random_Forest Results:
   Train R²: 0.4750 (47.5%)
   Test R²: 0.4710 (47.1%)
   Overfitting Gap: 0.84%
   Test MAE: ₹22.01L
   Test RMSE: ₹29.52L

 Training Extra_Trees...


  warn(
  warn(


 Extra_Trees Results:
   Train R²: 0.3805 (38.1%)
   Test R²: 0.3817 (38.2%)
   Overfitting Gap: -0.32%
   Test MAE: ₹23.94L
   Test RMSE: ₹31.91L

 Training Gradient_Boosting...
 Gradient_Boosting Results:
   Train R²: 0.2680 (26.8%)
   Test R²: 0.2694 (26.9%)
   Overfitting Gap: -0.54%
   Test MAE: ₹26.64L
   Test RMSE: ₹34.69L

 Training AdaBoost...
 AdaBoost Results:
   Train R²: 0.5189 (51.9%)
   Test R²: 0.4996 (50.0%)
   Overfitting Gap: 3.71%
   Test MAE: ₹21.55L
   Test RMSE: ₹28.71L

 Training Bagging...
 Bagging Results:
   Train R²: 0.7323 (73.2%)
   Test R²: 0.5127 (51.3%)
   Overfitting Gap: 29.98%
   Test MAE: ₹20.24L
   Test RMSE: ₹28.33L

 Training XGBoost...
 XGBoost Results:
   Train R²: 0.2660 (26.6%)
   Test R²: 0.2686 (26.9%)
   Overfitting Gap: -0.99%
   Test MAE: ₹26.62L
   Test RMSE: ₹34.70L

 Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003454 seconds.
You can set `force_col_wise=true` to remove t

In [14]:
# Create comprehensive performance comparison
performance_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Train_R2': [results[model]['Train_R2'] for model in results],
    'Test_R2': [results[model]['Test_R2'] for model in results],
    'Overfitting_Gap_%': [results[model]['Overfitting_Gap_%'] for model in results],
    'Test_MAE': [results[model]['Test_MAE'] for model in results],
    'Test_RMSE': [results[model]['Test_RMSE'] for model in results]
})


performance_df = performance_df.sort_values('Test_R2', ascending=False)

print(" ADVANCED ALGORITHM PERFORMANCE RANKING:")
print(performance_df)


best_model_name = performance_df.iloc[0]['Model']
best_results = results[best_model_name]

print(f"\n BEST PERFORMING MODEL: {best_model_name}")
print(f"   Train R²: {best_results['Train_R2']:.4f} ")
print(f"   Test R²: {best_results['Test_R2']:.4f} ")
print(f"   Overfitting Gap: {best_results['Overfitting_Gap_%']:.1f}%")
print(f"   Test MAE: ₹{best_results['Test_MAE']:.2f}L")
print(f"   Test RMSE: ₹{best_results['Test_RMSE']:.2f}L")

 ADVANCED ALGORITHM PERFORMANCE RANKING:
               Model  Train_R2   Test_R2  Overfitting_Gap_%   Test_MAE  \
4            Bagging  0.732251  0.512727          29.979319  20.235764   
3           AdaBoost  0.518869  0.499609           3.711907  21.545764   
0      Random_Forest  0.474962  0.470994           0.835295  22.011337   
1        Extra_Trees  0.380528  0.381748          -0.320764  23.937521   
2  Gradient_Boosting  0.267970  0.269423          -0.542003  26.640973   
5            XGBoost  0.266009  0.268634          -0.986947  26.624130   
6           LightGBM  0.264720  0.268556          -1.449350  26.663286   

   Test_RMSE  
4  28.327489  
3  28.706270  
0  29.515633  
1  31.908366  
2  34.686060  
5  34.704772  
6  34.706621  

 BEST PERFORMING MODEL: Bagging
   Train R²: 0.7323 
   Test R²: 0.5127 
   Overfitting Gap: 30.0%
   Test MAE: ₹20.24L
   Test RMSE: ₹28.33L


In [15]:
# Perform cross-validation on best models
print(f" Cross-Validation Analysis for Top 3 Models...")

top_3_models = performance_df.head(3)['Model'].tolist()

cv_results = {}
for model_name in top_3_models:
    print(f"\n {model_name} Cross-Validation:")
    
    # Get the model from the models dictionary
    model = models[model_name]
    cv_scores = cross_val_score(model, X_train_processed, y_train, cv=5, scoring='r2', n_jobs=-1)
    
    cv_results[model_name] = {
        'scores': cv_scores,
        'mean': cv_scores.mean(),
        'std': cv_scores.std()
    }
    
    print(f"   CV R² Scores: {[f'{score:.4f}' for score in cv_scores]}")
    print(f"   Mean CV R²: {cv_scores.mean():.4f}")
    print(f"   CV Std Dev: {cv_scores.std():.4f}")
    print(f"   CV Range: {cv_scores.min():.4f} to {cv_scores.max():.4f}")
    
    # Stability assessment
    stability = "Excellent" if cv_scores.std() < 0.02 else "Good" if cv_scores.std() < 0.05 else "Moderate"
    print(f"   Stability: {stability}")
    
    # Compare with test performance
    test_cv_diff = abs(cv_scores.mean() - results[model_name]['Test_R2'])
    consistency = "Excellent" if test_cv_diff < 0.02 else "Good" if test_cv_diff < 0.05 else "Moderate"
    print(f"   CV vs Test consistency: {consistency} (diff: {test_cv_diff:.4f})")

# Summary of best performing model
best_cv_model = max(cv_results.keys(), key=lambda x: cv_results[x]['mean'])
print(f"\n BEST CV PERFORMANCE: {best_cv_model}")
print(f"   Mean CV R²: {cv_results[best_cv_model]['mean']:.4f}")
print(f"   CV Stability: {cv_results[best_cv_model]['std']:.4f}")

 Cross-Validation Analysis for Top 3 Models...

 Bagging Cross-Validation:
   CV R² Scores: ['0.5231', '0.5292', '0.5130', '0.5370', '0.5244']
   Mean CV R²: 0.5253
   CV Std Dev: 0.0078
   CV Range: 0.5130 to 0.5370
   Stability: Excellent
   CV vs Test consistency: Excellent (diff: 0.0126)

 AdaBoost Cross-Validation:
   CV R² Scores: ['0.4930', '0.5232', '0.4861', '0.5099', '0.5193']
   Mean CV R²: 0.5063
   CV Std Dev: 0.0145
   CV Range: 0.4861 to 0.5232
   Stability: Excellent
   CV vs Test consistency: Excellent (diff: 0.0067)

 Random_Forest Cross-Validation:
   CV R² Scores: ['0.4756', '0.4908', '0.4573', '0.4671', '0.4792']
   Mean CV R²: 0.4740
   CV Std Dev: 0.0113
   CV Range: 0.4573 to 0.4908
   Stability: Excellent
   CV vs Test consistency: Excellent (diff: 0.0030)

 BEST CV PERFORMANCE: Bagging
   Mean CV R²: 0.5253
   CV Stability: 0.0078
