In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Function to Sanitize and Standardize the Initial Dataset
def sanitize_real_estate_data(property_dataframe):
    """Clean raw listing data and derive the base modelling columns.

    Works on a de-duplicated copy of ``property_dataframe``. Adds
    ``Price_in_Lakhs``, ``Location_Details``, ``Neighborhood``, ``City_Name``,
    ``BHK`` and ``Has_Balcony``; coerces ``Total_Area``, ``Price_per_SQFT`` and
    ``Baths`` to numeric; fills missing ``Total_Area`` with the median of rows
    sharing the same ``BHK``; and relabels every neighbourhood outside the 30
    most frequent as ``'Other'``.

    Args:
        property_dataframe: DataFrame with the columns ``Price``, ``Location``,
            ``Property Title``, ``Balcony``, ``Total_Area``, ``Price_per_SQFT``
            and ``Baths``.

    Returns:
        A new cleaned DataFrame; the input frame is not modified.
    """
    dataframe = property_dataframe.copy()
    dataframe = dataframe.drop_duplicates()

    # A number needs at least one digit, so a stray '.' (as in "Rs. 95 L")
    # is never handed to float().
    number_pattern = re.compile(r'\d+(?:\.\d+)?|\.\d+')

    def convert_price_to_lakhs(price_text):
        """Return the price in lakhs, or NaN when no number can be read."""
        if pd.isna(price_text):
            return np.nan
        price_text = str(price_text).strip().replace('₹', '').replace(',', '')
        match = number_pattern.search(price_text)
        if match is None:
            return np.nan
        value = float(match.group())

        unit_text = price_text.lower()  # accept 'Cr', 'cr', 'CR', 'L', 'l', ...
        if 'cr' in unit_text:
            return value * 100  # 1 crore = 100 lakhs
        if 'l' in unit_text:
            return value
        # No unit: large values are treated as rupees, small ones as lakhs.
        return value / 100000 if value > 10000 else value

    def pick_location_part(parts, position):
        """Return the stripped element at ``position`` or 'Unknown'.

        ``str.split`` leaves NaN for a missing location; NaN is truthy, so it
        must be ruled out by type rather than by truthiness.
        """
        if isinstance(parts, list) and parts:
            return parts[position].strip()
        return 'Unknown'

    def get_bhk_from_title(listing_title):
        """Extracts the number of bedrooms (BHK) from the property title."""
        if pd.isna(listing_title):
            return 0
        match = re.search(r'(\d+)\s*BHK', str(listing_title), re.IGNORECASE)
        return int(match.group(1)) if match else 0

    dataframe['Price_in_Lakhs'] = dataframe['Price'].apply(convert_price_to_lakhs)
    dataframe['Location_Details'] = dataframe['Location'].str.split(',')
    dataframe['Neighborhood'] = dataframe['Location_Details'].apply(
        lambda parts: pick_location_part(parts, 0)
    )
    dataframe['City_Name'] = dataframe['Location_Details'].apply(
        lambda parts: pick_location_part(parts, -1)
    )

    dataframe['BHK'] = dataframe['Property Title'].apply(get_bhk_from_title)
    dataframe['Has_Balcony'] = dataframe['Balcony'].map({'Yes': 1, 'Y': 1, 'No': 0, 'N': 0}).fillna(0)
    dataframe['Total_Area'] = pd.to_numeric(dataframe['Total_Area'], errors='coerce')
    dataframe['Price_per_SQFT'] = pd.to_numeric(dataframe['Price_per_SQFT'], errors='coerce')
    dataframe['Baths'] = pd.to_numeric(dataframe['Baths'], errors='coerce').fillna(1)
    dataframe['Total_Area'] = dataframe['Total_Area'].fillna(
        dataframe.groupby('BHK')['Total_Area'].transform('median')
    )

    # Keep only the 30 most common neighbourhoods as distinct categories.
    neighborhood_counts = dataframe['Neighborhood'].value_counts()
    top_neighborhoods = neighborhood_counts.nlargest(30).index.tolist()
    dataframe.loc[~dataframe['Neighborhood'].isin(top_neighborhoods), 'Neighborhood'] = 'Other'
    return dataframe


# Function to Remove Statistical Outliers
def prune_extreme_values(property_dataframe):
    """Removes outliers based on the Interquartile Range (IQR) method.

    A row is dropped when its ``Price_in_Lakhs``, ``Total_Area`` or
    ``Price_per_SQFT`` lies outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` for that
    column. All three fences are computed on the unfiltered column. A missing
    value is not counted as an outlier, so rows with NaN in these columns are
    kept for the caller to handle.

    Args:
        property_dataframe: DataFrame containing the three numeric columns.

    Returns:
        A new DataFrame without the outlier rows; the input is not modified.
    """
    dataframe = property_dataframe.copy()

    def filter_by_iqr(data_series, multiplier=1.5):
        """Return a mask, aligned to ``data_series``, of rows to keep."""
        q1 = data_series.quantile(0.25)  # quantile() skips NaN
        q3 = data_series.quantile(0.75)
        iqr = q3 - q1
        lower_threshold = q1 - multiplier * iqr
        upper_threshold = q3 + multiplier * iqr
        within_fence = data_series.between(lower_threshold, upper_threshold)
        return within_fence | data_series.isna()

    # Every mask spans the full index, so combining them never misaligns,
    # whichever columns happen to contain NaN.
    keep_rows = pd.Series(True, index=dataframe.index)
    for column in ('Price_in_Lakhs', 'Total_Area', 'Price_per_SQFT'):
        keep_rows = keep_rows & filter_by_iqr(dataframe[column])

    # Boolean selection (not drop-by-label) stays correct with repeated labels.
    pruned_dataframe = dataframe[keep_rows]
    return pruned_dataframe

# Function to Create New Predictive Features
def engineer_new_features(property_dataframe):
    """Return a copy of the listings with the derived modelling features added.

    Reads ``Total_Area``, ``BHK``, ``Baths``, ``Has_Balcony`` and
    ``Price_per_SQFT`` and appends ratio, interaction, log-scaled, categorical
    and score columns. The input frame is left untouched.
    """
    enriched = property_dataframe.copy()
    area = enriched['Total_Area']
    bedrooms = enriched['BHK']
    bathrooms = enriched['Baths']
    # A listing with no BHK in its title counts as one room when dividing.
    bedrooms_floor_one = np.maximum(bedrooms, 1)

    enriched['log_area'] = np.log1p(area)
    enriched['Area_per_Room'] = area / bedrooms_floor_one
    enriched['log_area_per_room'] = np.log1p(enriched['Area_per_Room'])
    enriched['Bath_to_BHK_ratio'] = bathrooms / bedrooms_floor_one
    enriched['Total_Rooms'] = bedrooms + bathrooms
    enriched['Area_Efficiency'] = area / np.maximum(enriched['Total_Rooms'], 1)
    enriched['Area_x_BHK'] = area * bedrooms
    enriched['Area_x_Baths'] = area * bathrooms
    enriched['log_Area_x_BHK'] = np.log1p(enriched['Area_x_BHK'])

    # (exclusive upper bound, label); anything not below a bound is 'Luxury'.
    size_bands = ((500, 'Compact'), (1000, 'Medium'), (2000, 'Large'))

    def size_label(value):
        for upper_bound, label in size_bands:
            if value < upper_bound:
                return label
        return 'Luxury'

    enriched['Property_Size_Category'] = area.apply(size_label)

    # (inclusive upper bound, label); anything above the last bound is '4+BHK'.
    bhk_bands = ((1, '1BHK'), (2, '2BHK'), (3, '3BHK'))

    def bhk_label(count):
        for upper_bound, label in bhk_bands:
            if count <= upper_bound:
                return label
        return '4+BHK'

    enriched['BHK_Category'] = bedrooms.apply(bhk_label)

    enriched['Price_per_Room'] = enriched['Price_per_SQFT'] * enriched['Area_per_Room']
    premium_cutoff = area.quantile(0.75)
    enriched['Is_Premium_Size'] = (area > premium_cutoff).astype(int)
    enriched['Has_Multiple_Baths'] = (bathrooms >= 2).astype(int)

    def luxury_points(listing):
        """Sum the points earned for area, bedrooms, bathrooms and balcony."""
        if listing['Total_Area'] > 1500:
            area_points = 2
        elif listing['Total_Area'] > 1000:
            area_points = 1
        else:
            area_points = 0

        if listing['BHK'] >= 4:
            bedroom_points = 2
        elif listing['BHK'] >= 3:
            bedroom_points = 1
        else:
            bedroom_points = 0

        bathroom_points = 1 if listing['Baths'] >= 3 else 0
        balcony_points = 1 if listing['Has_Balcony'] else 0
        return area_points + bedroom_points + bathroom_points + balcony_points

    enriched['Luxury_Score'] = enriched.apply(luxury_points, axis=1)
    return enriched

# Main Script Execution
if __name__ == '__main__':
    # Data loading and full preprocessing pipeline.
    raw_listings_data = pd.read_csv("Real Estate Data V21.csv")
    sanitized_listings = sanitize_real_estate_data(raw_listings_data)
    filtered_listings = prune_extreme_values(sanitized_listings)
    enriched_listings = engineer_new_features(filtered_listings)

    # Feature selection for the model.
    # NOTE(review): 'Price_per_Room' is built from 'Price_per_SQFT'; if that
    # column is itself derived from the listing price it leaks the target
    # into the features — confirm against the data source.
    numerical_attributes = [
        'log_area', 'Baths', 'Has_Balcony', 'BHK', 'log_area_per_room',
        'Bath_to_BHK_ratio', 'Total_Rooms', 'Area_Efficiency', 'Area_x_Baths',
        'log_Area_x_BHK', 'Price_per_Room', 'Is_Premium_Size',
        'Has_Multiple_Baths', 'Luxury_Score'
    ]
    categorical_attributes = [
        'City_Name', 'Neighborhood', 'Property_Size_Category', 'BHK_Category'
    ]
    prediction_target_name = 'Price_in_Lakhs'

    all_model_attributes = numerical_attributes + categorical_attributes
    # Rows without a target price cannot be used for training or scoring.
    model_input_dataframe = enriched_listings[all_model_attributes + [prediction_target_name]].dropna(subset=[prediction_target_name])

    features = model_input_dataframe.drop(columns=[prediction_target_name])
    target = model_input_dataframe[prediction_target_name]

    # Split into training and testing sets, stratified by city.
    training_features, testing_features, training_target, testing_target = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=features['City_Name']
    )
    # Own the split frames before adding columns, so the encoded columns are
    # not written onto slices of `features`.
    training_features = training_features.copy()
    testing_features = testing_features.copy()

    # Encode categorical features. Each encoder is fitted on train + test
    # labels so that transform() never meets an unseen category.
    for attribute in categorical_attributes:
        encoder = LabelEncoder()
        all_categories = pd.concat([training_features[attribute], testing_features[attribute]], ignore_index=True)
        encoder.fit(all_categories.astype(str))
        training_features[f'{attribute}_encoded'] = encoder.transform(training_features[attribute].astype(str))
        testing_features[f'{attribute}_encoded'] = encoder.transform(testing_features[attribute].astype(str))

    # Keep only the numerical and encoded columns; remaining NaNs become 0.
    final_model_columns = [col for col in training_features.columns if col not in categorical_attributes]
    training_features_processed = training_features[final_model_columns].copy().fillna(0)
    testing_features_processed = testing_features[final_model_columns].copy().fillna(0)

    # Scale using statistics learned from the training set only.
    scaler = StandardScaler()
    training_features_scaled = scaler.fit_transform(training_features_processed)
    testing_features_scaled = scaler.transform(testing_features_processed)

    # Support Vector Regressor (SVR) base model.
    svr_base_model = SVR(kernel='linear')

    # Bagging ensemble of 50 SVRs, each trained on 80% of the samples.
    property_price_estimator = BaggingRegressor(
        estimator=svr_base_model,
        n_estimators=50,
        max_samples=0.8,
        random_state=42,
        n_jobs=-1
    )
    # Train the model on the SCALED data.
    property_price_estimator.fit(training_features_scaled, training_target)

    # Predict on both test and train sets to check for overfitting.
    price_predictions = property_price_estimator.predict(testing_features_scaled)
    train_price_predictions = property_price_estimator.predict(training_features_scaled)

    # Calculate all relevant metrics.
    train_r2_metric = r2_score(training_target, train_price_predictions)
    test_r2_metric = r2_score(testing_target, price_predictions)
    mae_metric = mean_absolute_error(testing_target, price_predictions)
    rmse_metric = np.sqrt(mean_squared_error(testing_target, price_predictions))

    print(f"   Train R-squared (R²): {train_r2_metric:.4f} ({train_r2_metric*100:.1f}%)")
    print(f"   Test R-squared (R²):  {test_r2_metric:.4f} ({test_r2_metric*100:.1f}%)")
    print(f"   Overfitting Gap:      {(train_r2_metric - test_r2_metric)*100:.2f}%")
    print(f"   Mean Absolute Error (MAE): ₹{mae_metric:.2f} Lakhs")
    print(f"   Root Mean Squared Error (RMSE): ₹{rmse_metric:.2f} Lakhs")

    # The comparison report lives inside the guard because it depends on
    # names that only exist when the script is executed.
    print("\n--- Actual vs. Predicted Price Comparison ---\n")
    comparison_df = pd.DataFrame({
        'Actual Value (Lakhs)': testing_target,
        'Predicted Value (Lakhs)': price_predictions
    })

    # Calculate the gap (actual minus predicted).
    comparison_df['Gap (Lakhs)'] = comparison_df['Actual Value (Lakhs)'] - comparison_df['Predicted Value (Lakhs)']

    # Round the values for cleaner output.
    comparison_df = comparison_df.round(2)

    # Display the first 15 results.
    print(comparison_df.head(15).to_string())

   Train R-squared (R²): 0.8735 (87.3%)
   Test R-squared (R²):  0.8726 (87.3%)
   Overfitting Gap:      0.09%
   Mean Absolute Error (MAE): ₹7.43 Lakhs
   Root Mean Squared Error (RMSE): ₹14.67 Lakhs

--- Actual vs. Predicted Price Comparison ---

       Actual Value (Lakhs)  Predicted Value (Lakhs)  Gap (Lakhs)
8894                   40.0                    40.04        -0.04
10196                  50.0                    68.63       -18.63
8305                   45.0                    44.04         0.96
5369                  110.0                   118.63        -8.63
6429                   61.0                    79.07       -18.07
13728                 100.0                    99.93         0.07
6828                   55.0                    55.13        -0.13
199                    40.0                    40.07        -0.07
11182                  90.0                    87.18         2.82
3022                   56.0                    56.01        -0.01
8979                  160

In [None]:


import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score

# --- Step 1: Define a Custom Transformer for all your preprocessing ---
class RealEstateTransformer(BaseEstimator, TransformerMixin):
    """Turn raw listing columns into the engineered features used by the model.

    ``fit`` learns two things from the training frame: the 30 most frequent
    neighbourhoods (``top_neighborhoods``) and the 75th-percentile
    ``Total_Area`` that marks a "premium" size (``premium_area_threshold_``).
    ``transform`` reuses both, so ``Is_Premium_Size`` no longer depends on
    which other rows are in the batch being transformed.

    Both methods expect the columns ``Property Title``, ``Balcony``,
    ``Total_Area``, ``Price_per_SQFT``, ``Baths`` and ``Location``.
    """

    def __init__(self):
        self.top_neighborhoods = None

    @staticmethod
    def _get_bhk_from_title(listing_title):
        """Return the bedroom count found before 'BHK' in the title, else 0."""
        if pd.isna(listing_title):
            return 0
        match = re.search(r'(\d+)\s*BHK', str(listing_title), re.IGNORECASE)
        return int(match.group(1)) if match else 0

    @staticmethod
    def _location_part(parts, position):
        """Return the stripped element at ``position`` or 'Unknown'.

        ``str.split`` leaves NaN for a missing location; NaN is truthy, so it
        has to be excluded by type rather than by truthiness.
        """
        if isinstance(parts, list) and parts:
            return parts[position].strip()
        return 'Unknown'

    @staticmethod
    def _classify_property_size(area):
        """Bucket an area into Compact / Medium / Large / Luxury."""
        if area < 500:
            return 'Compact'
        elif area < 1000:
            return 'Medium'
        elif area < 2000:
            return 'Large'
        else:
            return 'Luxury'

    @staticmethod
    def _classify_bhk_count(bhk):
        """Bucket a bedroom count into 1BHK / 2BHK / 3BHK / 4+BHK."""
        if bhk <= 1:
            return '1BHK'
        elif bhk <= 2:
            return '2BHK'
        elif bhk <= 3:
            return '3BHK'
        else:
            return '4+BHK'

    def _sanitize(self, X):
        """Return a cleaned copy of ``X`` with the base columns parsed."""
        df = X.copy()
        df['BHK'] = df['Property Title'].apply(self._get_bhk_from_title)
        df['Has_Balcony'] = df['Balcony'].map({'Yes': 1, 'Y': 1, 'No': 0, 'N': 0}).fillna(0)
        df['Total_Area'] = pd.to_numeric(df['Total_Area'], errors='coerce')
        df['Price_per_SQFT'] = pd.to_numeric(df['Price_per_SQFT'], errors='coerce')
        df['Baths'] = pd.to_numeric(df['Baths'], errors='coerce').fillna(1)
        # Missing areas take the median of same-BHK rows in the current frame.
        df['Total_Area'] = df['Total_Area'].fillna(df.groupby('BHK')['Total_Area'].transform('median'))

        df['Location_Details'] = df['Location'].str.split(',')
        df['Neighborhood'] = df['Location_Details'].apply(lambda parts: self._location_part(parts, 0))
        df['City_Name'] = df['Location_Details'].apply(lambda parts: self._location_part(parts, -1))
        return df

    def fit(self, X, y=None):
        """Learn the top neighbourhoods and the premium-area threshold.

        Args:
            X: Training DataFrame of raw listings.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            self
        """
        cleaned = self._sanitize(X)
        neighborhood_counts = cleaned['Neighborhood'].value_counts()
        self.top_neighborhoods = neighborhood_counts.nlargest(30).index.tolist()
        self.premium_area_threshold_ = cleaned['Total_Area'].quantile(0.75)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with all engineered feature columns added.

        Args:
            X: DataFrame of raw listings.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            A new DataFrame; ``X`` is not modified.
        """
        # === A: Data sanitization ===
        df = self._sanitize(X)
        df.loc[~df['Neighborhood'].isin(self.top_neighborhoods), 'Neighborhood'] = 'Other'

        # === B: Feature engineering ===
        df['log_area'] = np.log1p(df['Total_Area'])
        df['Area_per_Room'] = df['Total_Area'] / np.maximum(df['BHK'], 1)
        df['log_area_per_room'] = np.log1p(df['Area_per_Room'])
        df['Bath_to_BHK_ratio'] = df['Baths'] / np.maximum(df['BHK'], 1)
        df['Total_Rooms'] = df['BHK'] + df['Baths']
        df['Area_Efficiency'] = df['Total_Area'] / np.maximum(df['Total_Rooms'], 1)
        df['Area_x_BHK'] = df['Total_Area'] * df['BHK']
        df['Area_x_Baths'] = df['Total_Area'] * df['Baths']
        df['log_Area_x_BHK'] = np.log1p(df['Area_x_BHK'])

        df['Property_Size_Category'] = df['Total_Area'].apply(self._classify_property_size)
        df['BHK_Category'] = df['BHK'].apply(self._classify_bhk_count)

        df['Price_per_Room'] = df['Price_per_SQFT'] * df['Area_per_Room']

        # Use the threshold learned in fit(). Objects fitted before it existed
        # fall back to the previous per-batch quantile.
        premium_threshold = getattr(self, 'premium_area_threshold_', None)
        if premium_threshold is None:
            premium_threshold = df['Total_Area'].quantile(0.75)
        df['Is_Premium_Size'] = (df['Total_Area'] > premium_threshold).astype(int)
        df['Has_Multiple_Baths'] = (df['Baths'] >= 2).astype(int)

        # Luxury score: area >1000 / >1500 gives 1 / 2 points, BHK >=3 / >=4
        # gives 1 / 2 points, plus 1 for >=3 baths and 1 for a balcony.
        # Each pair of nested thresholds is summed to get the 1-or-2 step.
        area, bhk, baths = df['Total_Area'], df['BHK'], df['Baths']
        df['Luxury_Score'] = (
            (area > 1000).astype(int) + (area > 1500).astype(int)
            + (bhk >= 3).astype(int) + (bhk >= 4).astype(int)
            + (baths >= 3).astype(int)
            + (df['Has_Balcony'] != 0).astype(int)
        )

        return df

# --- Step 2: Define Helper Functions for initial cleaning ---
def convert_price_to_lakhs(price_text):
    """Convert a price string such as ``'95 L'`` or ``'3.2 Cr'`` to lakhs.

    The rupee sign and thousands separators are stripped first. The unit is
    matched case-insensitively: text containing ``cr`` is read as crores
    (multiplied by 100), otherwise text containing ``l`` is read as lakhs.

    Args:
        price_text: Raw price value; may be missing.

    Returns:
        The price in lakhs as a float, or NaN when the value is missing, has
        no recognised unit, or contains no number.
    """
    if pd.isna(price_text):
        return np.nan
    price_text = str(price_text).strip().replace('₹', '').replace(',', '')

    unit_text = price_text.lower()
    if 'cr' in unit_text:
        lakhs_per_unit = 100
    elif 'l' in unit_text:
        lakhs_per_unit = 1
    else:
        return np.nan

    # Require at least one digit so a stray '.' (as in "Rs. 95 L") is skipped
    # instead of being passed to float().
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', price_text)
    return float(match.group()) * lakhs_per_unit if match else np.nan

def prune_extreme_values(df):
    """Drop IQR outliers on price, area and price-per-sqft, one column at a time.

    Rows missing any of the three columns are removed first. Each column is
    then filtered to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``, with the quartiles
    recomputed on whatever rows survived the previous column. Returns a new
    DataFrame; ``df`` is not modified.
    """
    outlier_columns = ['Price_in_Lakhs', 'Total_Area', 'Price_per_SQFT']
    remaining = df.copy().dropna(subset=outlier_columns)
    for column in outlier_columns:
        values = remaining[column]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        # between() is inclusive at both ends.
        inside_fence = values.between(first_quartile - 1.5 * spread, third_quartile + 1.5 * spread)
        remaining = remaining[inside_fence]
    return remaining

# --- Main Execution Block ---
if __name__ == '__main__':
    # Load the listings, derive the target and remove outliers. Outlier
    # removal runs on the full data set, before the train/test split.
    raw_data = pd.read_csv("Real Estate Data V21.csv")
    raw_data['Price_in_Lakhs'] = raw_data['Price'].apply(convert_price_to_lakhs)
    cleaned_data = prune_extreme_values(raw_data)
    cleaned_data = cleaned_data.dropna(subset=['Price_in_Lakhs'])

    # Separate the target from the raw feature columns, then hold out 20%.
    target = cleaned_data['Price_in_Lakhs']
    features = cleaned_data.drop(columns=['Price_in_Lakhs'])
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
    )

    # Columns produced by RealEstateTransformer that the model consumes.
    numerical_features = [
        'log_area',
        'Baths',
        'Has_Balcony',
        'BHK',
        'log_area_per_room',
        'Bath_to_BHK_ratio',
        'Total_Rooms',
        'Area_Efficiency',
        'Area_x_BHK',
        'Area_x_Baths',
        'log_Area_x_BHK',
        'Price_per_Room',
        'Is_Premium_Size',
        'Has_Multiple_Baths',
        'Luxury_Score',
    ]
    categorical_features = [
        'City_Name',
        'Neighborhood',
        'Property_Size_Category',
        'BHK_Category',
    ]

    # Scale the numeric columns and ordinal-encode the categorical ones;
    # categories unseen during fit are encoded as -1. All other columns
    # are dropped.
    numeric_step = ('num', StandardScaler(), numerical_features)
    categorical_step = (
        'cat',
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        categorical_features,
    )
    preprocessor = ColumnTransformer(
        transformers=[numeric_step, categorical_step],
        remainder='drop',
    )

    # Bagged decision trees: 50 trees, each given 75% of rows and features.
    base_tree = DecisionTreeRegressor(random_state=42, max_depth=10, min_samples_leaf=5)
    final_model = BaggingRegressor(
        estimator=base_tree,
        n_estimators=50,
        max_samples=0.75,
        max_features=0.75,
        random_state=42,
        n_jobs=-1,
    )

    main_pipeline = Pipeline(
        steps=[
            ('custom_transformer', RealEstateTransformer()),
            ('preprocessor', preprocessor),
            ('regressor', final_model),
        ]
    )

    # Fit every stage on the training split.
    print("Training the complete pipeline...")
    main_pipeline.fit(X_train, y_train)
    print("Training complete.")

    # Score the held-out split.
    y_pred = main_pipeline.predict(X_test)
    test_r2 = r2_score(y_test, y_pred)
    print(f"\nPipeline Test R-squared (R²): {test_r2:.4f}")

    # Persist the fitted pipeline for the prediction script.
    pipeline_filename = 'property_price_pipeline.joblib'
    joblib.dump(main_pipeline, pipeline_filename)
    print(f"✅ Pipeline saved successfully as '{pipeline_filename}'")

Training the complete pipeline...
Training complete.

Pipeline Test R-squared (R²): 0.9545
✅ Pipeline saved successfully as 'property_price_pipeline.joblib'


In [None]:
# predict_unseen.py

import pandas as pd
import numpy as np
import re
import joblib
from sklearn.base import BaseEstimator, TransformerMixin

# --- You MUST redefine the custom transformer class here ---
class RealEstateTransformer(BaseEstimator, TransformerMixin):
    """Turn raw listing columns into the engineered features used by the model.

    This class must be defined in the script that loads the saved pipeline,
    because the pickled transformer is restored onto it by name.

    ``fit`` learns the 30 most frequent neighbourhoods (``top_neighborhoods``)
    and the 75th-percentile ``Total_Area`` (``premium_area_threshold_``).
    ``transform`` uses the learned threshold when the restored object carries
    one, and otherwise falls back to the fixed value ``1600``.

    Both methods expect the columns ``Property Title``, ``Balcony``,
    ``Total_Area``, ``Price_per_SQFT``, ``Baths`` and ``Location``.
    """

    # Premium-area cut-off used when the loaded object has no learned value.
    DEFAULT_PREMIUM_AREA = 1600

    def __init__(self):
        self.top_neighborhoods = None

    @staticmethod
    def _get_bhk_from_title(listing_title):
        """Return the bedroom count found before 'BHK' in the title, else 0."""
        if pd.isna(listing_title):
            return 0
        match = re.search(r'(\d+)\s*BHK', str(listing_title), re.IGNORECASE)
        return int(match.group(1)) if match else 0

    @staticmethod
    def _location_part(parts, position):
        """Return the stripped element at ``position`` or 'Unknown'.

        ``str.split`` leaves NaN for a missing location; NaN is truthy, so it
        has to be excluded by type rather than by truthiness.
        """
        if isinstance(parts, list) and parts:
            return parts[position].strip()
        return 'Unknown'

    @staticmethod
    def _classify_property_size(area):
        """Bucket an area into Compact / Medium / Large / Luxury."""
        if area < 500:
            return 'Compact'
        elif area < 1000:
            return 'Medium'
        elif area < 2000:
            return 'Large'
        else:
            return 'Luxury'

    @staticmethod
    def _classify_bhk_count(bhk):
        """Bucket a bedroom count into 1BHK / 2BHK / 3BHK / 4+BHK."""
        if bhk <= 1:
            return '1BHK'
        elif bhk <= 2:
            return '2BHK'
        elif bhk <= 3:
            return '3BHK'
        else:
            return '4+BHK'

    def _sanitize(self, X):
        """Return a cleaned copy of ``X`` with the base columns parsed."""
        df = X.copy()
        df['BHK'] = df['Property Title'].apply(self._get_bhk_from_title)
        df['Has_Balcony'] = df['Balcony'].map({'Yes': 1, 'Y': 1, 'No': 0, 'N': 0}).fillna(0)
        df['Total_Area'] = pd.to_numeric(df['Total_Area'], errors='coerce')
        df['Price_per_SQFT'] = pd.to_numeric(df['Price_per_SQFT'], errors='coerce')
        df['Baths'] = pd.to_numeric(df['Baths'], errors='coerce').fillna(1)
        # Missing areas take the median of same-BHK rows in the current frame.
        df['Total_Area'] = df['Total_Area'].fillna(df.groupby('BHK')['Total_Area'].transform('median'))
        df['Location_Details'] = df['Location'].str.split(',')
        df['Neighborhood'] = df['Location_Details'].apply(lambda parts: self._location_part(parts, 0))
        df['City_Name'] = df['Location_Details'].apply(lambda parts: self._location_part(parts, -1))
        return df

    def fit(self, X, y=None):
        """Learn the top neighbourhoods and the premium-area threshold.

        Args:
            X: Training DataFrame of raw listings.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            self
        """
        cleaned = self._sanitize(X)
        neighborhood_counts = cleaned['Neighborhood'].value_counts()
        self.top_neighborhoods = neighborhood_counts.nlargest(30).index.tolist()
        self.premium_area_threshold_ = cleaned['Total_Area'].quantile(0.75)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with all engineered feature columns added.

        Args:
            X: DataFrame of raw listings.
            y: Ignored; present for scikit-learn compatibility.

        Returns:
            A new DataFrame; ``X`` is not modified.
        """
        df = self._sanitize(X)
        df.loc[~df['Neighborhood'].isin(self.top_neighborhoods), 'Neighborhood'] = 'Other'

        df['log_area'] = np.log1p(df['Total_Area'])
        df['Area_per_Room'] = df['Total_Area'] / np.maximum(df['BHK'], 1)
        df['log_area_per_room'] = np.log1p(df['Area_per_Room'])
        df['Bath_to_BHK_ratio'] = df['Baths'] / np.maximum(df['BHK'], 1)
        df['Total_Rooms'] = df['BHK'] + df['Baths']
        df['Area_Efficiency'] = df['Total_Area'] / np.maximum(df['Total_Rooms'], 1)
        df['Area_x_BHK'] = df['Total_Area'] * df['BHK']
        df['Area_x_Baths'] = df['Total_Area'] * df['Baths']
        df['log_Area_x_BHK'] = np.log1p(df['Area_x_BHK'])

        df['Property_Size_Category'] = df['Total_Area'].apply(self._classify_property_size)
        df['BHK_Category'] = df['BHK'].apply(self._classify_bhk_count)

        df['Price_per_Room'] = df['Price_per_SQFT'] * df['Area_per_Room']

        # Prefer the threshold stored on the fitted object so prediction uses
        # the same definition as training. A quantile of the unseen batch
        # would be meaningless for one or two rows.
        premium_threshold = getattr(self, 'premium_area_threshold_', None)
        if premium_threshold is None:
            premium_threshold = self.DEFAULT_PREMIUM_AREA
        df['Is_Premium_Size'] = (df['Total_Area'] > premium_threshold).astype(int)
        df['Has_Multiple_Baths'] = (df['Baths'] >= 2).astype(int)

        # Luxury score: area >1000 / >1500 gives 1 / 2 points, BHK >=3 / >=4
        # gives 1 / 2 points, plus 1 for >=3 baths and 1 for a balcony.
        # Each pair of nested thresholds is summed to get the 1-or-2 step.
        area, bhk, baths = df['Total_Area'], df['BHK'], df['Baths']
        df['Luxury_Score'] = (
            (area > 1000).astype(int) + (area > 1500).astype(int)
            + (bhk >= 3).astype(int) + (bhk >= 4).astype(int)
            + (baths >= 3).astype(int)
            + (df['Has_Balcony'] != 0).astype(int)
        )
        return df

# --- Main Execution Block ---
if __name__ == '__main__':
    # Restore the fitted pipeline from disk.
    pipeline_filename = 'property_price_pipeline.joblib'
    loaded_pipeline = joblib.load(pipeline_filename)
    print("✅ Pipeline loaded successfully!")

    # Two hand-written listings the model has not seen, one record per row.
    unseen_data = pd.DataFrame([
        {
            'Property Title': '2 BHK Apartment in a Gated Community',
            'Price': '95 L',
            'Location': 'Velachery, Chennai',
            'Total_Area': 1150,
            'Price_per_SQFT': 8260,
            'Baths': 2,
            'Balcony': 'Yes',
        },
        {
            'Property Title': '4 BHK Independent Villa',
            'Price': '3.2 Cr',
            'Location': 'Gachibowli, Hyderabad',
            'Total_Area': 3400,
            'Price_per_SQFT': 9411,
            'Baths': 4,
            'Balcony': 'Yes',
        },
    ])

    # Run the raw rows through the whole pipeline.
    predicted_prices = loaded_pipeline.predict(unseen_data)

    # Report each listing title next to its predicted price.
    print("\n--- Predictions on Unseen Data ---")
    for title, prediction in zip(unseen_data['Property Title'], predicted_prices):
        print(f"For property: '{title}'")
        print(f"   -> Predicted Price: ₹{prediction:.2f} Lakhs\n")

✅ Pipeline loaded successfully!

--- Predictions on Unseen Data ---
For property: '2 BHK Apartment in a Gated Community'
   -> Predicted Price: ₹86.95 Lakhs

For property: '4 BHK Independent Villa'
   -> Predicted Price: ₹185.52 Lakhs

