In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer # Import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt



## Train/Test Split

We split the data BEFORE calculating fairness labels or performing scaling.
This ensures that the benchmarks for 'Fairly Priced' are derived only from the training distribution, preventing information from the test set leaking into the training process (Data Leakage).

## Define features (X) and the continuous target (y)
'y' is the raw price, which we will use to derive classification labels later.
Perform the split (80% Train, 20% Test)
We use random_state=42 for reproducibility of results.

In [None]:
# Train / Test Split (performed FIRST so nothing computed later can see test rows)
# `y` is the raw price: it is used ONLY to build class labels, never as a feature.
y = data['price']
X = data.drop(columns=['price'])

# 80% train / 20% test; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Section 3: MARKET CONTEXT FEATURES (FEATURE ENGINEERING)

We calculate the price distribution (Quartiles) per locality to establish "Local Market Norms." This allows the model to judge a property's price relative to its specific neighborhood rather than the entire country.

 1. Compute location-level price statistics using ONLY Training Data. We use lambda functions for 25th and 75th percentiles to define price boundaries.
 2. Integrate these benchmarks back into the datasets. We merge into BOTH train and test.
Note: X_test receives benchmarks derived from X_train to simulate a real-world scenario where the model encounters a new listing in a known locality.
  3. Handle missing values. If a locality in the test set was NOT present in the training set, its benchmark columns will be NaN after the merge. We fill these with the corresponding global statistics calculated from the training set.

In [None]:
# Market Context Features (Training Data Only)
# Location-level price statistics (25th percentile, median, 75th percentile)
# are computed from TRAINING rows only and then attached to both splits.
# They are used for:
# - price class label creation
# - relative pricing features
# Restricting the statistics to training data prevents leakage from the test set.

location_price_stats = (
    pd.DataFrame({'locality': X_train['locality'], 'price': y_train})
    .groupby('locality')['price']
    .agg(
        loc_q25=lambda x: x.quantile(0.25),
        loc_median='median',
        loc_q75=lambda x: x.quantile(0.75)
    )
    .reset_index()
)

# Training-wide fallbacks for rows whose locality has no training benchmark
# (e.g. a test locality never seen in training). Without them the NaN
# quartiles make every `<` / `>` comparison False during label creation,
# so such rows would silently be labelled "fairly priced".
global_price_stats = {
    'loc_q25': y_train.quantile(0.25),
    'loc_median': y_train.median(),
    'loc_q75': y_train.quantile(0.75),
}


def _attach_location_price_stats(features):
    """Return `features` with the training-derived locality benchmarks added.

    The caller's row index is restored after the merge: `DataFrame.merge`
    on a column returns a fresh 0..n-1 index, which would otherwise break
    index-aligned arithmetic between the feature frame and its price Series.
    Rows without a locality benchmark receive the training-wide statistics.
    """
    merged = (
        features
        # Drop stale benchmark columns so re-running the cell does not
        # produce `_x` / `_y` duplicates.
        .drop(columns=list(global_price_stats), errors='ignore')
        # One stats row per locality -> a left merge keeps row count and order.
        .merge(location_price_stats, on='locality', how='left',
               validate='many_to_one')
    )
    merged.index = features.index
    # A dict fills only the listed benchmark columns; other NaNs are untouched.
    return merged.fillna(value=global_price_stats)


# Attach the SAME training statistics to both splits
# No recomputation on test data → no leakage
X_train = _attach_location_price_stats(X_train)
X_test = _attach_location_price_stats(X_test)

## 3a) Leakage Safe Target Label Generation

 We convert the continuous 'price' variable into categorical labels.

   0 = Underpriced (Price is in the bottom 25% of the local market)

   1 = Fairly priced (Price is in the middle 50% of the local market)

   2 = Overpriced (Price is in the top 25% of the local market)

In [None]:
# Create Classification Labels (Leakage-Safe)

# 0 = Underpriced
# 1 = Fairly priced
# 2 = Overpriced

def create_price_class(price, q25, q75):
    """Classify a price against the lower/upper local price boundaries.

    Returns:
        0 if `price` is strictly below `q25` (underpriced),
        2 if `price` is strictly above `q75` (overpriced),
        1 otherwise (fairly priced; both boundaries are inclusive).
    """
    return 0 if price < q25 else 2 if price > q75 else 1

# Label every listing by comparing its price with the quartiles of its own
# locality. Prices and benchmark rows are paired by POSITION (not by index),
# so both must be in the same row order.
y_train_cls = list(
    map(create_price_class, y_train, X_train['loc_q25'], X_train['loc_q75'])
)

y_test_cls = list(
    map(create_price_class, y_test, X_test['loc_q25'], X_test['loc_q75'])
)


## 3b) Feature Engineering ( Classification Aligned)

We create contextual features that translate raw data into "Appraisal Logic."
Note: All statistics (Medians, Means, Counts) are derived ONLY from Training data to maintain a strict wall against data leakage.

In [None]:
# These features encode RELATIVE price position and structural context, which
# align directly with the Underpriced / Fair / Overpriced classification.
# Every aggregate below is computed from training rows only; nothing is
# aggregated over the test set.
# NOTE(review): the price-derived features use the same price that defines the
# class labels — confirm that price is meant to be a model input at prediction time.

# Prices as plain arrays so they pair with the feature rows by POSITION.
# A column merge returns a fresh RangeIndex, so X_train / X_test may no longer
# share an index with y_train / y_test; index-aligned Series arithmetic would
# then attach prices to the wrong rows (or produce NaN).
train_price = y_train.to_numpy()
test_price = y_test.to_numpy()

# Relative price position (contextual, NOT residual-based)
# Measures how far a listing deviates from its local market median
X_train['price_position'] = (train_price - X_train['loc_median']) / X_train['loc_median']
X_test['price_position']  = (test_price  - X_test['loc_median'])  / X_test['loc_median']


# Structural price normalization
# The +1 keeps the denominator non-zero for listings with 0 bedrooms/bathrooms
X_train['price_per_bedroom']  = train_price / (X_train['bedrooms'] + 1)
X_test['price_per_bedroom']   = test_price  / (X_test['bedrooms'] + 1)

X_train['price_per_bathroom'] = train_price / (X_train['bathrooms'] + 1)
X_test['price_per_bathroom']  = test_price  / (X_test['bathrooms'] + 1)


# Structural deviation vs local average (averages come from TRAIN rows only)
location_structure_stats = (
    X_train
    .groupby('locality')[['bedrooms', 'bathrooms']]
    .mean()
    .reset_index()
    .rename(columns={
        'bedrooms': 'loc_avg_bedrooms',
        'bathrooms': 'loc_avg_bathrooms'
    })
)

# Merge TRAIN statistics into both sets.
# Stale average columns are dropped first so the cell can be re-run, and the
# original row index is put back because `merge` resets it.
structure_cols = ['loc_avg_bedrooms', 'loc_avg_bathrooms']
train_index, test_index = X_train.index, X_test.index

X_train = (
    X_train
    .drop(columns=structure_cols, errors='ignore')
    .merge(location_structure_stats, on='locality', how='left')
)
X_train.index = train_index

X_test = (
    X_test
    .drop(columns=structure_cols, errors='ignore')
    .merge(location_structure_stats, on='locality', how='left')
)
X_test.index = test_index

# Deviation features: listing size relative to its locality's training average
X_train['bedroom_deviation']  = X_train['bedrooms']  - X_train['loc_avg_bedrooms']
X_test['bedroom_deviation']   = X_test['bedrooms']   - X_test['loc_avg_bedrooms']

X_train['bathroom_deviation'] = X_train['bathrooms'] - X_train['loc_avg_bathrooms']
X_test['bathroom_deviation']  = X_test['bathrooms']  - X_test['loc_avg_bathrooms']


# Market density (thin vs thick markets)
# Share of training listings per locality, used as a rough liquidity proxy:
# High density → stable pricing
# Low density → volatile pricing
# Localities absent from training map to NaN here.
location_density = X_train['locality'].value_counts(normalize=True)

X_train['location_density'] = X_train['locality'].map(location_density)
X_test['location_density']  = X_test['locality'].map(location_density)

## 3c) Drop Leakage Prone and Redundant columns

We now remove the raw benchmark columns and high-cardinality strings. Reasons for dropping:

 1. LEAKAGE: loc_q25/75 were used to define the Target. Including them in X would allow the model to "cheat" by learning the labeling rule.
 2. REDUNDANCY: The information in loc_avg_bedrooms/bathrooms is already captured in our 'deviation' features.
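 3. OVERFITTING: Raw 'locality' names lead to high dimensionality once encoded; we use the derived market-context features (e.g., 'location_density' and the bedroom/bathroom deviation features) instead.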

In [None]:
# The following columns were used ONLY as intermediate
# variables for:
# - label creation
# - contextual feature engineering
#
# Keeping them would allow the model to:
# - memorize market boundaries
# - indirectly reconstruct the target labels
#
# Therefore, they are removed before modeling.
# --------------------------------------------------
drop_cols = [
    'locality',
    'loc_q25', 'loc_q75', 'loc_median',
    'loc_avg_bedrooms', 'loc_avg_bathrooms',
]

# Remove the same column set from both splits so their schemas stay identical.
X_train_final, X_test_final = (
    split.drop(columns=drop_cols) for split in (X_train, X_test)
)

## 3d) Automated Feature Type Detection

We segregate features by their data types to ensure they receive the correct preprocessing in the Scikit-Learn Pipeline.
    1. Categorical: Needs Encoding (e.g., One-Hot)
    2. Numerical: Needs Scaling (e.g., Standard Scaling)

Automatically identify Categorical columns (Strings/Objects/Categories)
Common examples in this dataset: 'type', 'category', 'furnished'

In [None]:
# Define Feature Types for Pipeline

# Text-like columns → one-hot encoding branch.
categorical_features = X_train_final.select_dtypes(
    include=['object', 'category']
).columns.tolist()

# Every numeric dtype → imputation + scaling branch.
# 'number' also matches int32 / float32 / nullable integer columns; listing only
# 'int64' and 'float64' would leave such columns in neither list, and a
# ColumnTransformer with the default remainder drops unlisted columns.
numerical_features = X_train_final.select_dtypes(
    include='number'
).columns.tolist()


## 3e) PreProcessing

We define a multi-path pipeline to handle different data types simultaneously. This ensures that our preprocessing is consistent across Train and Test sets.

In [None]:
# Preprocessing Pipeline
# - numeric columns: fill missing values with the column mean, then standardise
# - categorical columns: dense one-hot encoding; categories unseen during fit
#   are ignored (encoded as all zeros) instead of raising
preprocessor = ColumnTransformer([
    (
        'num',
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]),
        numerical_features,
    ),
    (
        'cat',
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        categorical_features,
    ),
])