In [37]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report
import joblib
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# Point MLflow at the experiment that the runs started in this notebook
# are recorded under. The name lives in one constant so it is easy to change.
EXPERIMENT_NAME = "Real_Estate_Advisor_Experiments"
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

<Experiment: artifact_location='file:///C:/Users/comra/Dev/Real-Estate-Investment-Advisor/mlruns/628623914471313771', creation_time=1765540787120, experiment_id='628623914471313771', last_update_time=1765540787120, lifecycle_stage='active', name='Real_Estate_Advisor_Experiments', tags={}>

In [2]:
# Read the processed housing dataset into `df` and preview the first rows.
processed_data_path = 'data/processed_housing_data.csv'
df = pd.read_csv(processed_data_path)
df.head(5)

Unnamed: 0,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,Furnished_Status,...,Security,Amenities,Facing,Owner_Type,Availability_Status,Transport_Score,Infra_Score,Good_Investment,Avg_Room_Size,Floor_Ratio
0,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,Semi-Furnished,...,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_To_Move,1,19,0,1821.0,0.703704
1,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,Furnished,...,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_To_Move,3,18,0,1370.5,0.807692
2,Jharkhand,Ranchi,Locality_122,Villa,2,1393,187.42,0.13,2011,Unfurnished,...,No,"Clubhouse, Playground, Garden, Gym",East,Owner,Ready_To_Move,3,12,1,696.5,0.461538
3,Telangana,Warangal,Locality_75,Independent House,1,665,324.24,0.49,1991,Semi-Furnished,...,Yes,Clubhouse,North,Broker,Under_Construction,1,11,0,665.0,0.666667
4,Karnataka,Bangalore,Locality_462,Apartment,5,3988,465.38,0.12,2022,Furnished,...,Yes,"Clubhouse, Pool, Playground, Gym",West,Broker,Ready_To_Move,3,16,1,797.6,0.25


In [24]:
# --- Relative pricing features ---
# Median price-per-sqft of each row's city, broadcast back onto every row.
city_price_groups = df.groupby('City')['Price_per_SqFt']
df['City_Median_Price'] = city_price_groups.transform('median')

# Ratio of the property's price-per-sqft to its city's median
# (values above 1 mean the property is priced above the city median).
df['Price_Relative_Status'] = df['Price_per_SqFt'].div(df['City_Median_Price'])

# --- Target A: simulated price five years out (regression target) ---
# Every row compounds its current price over 5 years at an annual rate of
# 8% plus a per-row perturbation drawn uniformly from [-2%, +2%).
# Note the perturbation is applied to the annual rate, not added to the result.
np.random.seed(42)  # fixed seed so the simulated target is reproducible
growth_rate = 0.08
volatility = np.random.uniform(low=-0.02, high=0.02, size=len(df))
annual_factor = 1 + growth_rate + volatility
df['Future_Price_5Y'] = df['Price_in_Lakhs'] * (annual_factor ** 5)

In [25]:
df.head()

Unnamed: 0,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,Furnished_Status,...,Owner_Type,Availability_Status,Transport_Score,Infra_Score,Good_Investment,Avg_Room_Size,Floor_Ratio,Future_Price_5Y,City_Median_Price,Price_Relative_Status
0,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,Semi-Furnished,...,Broker,Ready_To_Move,1,19,0,1821.0,0.703704,263.831739,0.09,0.555556
1,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,Furnished,...,Builder,Ready_To_Move,3,18,0,1370.5,0.807692,479.301818,0.1,1.1
2,Jharkhand,Ranchi,Locality_122,Villa,2,1393,187.42,0.13,2011,Unfurnished,...,Owner,Ready_To_Move,3,12,1,696.5,0.461538,287.417428,0.1,1.3
3,Telangana,Warangal,Locality_75,Independent House,1,665,324.24,0.49,1991,Semi-Furnished,...,Broker,Under_Construction,1,11,0,665.0,0.666667,485.182922,0.09,5.444444
4,Karnataka,Bangalore,Locality_462,Apartment,5,3988,465.38,0.12,2022,Furnished,...,Broker,Ready_To_Move,3,16,1,797.6,0.25,641.333741,0.09,1.333333


In [35]:
# Model inputs. The list order defines the column order of X (and therefore
# of the input example logged with each model), so it is kept stable.
features = [
    # categorical columns
    'City', 'Locality', 'Property_Type', 'Furnished_Status', 'Public_Transport_Accessibility',
    # numeric columns
    'BHK', 'Price_in_Lakhs', 'Size_in_SqFt', 'Price_per_SqFt', 'City_Median_Price',
    'Floor_No', 'Total_Floors', 'Age_of_Property',
    'Nearby_Schools', 'Nearby_Hospitals', 'Infra_Score', 'Avg_Room_Size', 'Floor_Ratio',
]

X = df.loc[:, features]
y_class = df['Good_Investment']   # classification target
y_reg = df['Future_Price_5Y']     # regression target

# One call splits X and both targets together, so the classification and
# regression models see exactly the same train/test rows (80/20, seed 42).
split_parts = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = split_parts

print(f"Training Set: {X_train.shape}")
print(f"Test Set: {X_test.shape}")

Training Set: (105868, 18)
Test Set: (26468, 18)


In [36]:
# ---------------------------------------------------------------------------
# Column groups: one list per encoding strategy
# ---------------------------------------------------------------------------
numeric_features = [
    'BHK', 'Price_in_Lakhs', 'Size_in_SqFt', 'Price_per_SqFt', 'Floor_No', 'Total_Floors',
    'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals', 'City_Median_Price',
    'Infra_Score', 'Avg_Room_Size', 'Floor_Ratio',
]

# Unordered, low-cardinality categoricals -> one-hot encoded.
nominal_features = ['City', 'Property_Type']

# High-cardinality categorical -> target encoded (one-hot would create
# one column per locality).
target_features = ['Locality']

# Categoricals with a natural order -> ordinal encoded.
ordinal_features = ['Furnished_Status', 'Public_Transport_Accessibility']

# Explicit low-to-high category order, listed in the same order as
# `ordinal_features`.
# NOTE(review): assumes these are the exact labels present in the data - confirm.
furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
transport_order = ['Low', 'Medium', 'High']

# ---------------------------------------------------------------------------
# Per-group transformers
# ---------------------------------------------------------------------------
# Numeric: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal: fill gaps with the literal 'missing', then one-hot encode.
# Categories unseen during fit are ignored at transform time.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Target encoding: target_type='auto' lets the encoder infer from the y it is
# fitted with whether the task is classification or regression.
target_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('target', TargetEncoder(target_type='auto', smooth='auto')),
])

# Ordinal: fill gaps with the most frequent label, then map labels to their
# rank; labels outside the declared order are encoded as -1.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(
        categories=[furnished_order, transport_order],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
])

# ---------------------------------------------------------------------------
# Combined preprocessor
# ---------------------------------------------------------------------------
# NOTE(review): a Pipeline fits the step objects it is given in place, so
# pipelines built directly from this single object share its fitted state.
# Wrap it in sklearn.base.clone(...) wherever independent fits are needed.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('nom', nominal_transformer, nominal_features),
    ('target', target_transformer, target_features),
    ('ord', ordinal_transformer, ordinal_features),
])

In [44]:
print("--- Training Classification Model 1: Random Forest ---")

# Give this pipeline its OWN copy of the preprocessing template.
# Pipeline.fit fits its step objects in place, so passing the shared
# `preprocessor` directly would let any later fit (for example on the
# regression target) overwrite the TargetEncoder state this classifier was
# trained against, leaving the pipeline internally inconsistent when it is
# saved or reused.
rf_clf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42))
])

with mlflow.start_run(run_name="RF_Classifier"):
    # Fit on the training split and score on the held-out split.
    rf_clf_pipeline.fit(X_train, y_class_train)
    y_pred = rf_clf_pipeline.predict(X_test)

    rf_acc = accuracy_score(y_class_test, y_pred)
    rf_f1 = f1_score(y_class_test, y_pred)

    # One-row input example for the logged model. Integer columns are cast to
    # float before the signature is inferred (presumably so the recorded
    # schema tolerates missing values at inference time - confirm).
    input_example = X_train.iloc[:1].copy()
    for col in input_example.select_dtypes(include=['int64', 'int32']).columns:
        input_example[col] = input_example[col].astype(float)
    signature = infer_signature(input_example, y_pred)

    mlflow.log_metric("accuracy", rf_acc)
    mlflow.log_metric("f1_score", rf_f1)
    mlflow.sklearn.log_model(rf_clf_pipeline, name = "model_rf_clf", signature=signature, input_example=input_example)

    print(f"Random Forest Accuracy: {rf_acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_class_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_class_test, y_pred))

--- Training Classification Model 1: Random Forest ---


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Random Forest Accuracy: 0.9989

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16542
           1       1.00      1.00      1.00      9926

    accuracy                           1.00     26468
   macro avg       1.00      1.00      1.00     26468
weighted avg       1.00      1.00      1.00     26468


Confusion Matrix:
[[16542     0]
 [   30  9896]]


In [46]:
print("--- Training Classification Model 2: Gradient Boosting ---")

# 1. Build the pipeline with its OWN copy of the preprocessing template.
# Pipeline.fit fits its step objects in place, so passing the shared
# `preprocessor` directly would let a later fit on a different target
# overwrite the TargetEncoder state this classifier was trained against.
# That matters here because this pipeline may be the one persisted to disk.
gb_clf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))
])

# 2. Train & Track
with mlflow.start_run(run_name="GB_Classifier"):
    gb_clf_pipeline.fit(X_train, y_class_train)
    y_pred = gb_clf_pipeline.predict(X_test)

    # Calculate Metrics on the held-out split
    gb_acc = accuracy_score(y_class_test, y_pred)
    gb_f1 = f1_score(y_class_test, y_pred)

    # MLflow Logging: one-row input example with integer columns cast to
    # float before the signature is inferred (presumably so the recorded
    # schema tolerates missing values at inference time - confirm).
    input_example = X_train.iloc[:1].copy()
    for col in input_example.select_dtypes(include=['int64', 'int32']).columns:
        input_example[col] = input_example[col].astype(float)
    signature = infer_signature(input_example, y_pred)

    mlflow.log_metric("accuracy", gb_acc)
    mlflow.log_metric("f1_score", gb_f1)
    mlflow.sklearn.log_model(gb_clf_pipeline, name = "model_gb_clf", signature=signature, input_example=input_example)

    print(f"Gradient Boosting Accuracy: {gb_acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_class_test, y_pred))


--- Training Classification Model 2: Gradient Boosting ---


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Gradient Boosting Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16542
           1       1.00      1.00      1.00      9926

    accuracy                           1.00     26468
   macro avg       1.00      1.00      1.00     26468
weighted avg       1.00      1.00      1.00     26468



In [47]:
print("--- Training Regression Model 1: Random Forest ---")

# Create Pipeline with its OWN copy of the preprocessing template.
# Pipeline.fit fits its step objects in place; fitting the shared
# `preprocessor` here on the regression target would overwrite the fitted
# TargetEncoder state of every other pipeline built from the same object.
rf_reg_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42))
])

# Train & Track
with mlflow.start_run(run_name="RF_Regressor"):
    rf_reg_pipeline.fit(X_train, y_reg_train)
    y_pred = rf_reg_pipeline.predict(X_test)

    # Metrics on the held-out split
    rf_rmse = np.sqrt(mean_squared_error(y_reg_test, y_pred))
    rf_r2 = r2_score(y_reg_test, y_pred)
    rf_mae = mean_absolute_error(y_reg_test, y_pred)

    # MLflow: one-row input example with integer columns cast to float before
    # the signature is inferred (presumably so the recorded schema tolerates
    # missing values at inference time - confirm).
    input_example = X_train.iloc[:1].copy()
    for col in input_example.select_dtypes(include=['int64', 'int32']).columns:
        input_example[col] = input_example[col].astype(float)
    signature = infer_signature(input_example, y_pred)

    mlflow.log_metric("rmse", rf_rmse)
    mlflow.log_metric("r2", rf_r2)
    # MAE is reported below, so track it with the run as well.
    mlflow.log_metric("mae", rf_mae)
    mlflow.sklearn.log_model(rf_reg_pipeline, name = "model_rf_reg", signature=signature, input_example=input_example)

    print(f"Random Forest RMSE: {rf_rmse:.2f} | R2: {rf_r2:.4f} | MAE: {rf_mae:.2f}")

--- Training Regression Model 1: Random Forest ---


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Random Forest RMSE: 23.35 | R2: 0.9872 | MAE: 17.66


In [48]:
print("--- Training Regression Model 2: Gradient Boosting ---")

# Create Pipeline with its OWN copy of the preprocessing template.
# Pipeline.fit fits its step objects in place; fitting the shared
# `preprocessor` here on the regression target would overwrite the fitted
# TargetEncoder state of every other pipeline built from the same object.
gb_reg_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42))
])

# Train & Track
with mlflow.start_run(run_name="GB_Regressor"):
    gb_reg_pipeline.fit(X_train, y_reg_train)
    y_pred = gb_reg_pipeline.predict(X_test)

    # Metrics on the held-out split
    gb_rmse = np.sqrt(mean_squared_error(y_reg_test, y_pred))
    gb_r2 = r2_score(y_reg_test, y_pred)
    gb_mae = mean_absolute_error(y_reg_test, y_pred)

    # MLflow: one-row input example with integer columns cast to float before
    # the signature is inferred (presumably so the recorded schema tolerates
    # missing values at inference time - confirm).
    input_example = X_train.iloc[:1].copy()
    for col in input_example.select_dtypes(include=['int64', 'int32']).columns:
        input_example[col] = input_example[col].astype(float)
    signature = infer_signature(input_example, y_pred)

    mlflow.log_metric("rmse", gb_rmse)
    mlflow.log_metric("r2", gb_r2)
    # MAE is reported below, so track it with the run as well.
    mlflow.log_metric("mae", gb_mae)
    mlflow.sklearn.log_model(gb_reg_pipeline, name = "model_gb_reg", signature=signature, input_example=input_example)

    print(f"Gradient Boosting RMSE: {gb_rmse:.2f} | R2: {gb_r2:.4f} | MAE: {gb_mae:.2f}")

--- Training Regression Model 2: Gradient Boosting ---


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Gradient Boosting RMSE: 23.01 | R2: 0.9876 | MAE: 17.47


In [51]:
print("-" * 40)
print("MODEL COMPARISON RESULTS")
print("-" * 40)

# Classification Selection (Metric: Accuracy, higher is better).
# On an exact tie the Gradient Boosting pipeline is kept.
print(f"Classification: Random Forest ({rf_acc:.4f}) vs Gradient Boosting ({gb_acc:.4f})")
if rf_acc > gb_acc:
    best_clf_model = rf_clf_pipeline
    print(">> WINNER: Random Forest")
else:
    best_clf_model = gb_clf_pipeline
    print(">> WINNER: Gradient Boosting")

# Regression Selection (Metric: RMSE, Lower is better).
# On an exact tie the Gradient Boosting pipeline is kept.
print(f"Regression: Random Forest ({rf_rmse:.2f}) vs Gradient Boosting ({gb_rmse:.2f})")
if rf_rmse < gb_rmse:
    best_reg_model = rf_reg_pipeline
    print(">> WINNER: Random Forest")
else:
    best_reg_model = gb_reg_pipeline
    print(">> WINNER: Gradient Boosting")

# Save Winners
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh checkout fails here after all the
# training has already been done.
os.makedirs('best-models', exist_ok=True)
joblib.dump(best_clf_model, 'best-models/best_model_classification.pkl')
joblib.dump(best_reg_model, 'best-models/best_model_regression.pkl')

# Also Save the City Median Map
# The app needs this to create the 'City_Median_Price' feature for inference!
# It is computed the same way as the training feature: the per-city median of
# Price_per_SqFt over the full dataframe.
city_median_map = df.groupby('City')['Price_per_SqFt'].median().to_dict()
joblib.dump(city_median_map, 'city_median_map.pkl')

print("\nBest models and Median Map saved to disk successfully.")


----------------------------------------
MODEL COMPARISON RESULTS
----------------------------------------
Classification: Random Forest (0.9989) vs Gradient Boosting (1.0000)
>> WINNER: Gradient Boosting
Regression: Random Forest (23.35) vs Gradient Boosting (23.01)
>> WINNER: Gradient Boosting

Best models and Median Map saved to disk successfully.
