<a href="https://colab.research.google.com/github/Akimakasare/Akash_Makasare_Real-Estate/blob/main/Akash_Makasare___Real_Estate_Investment_Advisor_Predicting_Property_Profitability_%26_Future_Value.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **EDA Analysis**

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data Loading and Cleaning ---
file_path = "India_housing_prices (Akash Makasare).csv"
df = pd.read_csv(file_path)

# Drop rows with any missing values and ensure numerical columns are of the correct type
df_cleaned = df.dropna(subset=['City', 'Price_in_Lakhs', 'Size_in_SqFt', 'Nearby_Schools', 'Nearby_Hospitals', 'Public_Transport_Accessibility', 'Security']).copy()
df_cleaned['Size_in_SqFt'] = df_cleaned['Size_in_SqFt'].astype(float)
df_cleaned['Price_in_Lakhs'] = df_cleaned['Price_in_Lakhs'].astype(float)


# --- 1. Price trends by city ---
print("\n--- 1. Price trends by city ---")
# Calculate the median price per city
city_price_trends = df_cleaned.groupby('City')['Price_in_Lakhs'].median().sort_values(ascending=False).head(10)
print("Top 10 Cities by Median Price (in Lakhs):")
print(city_price_trends)

# Plotting the top 10 cities
plt.figure(figsize=(10, 6))
sns.barplot(x=city_price_trends.index, y=city_price_trends.values, palette="viridis")
plt.title('Top 10 Cities by Median House Price (in Lakhs)')
plt.xlabel('City')
plt.ylabel('Median Price (in Lakhs)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('price_trends_by_city.png')
plt.close()
print("Saved plot: price_trends_by_city.png")


# --- 2. Correlation between area and investment return (Proxy: Price_in_Lakhs) ---
print("\n--- 2. Correlation between area (Size_in_SqFt) and investment return (Proxy: Price_in_Lakhs) ---")
# Calculate the correlation coefficient
correlation = df_cleaned['Size_in_SqFt'].corr(df_cleaned['Price_in_Lakhs'])
print(f"Pearson Correlation (Size_in_SqFt vs Price_in_Lakhs): {correlation:.4f}")

# Plotting the relationship (sampling for faster plotting if the dataset is too large)
df_sample = df_cleaned.sample(n=min(len(df_cleaned), 10000), random_state=42)
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Size_in_SqFt', y='Price_in_Lakhs', data=df_sample, alpha=0.6, s=15)
plt.title('Area (SqFt) vs. Price (in Lakhs)')
plt.xlabel('Size in SqFt')
plt.ylabel('Price in Lakhs')
plt.grid(True)
plt.savefig('area_vs_price_correlation.png')
plt.close()
print("Saved plot: area_vs_price_correlation.png")


# --- 3. Impact of crime rate (Proxy: inverse of Security) on good investment classification (Proxy: Public_Transport_Accessibility) ---
print("\n--- 3. Impact of Security (inverse of Crime Rate) on Public Transport Accessibility (Good Investment Classification) ---")

# Create a contingency table (crosstab)
crosstab_df = pd.crosstab(df_cleaned['Public_Transport_Accessibility'], df_cleaned['Security'], normalize='index') * 100
print("Proportion of Security Status by Transport Accessibility (%):")
print(crosstab_df)

# Plotting the relationship
crosstab_df.plot(kind='bar', stacked=True, figsize=(8, 6))
plt.title('Security Status by Public Transport Accessibility')
plt.xlabel('Public Transport Accessibility (Good Investment Classification Proxy)')
plt.ylabel('Proportion (%)')
plt.xticks(rotation=0)
plt.legend(title='Security Status (Inverse of Crime Rate Proxy)')
plt.tight_layout()
plt.savefig('security_vs_transport_accessibility.png')
plt.close()
print("Saved plot: security_vs_transport_accessibility.png")


# --- 4. Relationship between infrastructure score (Proxy: Nearby_Schools + Nearby_Hospitals) and resale value (Proxy: Price_in_Lakhs) ---
print("\n--- 4. Relationship between Infrastructure Score (Schools + Hospitals) and Resale Value (Proxy: Price_in_Lakhs) ---")
# Calculate the Infrastructure Score
df_cleaned['Infrastructure_Score'] = df_cleaned['Nearby_Schools'] + df_cleaned['Nearby_Hospitals']

# Calculate the correlation coefficient
correlation_infra = df_cleaned['Infrastructure_Score'].corr(df_cleaned['Price_in_Lakhs'])
print(f"Pearson Correlation (Infrastructure_Score vs Price_in_Lakhs): {correlation_infra:.4f}")

# Plotting the relationship
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Infrastructure_Score', y='Price_in_Lakhs', data=df_sample.assign(Infrastructure_Score=df_sample['Nearby_Schools'] + df_sample['Nearby_Hospitals']), alpha=0.6, s=15)
plt.title('Infrastructure Score (Schools+Hospitals) vs. Price (in Lakhs)')
plt.xlabel('Infrastructure Score')
plt.ylabel('Price in Lakhs')
plt.grid(True)
plt.savefig('infrastructure_vs_price_correlation.png')
plt.close()
print("Saved plot: infrastructure_vs_price_correlation.png")


--- 1. Price trends by city ---
Top 10 Cities by Median Price (in Lakhs):
City
Mysore        272.945
Bhopal        271.295
Jaipur        268.260
Dehradun      264.380
Chennai       263.170
Silchar       262.305
Cuttack       261.515
Mangalore     260.685
Nagpur        260.510
Vijayawada    260.170
Name: Price_in_Lakhs, dtype: float64



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=city_price_trends.index, y=city_price_trends.values, palette="viridis")


Saved plot: price_trends_by_city.png

--- 2. Correlation between area (Size_in_SqFt) and investment return (Proxy: Price_in_Lakhs) ---
Pearson Correlation (Size_in_SqFt vs Price_in_Lakhs): -0.0059
Saved plot: area_vs_price_correlation.png

--- 3. Impact of Security (inverse of Crime Rate) on Public Transport Accessibility (Good Investment Classification) ---
Proportion of Security Status by Transport Accessibility (%):
Security                               No        Yes
Public_Transport_Accessibility                      
High                            50.018219  49.981781
Low                             50.031078  49.968922
Medium                          49.926817  50.073183
Saved plot: security_vs_transport_accessibility.png

--- 4. Relationship between Infrastructure Score (Schools + Hospitals) and Resale Value (Proxy: Price_in_Lakhs) ---
Pearson Correlation (Infrastructure_Score vs Price_in_Lakhs): -0.0076
Saved plot: infrastructure_vs_price_correlation.png


# **MODEL Development **

In [29]:
!pip install pandas scikit-learn numpy
!pip install xgboost



In [32]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import xgboost as xgb  # Uncomment this if you plan to use XGBoost

# Suppress minor warnings for cleaner notebook output
warnings.filterwarnings('ignore')

# --- Step 1: Data Loading and Feature Engineering ---

# Load the dataset
file_path = "India_housing_prices (Akash Makasare).csv"
df = pd.read_csv(file_path, on_bad_lines='skip', engine='python') # Added engine='python'

# 1. Clean Data: Drop rows with missing values in key columns
key_cols = ['BHK', 'Size_in_SqFt', 'Price_in_Lakhs', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals', 'Public_Transport_Accessibility', 'Property_Type', 'Furnished_Status', 'Security']
df_cleaned = df.dropna(subset=key_cols).copy()

# 2. Feature Engineering for Targets (Proxies)
# Classification Target: Good_Investment (Proxy: High Public Transport Accessibility)
df_cleaned['Good_Investment'] = df_cleaned['Public_Transport_Accessibility'].apply(
    lambda x: 1 if x == 'High' else 0
)

# Regression Target: Future_Price_5Y (Proxy: Current Price_in_Lakhs)
df_cleaned['Future_Price_5Y'] = df_cleaned['Price_in_Lakhs']


# 3. Define Features and Preprocessing Steps
numerical_features = ['BHK', 'Size_in_SqFt', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals']
categorical_features = ['City', 'Property_Type', 'Furnished_Status', 'Security', 'Owner_Type']

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop' # Drop all other columns not specified
)

# ----------------------------------------------------------------------------------------------------
# --- Step 2: Classification Model Development (Target: Good_Investment) ---
# ----------------------------------------------------------------------------------------------------

print("="*50)
print("CLASSIFICATION MODEL: Predicting Good_Investment (Proxy: High Transport Access)")
print("="*50)

# Define X and y for Classification
X_clf = df_cleaned.drop(['Good_Investment', 'Price_in_Lakhs', 'Future_Price_5Y'], axis=1)
y_clf = df_cleaned['Good_Investment']

# Split Data
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# --- Model Selection: RandomForestClassifier ---
clf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
])

# Train Model
clf_model.fit(X_train_clf, y_train_clf)

# Predict
y_pred_clf = clf_model.predict(X_test_clf)
y_proba_clf = clf_model.predict_proba(X_test_clf)[:, 1]

# Evaluate Classification Metrics
print("\n--- Evaluation Metrics (RandomForestClassifier) ---")
print(f"Accuracy: {accuracy_score(y_test_clf, y_pred_clf):.4f}")
print(f"Precision: {precision_score(y_test_clf, y_pred_clf):.4f}")
print(f"Recall: {recall_score(y_test_clf, y_pred_clf):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test_clf, y_proba_clf):.4f}")


# ----------------------------------------------------------------------------------------------------
# --- Step 3: Regression Model Development (Target: Future_Price_5Y) ---
# ----------------------------------------------------------------------------------------------------

print("\n" + "="*50)
print("REGRESSION MODEL: Predicting Future_Price_5Y (Proxy: Current Price)")
print("="*50)

# Define X and y for Regression
X_reg = df_cleaned.drop(['Good_Investment', 'Price_in_Lakhs', 'Future_Price_5Y'], axis=1)
y_reg = df_cleaned['Future_Price_5Y']

# Split Data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# --- Model Selection: Linear Regression (Example) ---
# To use other models, uncomment and replace LinearRegression():
# regressor_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# regressor_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

reg_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train Model
reg_model.fit(X_train_reg, y_train_reg)

# Predict
y_pred_reg = reg_model.predict(X_test_reg)

# Evaluate Regression Metrics
# Ensure predictions are non-negative
y_pred_reg[y_pred_reg < 0] = 0

print("\n--- Evaluation Metrics (Linear Regression) ---")
print(f"RMSE (Root Mean Squared Error): {np.sqrt(mean_squared_error(y_test_reg, y_pred_reg)):.4f}")
print(f"MAE (Mean Absolute Error): {mean_absolute_error(y_test_reg, y_pred_reg):.4f}")
print(f"R-squared ($R^2$): {r2_score(y_test_reg, y_pred_reg):.4f}")

CLASSIFICATION MODEL: Predicting Good_Investment (Proxy: High Transport Access)

--- Evaluation Metrics (RandomForestClassifier) ---
Accuracy: 0.6539
Precision: 0.3379
Recall: 0.0352
ROC AUC: 0.5023

REGRESSION MODEL: Predicting Future_Price_5Y (Proxy: Current Price)

--- Evaluation Metrics (Linear Regression) ---
RMSE (Root Mean Squared Error): 141.1969
MAE (Mean Absolute Error): 122.3189
R-squared ($R^2$): -0.0001


# **MLflow Integration**

In [30]:
!pip install mlflow pyngrok

Collecting mlflow
  Downloading mlflow-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting mlflow-skinny==3.6.0 (from mlflow)
  Downloading mlflow_skinny-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.6.0 (from mlflow)
  Downloading mlflow_tracing-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Downloading huey-2.5.4-py3-none-any.whl.metadata (4.6 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.6.0->mlflow)
  Downloading databric

In [31]:
import pandas as pd
import numpy as np
import warnings
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

warnings.filterwarnings('ignore')

# --- 1. Data Preparation (Reusing previous steps) ---

file_path = "India_housing_prices (Akash Makasare).csv"
df = pd.read_csv(file_path)

key_cols = ['BHK', 'Size_in_SqFt', 'Price_in_Lakhs', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals', 'Public_Transport_Accessibility', 'Property_Type', 'Furnished_Status', 'Security']
df_cleaned = df.dropna(subset=key_cols).copy()

# Feature Engineering for Targets (Proxies)
df_cleaned['Good_Investment'] = df_cleaned['Public_Transport_Accessibility'].apply(
    lambda x: 1 if x == 'High' else 0
)
df_cleaned['Future_Price_5Y'] = df_cleaned['Price_in_Lakhs']

# Define Features and Preprocessing Steps
numerical_features = ['BHK', 'Size_in_SqFt', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals']
categorical_features = ['City', 'Property_Type', 'Furnished_Status', 'Security', 'Owner_Type']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)

# Define common X and split data
X = df_cleaned.drop(['Good_Investment', 'Price_in_Lakhs', 'Future_Price_5Y'], axis=1)
y_clf = df_cleaned['Good_Investment']
y_reg = df_cleaned['Future_Price_5Y']

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_clf, test_size=0.2, random_state=42, stratify=y_clf)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)


# --- 2. MLflow Setup ---

# Set up MLflow tracking (defaults to local `./mlruns` directory)
experiment_name = "Housing_Price_Prediction_EDA"
mlflow.set_experiment(experiment_name)
print(f"MLflow Experiment Set: {experiment_name}")


# --- 3. Classification Experiment Function ---

def run_classification_experiment(model_class, model_params, model_name, X_train, X_test, y_train, y_test):
    """Trains, evaluates, and logs a classification model using MLflow."""
    with mlflow.start_run(run_name=model_name) as run:
        print(f"\nStarting MLflow Run for: {model_name}")

        # 1. Log Parameters
        mlflow.log_params(model_params)

        # 2. Define Model Pipeline
        clf_model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model_class(**model_params, random_state=42))
        ])

        # 3. Train Model
        clf_model.fit(X_train, y_train)

        # 4. Predict and Evaluate
        y_pred = clf_model.predict(X_test)
        y_proba = clf_model.predict_proba(X_test)[:, 1]

        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "roc_auc": roc_auc_score(y_test, y_proba)
        }

        # 5. Log Metrics
        mlflow.log_metrics(metrics)
        print(f"Metrics Logged: ROC AUC={metrics['roc_auc']:.4f}")

        # 6. Log Model Artifact
        # The artifact path will be 'classification_model' within the run
        mlflow.sklearn.log_model(
            sk_model=clf_model,
            artifact_path="classification_model",
            registered_model_name=f"Classification_{model_name}"
        )
        print("Model artifact logged and registered.")

        return metrics['roc_auc']

# --- 4. Regression Experiment Function ---

def run_regression_experiment(model_class, model_params, model_name, X_train, X_test, y_train, y_test):
    """Trains, evaluates, and logs a regression model using MLflow."""
    with mlflow.start_run(run_name=model_name) as run:
        print(f"\nStarting MLflow Run for: {model_name}")

        # 1. Log Parameters
        mlflow.log_params(model_params)

        # 2. Define Model Pipeline
        reg_model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model_class(**model_params, random_state=42) if 'random_state' in model_class.__init__.__code__.co_varnames else model_class(**model_params))
        ])

        # 3. Train Model
        reg_model.fit(X_train, y_train)

        # 4. Predict and Evaluate
        y_pred = reg_model.predict(X_test)
        y_pred[y_pred < 0] = 0 # Ensure predictions are non-negative

        metrics = {
            "rmse": np.sqrt(mean_squared_error(y_test, y_pred)),
            "mae": mean_absolute_error(y_test, y_pred),
            "r2": r2_score(y_test, y_pred)
        }

        # 5. Log Metrics
        mlflow.log_metrics(metrics)
        print(f"Metrics Logged: R2={metrics['r2']:.4f}")

        # 6. Log Model Artifact
        # The artifact path will be 'regression_model' within the run
        mlflow.sklearn.log_model(
            sk_model=reg_model,
            artifact_path="regression_model",
            registered_model_name=f"Regression_{model_name}"
        )
        print("Model artifact logged and registered.")

        return metrics['r2']


# --- 5. Running the Experiments ---

# Classification Experiments
clf_runs = {}
clf_runs['LogisticRegression'] = run_classification_experiment(
    LogisticRegression, {'solver': 'liblinear', 'C': 1.0}, 'Logistic_Regression_V1',
    X_train_clf, X_test_clf, y_train_clf, y_test_clf
)
clf_runs['RandomForestClassifier'] = run_classification_experiment(
    RandomForestClassifier, {'n_estimators': 100, 'max_depth': 10, 'n_jobs': -1}, 'Random_Forest_V1',
    X_train_clf, X_test_clf, y_train_clf, y_test_clf
)

# Regression Experiments
reg_runs = {}
reg_runs['LinearRegression'] = run_regression_experiment(
    LinearRegression, {}, 'Linear_Regression_V1',
    X_train_reg, X_test_reg, y_train_reg, y_test_reg
)
reg_runs['RandomForestRegressor'] = run_regression_experiment(
    RandomForestRegressor, {'n_estimators': 100, 'max_depth': 10, 'n_jobs': -1}, 'Random_Forest_V1',
    X_train_reg, X_test_reg, y_train_reg, y_test_reg
)

# --- 6. Final Summary ---
print("\n" + "="*50)
print("MLflow Experiments Completed.")
print(f"Classification Best ROC AUC: {max(clf_runs.values()):.4f} ({max(clf_runs, key=clf_runs.get)})")
print(f"Regression Best R2: {max(reg_runs.values()):.4f} ({max(reg_runs, key=reg_runs.get)})")
print("View the MLflow UI by running the next code block.")

2025/12/03 01:03:25 INFO mlflow.tracking.fluent: Experiment with name 'Housing_Price_Prediction_EDA' does not exist. Creating a new experiment.


MLflow Experiment Set: Housing_Price_Prediction_EDA

Starting MLflow Run for: Logistic_Regression_V1




Metrics Logged: ROC AUC=0.4993


Successfully registered model 'Classification_Logistic_Regression_V1'.
Created version '1' of model 'Classification_Logistic_Regression_V1'.


Model artifact logged and registered.

Starting MLflow Run for: Random_Forest_V1




Metrics Logged: ROC AUC=0.4986


Successfully registered model 'Classification_Random_Forest_V1'.
Created version '1' of model 'Classification_Random_Forest_V1'.


Model artifact logged and registered.

Starting MLflow Run for: Linear_Regression_V1




Metrics Logged: R2=-0.0001


Successfully registered model 'Regression_Linear_Regression_V1'.
Created version '1' of model 'Regression_Linear_Regression_V1'.


Model artifact logged and registered.

Starting MLflow Run for: Random_Forest_V1




Metrics Logged: R2=-0.0007




Model artifact logged and registered.

MLflow Experiments Completed.
Classification Best ROC AUC: 0.4993 (LogisticRegression)
Regression Best R2: -0.0001 (LinearRegression)
View the MLflow UI by running the next code block.


Successfully registered model 'Regression_Random_Forest_V1'.
Created version '1' of model 'Regression_Random_Forest_V1'.


# **Streamlit App**

In [36]:
!pip install streamlit pandas numpy scikit-learn mlflow plotly




In [40]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
import numpy as np
import mlflow.sklearn
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from mlflow.models.signature import infer_signature
import warnings

# Suppress warnings for cleaner UI
warnings.filterwarnings('ignore')

# --- MLFLOW CONFIGURATION AND MODEL LOADING ---
# NOTE: This assumes the models were registered locally in Step 4.
# We target the Random Forest models which allow for Feature Importance.
CLASSIFICATION_MODEL_NAME = "Classification_Random_Forest_V1"
REGRESSION_MODEL_NAME = "Regression_Random_Forest_V1"
MODEL_VERSION = "1" # Assuming the first version registered

# Set up the MLflow tracking URI (defaults to local './mlruns')
mlflow.set_tracking_uri("file://" + mlflow.get_tracking_uri().replace("file://", ""))


@st.cache_resource(show_spinner="Loading Models and Data...")
def load_models_and_data():
    """Loads data, defines preprocessing, and loads trained models from MLflow."""
    try:
        # Load the pipeline components and models
        clf_model = mlflow.sklearn.load_model(f"models:/{CLASSIFICATION_MODEL_NAME}/{MODEL_VERSION}")
        reg_model = mlflow.sklearn.load_model(f"models:/{REGRESSION_MODEL_NAME}/{MODEL_VERSION}")

    except Exception as e:
        st.error(f"Error loading models from MLflow Registry. Ensure Step 4 was run and models were registered locally. Error: {e}")
        # Return dummy models to prevent app crash
        clf_model = None
        reg_model = None

    # --- Data Preparation ---
    file_path = "India_housing_prices (Akash Makasare).csv"
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        st.error(f"File not found: {file_path}. Please ensure the CSV is in the same directory.")
        return None, None, None, None

    key_cols = ['BHK', 'Size_in_SqFt', 'Price_in_Lakhs', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals', 'Public_Transport_Accessibility', 'Property_Type', 'Furnished_Status', 'Security', 'Owner_Type', 'City']
    df_cleaned = df.dropna(subset=key_cols).copy()

    # Feature Engineering for Targets (Proxies)
    df_cleaned['Good_Investment'] = df_cleaned['Public_Transport_Accessibility'].apply(
        lambda x: 1 if x == 'High' else 0
    )
    df_cleaned['Future_Price_5Y'] = df_cleaned['Price_in_Lakhs']

    # Define features and preprocessor (must be identical to training)
    numerical_features = ['BHK', 'Size_in_SqFt', 'Age_of_Property', 'Nearby_Schools', 'Nearby_Hospitals']
    categorical_features = ['City', 'Property_Type', 'Furnished_Status', 'Security', 'Owner_Type']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='drop'
    )

    # Fit preprocessor on the original features for consistent transformations
    X = df_cleaned.drop(['Good_Investment', 'Price_in_Lakhs', 'Future_Price_5Y'], axis=1)
    preprocessor.fit(X)

    return df_cleaned, clf_model, reg_model, preprocessor

# --- Load Data and Models ---
df, clf_model, reg_model, preprocessor = load_models_and_data()

if df is None:
    st.stop()


# --- FEATURE IMPORTANCE EXTRACTION ---
def get_feature_importance(model_pipeline):
    """Extracts feature importance from the Random Forest model."""
    try:
        # The Random Forest Estimator is the last step in the pipeline
        rf_model = model_pipeline.named_steps['classifier'] if 'classifier' in model_pipeline.named_steps else model_pipeline.named_steps['regressor']
        importances = rf_model.feature_importances_

        # Get feature names from the ColumnTransformer
        feature_names = preprocessor.get_feature_names_out()

        # Create DataFrame and sort
        feature_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
        feature_df['feature'] = feature_df['feature'].str.replace(r'^(num|cat)__', '', regex=True)
        return feature_df.sort_values(by='importance', ascending=False)
    except Exception as e:
        st.warning(f"Could not extract feature importance. Ensure model is Random Forest. Error: {e}")
        return pd.DataFrame({'feature': ['N/A'], 'importance': [0]})

clf_importance_df = get_feature_importance(clf_model)
reg_importance_df = get_feature_importance(reg_model)


# --- STREAMLIT APP LAYOUT ---
st.set_page_config(layout="wide", page_title="India Housing Investment Predictor")

st.title("üè° Housing Investment & Price Predictor")
st.markdown("Use this application to analyze investment potential and predict the future price of residential properties in India using ML models trained with MLflow.")


# --- SIDEBAR FOR PREDICTION INPUT ---
with st.sidebar:
    st.header("üîÆ Property Prediction Form")

    # Input fields for prediction
    input_data = {}

    # Numerical Inputs
    input_data['BHK'] = st.slider("BHK (Bedrooms)", 1, 6, 2)
    input_data['Size_in_SqFt'] = st.number_input("Size in SqFt", 500, 10000, 1500)
    input_data['Age_of_Property'] = st.slider("Age of Property (Years)", 0, 50, 5)
    input_data['Nearby_Schools'] = st.slider("Nearby Schools (Count)", 0, 20, 5)
    input_data['Nearby_Hospitals'] = st.slider("Nearby Hospitals (Count)", 0, 20, 2)

    # Categorical Inputs
    input_data['City'] = st.selectbox("City", df['City'].unique())
    input_data['Property_Type'] = st.selectbox("Property Type", df['Property_Type'].unique())
    input_data['Furnished_Status'] = st.selectbox("Furnished Status", df['Furnished_Status'].unique())
    input_data['Security'] = st.selectbox("Security (Yes/No)", df['Security'].unique())
    input_data['Owner_Type'] = st.selectbox("Owner Type", df['Owner_Type'].unique())

    # Placeholders for non-used columns to match training schema
    # The preprocessor handles these by dropping them, but we need them in the input DataFrame
    input_data['Public_Transport_Accessibility'] = 'Low' # Does not affect prediction outcome as it's the target proxy
    input_data['Price_in_Lakhs'] = 100 # Dummy value

    predict_button = st.button("Generate Prediction")

# --- MAIN CONTENT ---

# TABS for EDA/Model Insights and Prediction Results
tab_predict, tab_insights = st.tabs(["Prediction Results", "Data & Model Insights"])

with tab_predict:
    if predict_button:
        # Create input DataFrame for prediction
        input_df = pd.DataFrame([input_data])

        # --- CLASSIFICATION PREDICTION ---
        clf_proba = clf_model.predict_proba(input_df)[0]
        clf_pred = clf_model.predict(input_df)[0]

        is_good_investment = "YES" if clf_pred == 1 else "NO"
        confidence_score = clf_proba[clf_pred] * 100

        # --- REGRESSION PREDICTION ---
        reg_pred = reg_model.predict(input_df)[0]
        estimated_price = max(0, reg_pred) # Price cannot be negative

        st.header("Prediction Outcomes")
        col1, col2 = st.columns(2)

        with col1:
            st.metric(
                label="Is this a Good Investment?",
                value=is_good_investment,
                delta=f"Confidence: {confidence_score:.2f}%"
            )
            st.caption(f"*Definition of 'Good Investment' used in training: High Public Transport Accessibility.*")

        with col2:
            st.metric(
                label="Estimated Price after 5 Years (Lakhs)",
                value=f"‚Çπ {estimated_price:,.2f} L",
            )
            st.caption(f"*Note: The 'Future Price' model is a proxy for the current price.*")

    else:
        st.info("üëà Enter property details in the sidebar and click 'Generate Prediction' to see the results.")

with tab_insights:
    st.header("üìä Exploratory Data Analysis & Visual Insights")

    # 1. Price Distribution by Property Type
    st.subheader("Price Distribution by Property Type")
    fig_price_type = px.box(
        df,
        x='Property_Type',
        y='Price_in_Lakhs',
        color='Property_Type',
        title="Price Distribution Across Property Types (Lakhs)",
        log_y=True,
        template="plotly_white"
    )
    st.plotly_chart(fig_price_type, use_container_width=True)

    # 2. Location-wise Price Heatmap (Using City)
    st.subheader("Average Price by City")
    avg_price_city = df.groupby('City')['Price_in_Lakhs'].mean().sort_values(ascending=False).reset_index()
    fig_city_price = px.bar(
        avg_price_city,
        x='City',
        y='Price_in_Lakhs',
        color='Price_in_Lakhs',
        title="Average Property Price per City (Lakhs)",
        template="plotly_white"
    )
    st.plotly_chart(fig_city_price, use_container_width=True)

    st.header("‚öôÔ∏è Model Insights: Feature Importance")
    col3, col4 = st.columns(2)

    if not clf_importance_df.empty and clf_importance_df['importance'].sum() > 0:
        with col3:
            st.subheader("Classification Model Feature Importance")
            fig_clf = px.bar(
                clf_importance_df.head(10),
                x='importance',
                y='feature',
                orientation='h',
                title=f"Top 10 Features for {CLASSIFICATION_MODEL_NAME}",
                template="plotly_dark"
            )
            fig_clf.update_layout(yaxis={'categoryorder':'total ascending'})
            st.plotly_chart(fig_clf, use_container_width=True)

        with col4:
            st.subheader("Regression Model Feature Importance")
            fig_reg = px.bar(
            reg_importance_df.head(10),
            x='importance',
            y='feature',
            orientation='h',
            title=f"Top 10 Features for {REGRESSION_MODEL_NAME}",
            template="plotly_dark"
        )
        fig_reg.update_layout(yaxis={'categoryorder':'total ascending'})
        st.plotly_chart(fig_reg, use_container_width=True)
    else:
        st.warning("Feature importance could not be calculated. Ensure the models loaded are Random Forest based.")

st.markdown("---")
st.caption(f"Models Loaded: Classification ({CLASSIFICATION_MODEL_NAME} V{MODEL_VERSION}), Regression ({REGRESSION_MODEL_NAME} V{MODEL_VERSION})")


Writing streamlit_app.py


In [69]:
# This command saves the current Python code to a file named 'streamlit_app.py'
# and then executes it using Streamlit and ngrok.

# 1. Start ngrok tunnel and get the URL
# 2. Run streamlit application using the tunnel URL

from pyngrok import ngrok
import subprocess
import os

# Stop any existing ngrok tunnels
ngrok.kill()

# Define the Streamlit command
# Assuming 'streamlit_app.py' is the file name for the Canvas content
command = ["streamlit", "run", "streamlit_app.py", "--server.port", "8501", "--browser.serverAddress", "localhost"]

# Start the Streamlit process in the background
process = subprocess.Popen(command)

# Open a tunnel to the Streamlit port (8501 is the default Streamlit port)
try:
    # pyngrok will automatically use the NGROK_AUTHTOKEN environment variable
    public_url = ngrok.connect(8501)
    print(f"Streamlit App is running at: {public_url}")
    print("Click the link above to view your app.")
except Exception as e:
    print(f"Error opening ngrok tunnel: {e}")
    print("You might need to install ngrok or ensure it's on your path.")

# Note: To stop the app, you would typically run:
# process.terminate()
# ngrok.kill()

ERROR:pyngrok.process.ngrok:t=2025-12-03T01:36:39+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: <YOUR_AUTH_TOKEN>\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"
ERROR:pyngrok.process.ngrok:t=2025-12-03T01:36:39+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: <YOUR_AUTH_TOKEN>\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"
ERROR:pyngrok.process.ngrok:t=2025-12-03T01:36:39+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\

Error opening ngrok tunnel: The ngrok process errored on start: authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: <YOUR_AUTH_TOKEN>\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n.
You might need to install ngrok or ensure it's on your path.


In [60]:
!ngrok config add-authtoken <36BTeHepw8u7wXO3aVPKmSCfIXg_oEFoRGy2JV1T9gm1eFQ5>

/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `ngrok config add-authtoken <36BTeHepw8u7wXO3aVPKmSCfIXg_oEFoRGy2JV1T9gm1eFQ5>'


In [71]:
import os

# Replace <YOUR_AUTH_TOKEN> with your actual ngrok authtoken
# This sets the NGROK_AUTHTOKEN environment variable for pyngrok to use.
os.environ["NGROK_AUTHTOKEN"] = "36BTeHepw8u7wXO3aVPKmSCfIXg_oEFoRGy2JV1T9gm1eFQ5"
print("NGROK_AUTHTOKEN environment variable set.")

NGROK_AUTHTOKEN environment variable set.


In [72]:
# This command saves the current Python code to a file named 'streamlit_app.py'
# and then executes it using Streamlit and ngrok.

# 1. Start ngrok tunnel and get the URL
# 2. Run streamlit application using the tunnel URL

from pyngrok import ngrok
import subprocess
import os

# Stop any existing ngrok tunnels
ngrok.kill()

# Define the Streamlit command
# Assuming 'streamlit_app.py' is the file name for the Canvas content
command = ["streamlit", "run", "streamlit_app.py", "--server.port", "8501", "--browser.serverAddress", "localhost"]

# Start the Streamlit process in the background
process = subprocess.Popen(command)

# Open a tunnel to the Streamlit port (8501 is the default Streamlit port)
try:
    # pyngrok will automatically use the NGROK_AUTHTOKEN environment variable
    public_url = ngrok.connect(8501)
    print(f"Streamlit App is running at: {public_url}")
    print("Click the link above to view your app.")
except Exception as e:
    print(f"Error opening ngrok tunnel: {e}")
    print("You might need to install ngrok or ensure it's on your path.")

# Note: To stop the app, you would typically run:
# process.terminate()
# ngrok.kill()

Streamlit App is running at: NgrokTunnel: "https://uninterpretively-noneuphonious-tess.ngrok-free.dev" -> "http://localhost:8501"
Click the link above to view your app.
