In [1]:
import pandas as pd

# Load the dataset from cleaned_data.csv into the notebook-wide frame `df`.
try:
    df = pd.read_csv('cleaned_data.csv')

except Exception as e:
    # Report the problem, then re-raise: every later cell reads `df`, so
    # carrying on would only surface a confusing NameError on the next line.
    print('Failed to load the Dataset', e)
    raise

# Preview the first rows to confirm the load worked.
print(df.head())

   ID        State      City      Locality      Property_Type  BHK  \
0   1   Tamil Nadu   Chennai   Locality_84          Apartment    1   
1   2  Maharashtra      Pune  Locality_490  Independent House    3   
2   3       Punjab  Ludhiana  Locality_167          Apartment    2   
3   4    Rajasthan   Jodhpur  Locality_393  Independent House    2   
4   5    Rajasthan    Jaipur  Locality_466              Villa    4   

   Size_in_SqFt  Price_in_Lakhs  Price_per_SqFt  Year_Built  ...  \
0          4740          489.76            0.10        1990  ...   
1          2364          195.52            0.08        2008  ...   
2          3642          183.79            0.05        1997  ...   
3          2741          300.29            0.11        1991  ...   
4          4823          182.90            0.04        2002  ...   

  Nearby_Schools  Nearby_Hospitals  Public_Transport_Accessibility  \
0             10                 3                            High   
1              8              

# STEP 1:- FEATURE ENGINEERING
Feature Engineering is the process of creating new, meaningful features from existing data to help machine learning models understand patterns better. In this step, we transform raw property attributes into powerful indicators such as price per square foot, property age, and investment scores. These engineered features improve prediction accuracy, simplify comparisons, and make the dataset more useful for both analysis and modeling.

# 1:- Features Inclusion

In [2]:
import numpy as np

# Growth rate per city, compounded over `years` below. Cities missing from
# this table fall back to 0.08 in the mapping step.
location_growth = {
    "Mumbai": 0.10,
    "Bangalore": 0.09,
    "Delhi": 0.07,
    "Hyderabad": 0.08
}

# Number of compounding periods used for the projected price.
years = 5

try:
    # Per-row growth rate looked up from the City column.
    df["growth_rate_location"] = df["City"].map(location_growth).fillna(0.08)

    # Compound growth: price * (1 + rate) ** years.
    df['future_price'] = df['Price_in_Lakhs'] * (1 + df['growth_rate_location']) ** years

    # Binary flag: 1 when Availability_Status (compared case-insensitively) is
    # 'available' or 'ready to move'; anything else yields 0.
    # NOTE(review): the flag is named RERA but is derived purely from
    # availability status - confirm this proxy is intended.
    df['RERA'] = np.where(
        df['Availability_Status'].str.lower().isin(['available', 'ready to move']),
        1,
        0
    )

except Exception as e:
    # Report and re-raise: without these columns the CSV written below would be
    # incomplete and the splitting cell would fail on a missing 'future_price'.
    print('Failed to perform the Feature Inclusion:', e)
    raise

# Persist the frame including the engineered columns.
df.to_csv("regression_data.csv", index=False)

# Data Splitting

In [3]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Separate the regression target from the feature frame.
# NOTE(review): 'future_price' is computed directly from 'Price_in_Lakhs' and
# 'growth_rate_location', and both of those columns remain in X. The target is
# therefore reconstructible from the features, which would explain near-perfect
# scores - confirm this is intended. The 'ID' column is also kept as a feature.
X = df.drop('future_price', axis=1)
y = df['future_price']

# Fixed seed so the 80/20 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Column groups routed to each transformer. 'number' covers every numeric
# width (int32, float32, ...), not just int64/float64: ColumnTransformer drops
# unlisted columns by default, so a narrower dtype would vanish silently.
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include='number').columns

# Ordinal-encode categoricals (categories unseen at fit time map to -1) and
# standardise numeric columns.
preprocessor = ColumnTransformer([
    ("categorical", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols),
    ("numerical", StandardScaler(), num_cols)
])

# STEP 2:- MODEL DEVELOPMENT

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.base import clone

# (display name, hyper-parameters, unconfigured estimator) for each candidate.
models_regression = [
    (
        'Linear_Regression',
        {"fit_intercept": True},
        LinearRegression()
    ),
    (
        'Random_Forest_Regressor',
        {"n_estimators": 100, "max_depth": 10, "min_samples_split": 2},
        RandomForestRegressor(random_state=42)
    ),
    (
        'XGBoost_Regressor',
        {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 5},
        XGBRegressor(random_state=42)
    )
]

# Filled with (name, params, model, pipeline) tuples consumed by later cells.
pipelines = []

for name, params, model in models_regression:
    try:
        model.set_params(**params)

        # Give every pipeline its own unfitted copy of the preprocessor.
        # Sharing one instance would let each fit overwrite the state that
        # the other pipelines rely on.
        pipe = Pipeline([
            ("preprocessing", clone(preprocessor)),
            ("model", model)
        ])

        pipelines.append((name, params, model, pipe))

        print(f"Model Name: {name}")
        print(f"Parameters: {params}")
        print("-" * 40)

    except Exception as e:
        # Best effort: skip a model that cannot be configured, keep the rest.
        print(f"Failed to create pipeline for {name}: {e}")

Model Name: Linear_Regression
Parameters: {'fit_intercept': True}
----------------------------------------
Model Name: Random_Forest_Regressor
Parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2}
----------------------------------------
Model Name: XGBoost_Regressor
Parameters: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 5}
----------------------------------------


# STEP 3:- MODEL EVALUATION

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Parallel lists: entry k of reports_regression scores entry k of trained_models.
trained_models = []
reports_regression = []

for model_name, params, model, pipe in pipelines:
    try:
        # Fit on the training split, then score on the held-out split.
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # Hold-out metrics, rounded to 5 decimals for reporting.
        rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)), 5)
        mae = round(mean_absolute_error(y_test, y_pred), 5)
        r2 = round(r2_score(y_test, y_pred), 5)

        # Append to both lists together so they stay aligned even when an
        # earlier model failed and was skipped.
        reports_regression.append((model_name, rmse, mae, r2))
        trained_models.append((model_name, params, model, pipe))

        print(f"Successfully trained {model_name}")
        # The arrow was previously stored as mis-encoded bytes; restored here.
        print(f"Metrics → RMSE: {rmse}, MAE: {mae}, R2: {r2}")
        print("-" * 50)

    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Failed to train or evaluate {model_name}: {e}")
        print("-" * 50)

print(reports_regression)

Successfully trained Linear_Regression
Metrics → RMSE: 2.7868, MAE: 0.81159, R2: 0.99982
--------------------------------------------------
Successfully trained Random_Forest_Regressor
Metrics → RMSE: 0.15624, MAE: 0.10488, R2: 1.0
--------------------------------------------------
Successfully trained XGBoost_Regressor
Metrics → RMSE: 0.83938, MAE: 0.71502, R2: 0.99998
--------------------------------------------------
[('Linear_Regression', 2.7868, 0.81159, 0.99982), ('Random_Forest_Regressor', 0.15624, 0.10488, 1.0), ('XGBoost_Regressor', 0.83938, 0.71502, 0.99998)]


# STEP 4:- MLflow Integration

In [6]:
import mlflow
import mlflow.sklearn
import warnings
warnings.filterwarnings("ignore")

# Log every trained pipeline to the local tracking server under one experiment.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('Regression')

# trained_models and reports_regression are appended to together, so zipping
# them pairs each pipeline with its own metrics without index bookkeeping.
for (model_name, params, model, pipe), report in zip(trained_models, reports_regression):

    with mlflow.start_run(run_name=model_name) as run:
        mlflow.log_params(params)
        mlflow.log_metrics({
            'RMSE': report[1],
            'MAE': report[2],
            'R2_SCORE': report[3]
        })
        # Log the whole pipeline (preprocessing + estimator) with one example row.
        mlflow.sklearn.log_model(
            sk_model=pipe,
            artifact_path="model",
            input_example=X_test.iloc[:1]
        )

        try:
            # Names follow the transformer order declared for the preprocessor:
            # categorical columns first, then numerical columns.
            full_feature_names = list(cat_cols) + list(num_cols)

            # The estimator is registered as the "model" step of the pipeline.
            trained_model = pipe.named_steps["model"]

            if hasattr(trained_model, "feature_importances_"):
                # Read from the same object that was just checked.
                importances = trained_model.feature_importances_

                # Refuse to pair names and scores if they do not line up;
                # a silent mismatch would mislabel the logged importances.
                if len(importances) != len(full_feature_names):
                    raise ValueError(
                        f"{len(importances)} importances for "
                        f"{len(full_feature_names)} feature names"
                    )

                feature_importance_dict = {
                    feature: round(float(score), 7)
                    for feature, score in zip(full_feature_names, importances)
                }

                mlflow.log_dict(feature_importance_dict, "feature_importances.json")
                print(f"Logged feature importances for {model_name}")
            else:
                print(f"No feature importances available for {model_name}")

        except Exception as e:
            # Importances are optional; the run is still kept without them.
            print(f"Error logging feature importances for {model_name}: {e}")

        run_id = run.info.run_id
        print(f"Logged {model_name} with Run ID: {run_id}")

print('Successfully Logged the Model, Parameters, Feature Importances and Metrics')

2025/12/12 04:52:05 INFO mlflow.tracking.fluent: Experiment with name 'Regression' does not exist. Creating a new experiment.


No feature importances available for Linear_Regression
Logged Linear_Regression with Run ID: 88c2e38d4f59413a91374cbc0a49a77f
🏃 View run Linear_Regression at: http://127.0.0.1:5000/#/experiments/723797886259147576/runs/88c2e38d4f59413a91374cbc0a49a77f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/723797886259147576




Logged feature importances for Random_Forest_Regressor
Logged Random_Forest_Regressor with Run ID: 92a47046fd3f428db542560d0cd2c2c0
🏃 View run Random_Forest_Regressor at: http://127.0.0.1:5000/#/experiments/723797886259147576/runs/92a47046fd3f428db542560d0cd2c2c0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/723797886259147576




Logged feature importances for XGBoost_Regressor
Logged XGBoost_Regressor with Run ID: dd2ef4d4078745cca177ca732a179853
🏃 View run XGBoost_Regressor at: http://127.0.0.1:5000/#/experiments/723797886259147576/runs/dd2ef4d4078745cca177ca732a179853
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/723797886259147576
Successfully Logged the Model, Parameters, Feature Importances and Metrics


# STEP 5:- MODEL REGISTRY

In [7]:
import warnings
warnings.filterwarnings("ignore")


model_name = "Random_Forest_Regressor"

# Resolve the run to register by name instead of pasting a run ID from an
# earlier execution: run IDs change every time the logging cell is re-run.
matching_runs = mlflow.search_runs(
    experiment_names=["Regression"],
    filter_string=f"tags.mlflow.runName = '{model_name}'",
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if matching_runs.empty:
    raise RuntimeError(
        f"No run named '{model_name}' found in experiment 'Regression'; "
        "run the MLflow logging cell first."
    )
run_id = matching_runs.iloc[0]["run_id"]

# The pipeline was logged under the artifact path "model" within that run.
model_uri = f"runs:/{run_id}/model"

model_version = mlflow.register_model(model_uri=model_uri, name=model_name)

# The loading cell resolves the model through the "challenger" alias, so point
# that alias at the version that was just registered.
mlflow.MlflowClient().set_registered_model_alias(
    model_name, "challenger", model_version.version
)

# Display the registered ModelVersion as the cell output.
model_version

Successfully registered model 'Random_Forest_Regressor'.
2025/12/12 04:53:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random_Forest_Regressor, version 1
Created version '1' of model 'Random_Forest_Regressor'.


<ModelVersion: aliases=[], creation_timestamp=1765536780333, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1765536780333, metrics=None, model_id=None, name='Random_Forest_Regressor', params=None, run_id='92a47046fd3f428db542560d0cd2c2c0', run_link='', source='models:/m-eef088635de3453b8b18fb96ae27d9f5', status='READY', status_message=None, tags={}, user_id='', version='1'>

In [8]:
# Point the client at the local tracking server before resolving the model.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Registry URI of the form models:/<registered name>@<alias>.
registered_model_uri = "models:/Random_Forest_Regressor@challenger"

# Load through the generic pyfunc flavour.
model = mlflow.pyfunc.load_model(registered_model_uri)
print("Model loaded successfully")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model loaded successfully


In [9]:
import pandas as pd

# Read the example input rows from disk.
sample_path = "sample_input.csv"
sample = pd.read_csv(sample_path)

# Score the rows with the model loaded from the registry.
prediction = model.predict(sample)

print("Predicted Future Price:", prediction)

Predicted Future Price: [441.20832501]
