In [1]:
import pandas as pd

try:
    df = pd.read_csv('regression_data.csv')

except Exception as e:
    print('Failed to load the Dataset',e)

print(df.head())

   ID        State      City      Locality      Property_Type  BHK  \
0   1   Tamil Nadu   Chennai   Locality_84          Apartment    1   
1   2  Maharashtra      Pune  Locality_490  Independent House    3   
2   3       Punjab  Ludhiana  Locality_167          Apartment    2   
3   4    Rajasthan   Jodhpur  Locality_393  Independent House    2   
4   5    Rajasthan    Jaipur  Locality_466              Villa    4   

   Size_in_SqFt  Price_in_Lakhs  Price_per_SqFt  Year_Built  ...  \
0          4740          489.76            0.10        1990  ...   
1          2364          195.52            0.08        2008  ...   
2          3642          183.79            0.05        1997  ...   
3          2741          300.29            0.11        1991  ...   
4          4823          182.90            0.04        2002  ...   

  Parking_Space  Security                                 Amenities  Facing  \
0            No        No  Playground, Gym, Garden, Pool, Clubhouse    West   
1           

# Feature Engineering

In [2]:
import numpy as np

try:
    df['Investment_Score'] = (
          (df['Price_per_SqFt'].rank(pct=True) * 0.4) +
          (df['Size_in_SqFt'].rank(pct=True) * 0.3) +
          (df['Age_of_Property'].rank(pct=True) * 0.3)
    )
    
    df['RERA'] = np.where(
        df['Availability_Status'].str.lower().isin(['under_construction', 'ready_to_move']),
        1,
        0
    )
    
    df['multi_factor_score'] = (
        (df['BHK'] >= 3).astype(int) +
        (df['RERA'] == 1).astype(int) +
        (df['Availability_Status'].str.lower().isin(['under_construction', 'ready_to_move'])).astype(int)
    )
    
    threshold = df['Investment_Score'].quantile(0.50)
    
    median_price = df['Price_in_Lakhs'].median()
    median_ppsqft = df['Price_per_SqFt'].median()
    
    df['Good_Investment'] = np.where(
        (
            (df['Investment_Score'] >= threshold) &
            (
                (df['Price_in_Lakhs'] <= median_price) |
                (df['Price_per_SqFt'] <= median_ppsqft) |
                (df['multi_factor_score'] >= 1)
            )
        ),
        1,
        0
    )
    print("Feature Engineering Completed")

    
except Exception as e:
    print('Failed to perform the Feature Inclusion:', e)

df.to_csv("classification_data.csv", index=False)

Feature Engineering Completed


# Encoding and Scaling

In [3]:
X_clf = df.drop(["Good_Investment", "future_price"], axis=1)
y_clf = df["Good_Investment"]

cat_cols = X_clf.select_dtypes(include="object").columns
num_cols = X_clf.select_dtypes(include=["int64", "float64"]).columns


from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ("categorical", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat_cols),
    ("numerical", StandardScaler(), num_cols)
])

# Model Development

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

models_classification = [
    (
        'Logistic_Regression',
        {"penalty": "l2", "C": 1.0, "max_iter":100, "solver":'liblinear'},
        LogisticRegression(random_state=42)
    ),

    (
        'Random_Forest_Classifier',
        {"n_estimators": 200, "max_depth": 10, "min_samples_split": 2},
        RandomForestClassifier(random_state=42)
    ),

    (
        'XGBoost_Classifier',
        {"n_estimators": 200, "learning_rate": 0.1, "max_depth": 5},
        XGBClassifier(random_state=42, eval_metric="logloss")
    )
]

for name, params, model in models_classification:
    print("Name:", name)
    print("Parameters:", params)
    print()

Name: Logistic_Regression
Parameters: {'penalty': 'l2', 'C': 1.0, 'max_iter': 100, 'solver': 'liblinear'}

Name: Random_Forest_Classifier
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2}

Name: XGBoost_Classifier
Parameters: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 5}



# Model Evaluation

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42, shuffle=True)

from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix)

trained_classification = []
reports_classification = []

for model_name, params, model in models_classification:

    model.set_params(**params)

    clf_pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model)
    ])

    clf_pipeline.fit(X_train, y_train)

    y_pred = clf_pipeline.predict(X_test)
    y_prob = clf_pipeline.predict_proba(X_test)[:, 1]

    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    precision = round(precision_score(y_test, y_pred) * 100, 2)
    recall = round(recall_score(y_test, y_pred) * 100, 2)
    f1 = round(f1_score(y_test, y_pred) * 100, 2)
    roc_auc = round(roc_auc_score(y_test, y_prob), 5)
    cm = confusion_matrix(y_test, y_pred)

    trained_classification.append((model_name, params, clf_pipeline))
    reports_classification.append((model_name, accuracy, precision, recall, f1, roc_auc, cm))
    
print("\nClassification Reports:\n")
for report in reports_classification:
    print(report)
    print('-' * 50)


Classification Reports:

('Logistic_Regression', 99.87, 99.85, 99.89, 99.87, 1.0, array([[22866,    34],
       [   25, 23071]]))
--------------------------------------------------
('Random_Forest_Classifier', 100.0, 100.0, 100.0, 100.0, 1.0, array([[22900,     0],
       [    0, 23096]]))
--------------------------------------------------
('XGBoost_Classifier', 99.93, 99.88, 99.97, 99.93, 1.0, array([[22872,    28],
       [    6, 23090]]))
--------------------------------------------------


# ML Flow Integration

In [6]:
import json
import mlflow
import mlflow.sklearn
import warnings

warnings.filterwarnings("ignore")

mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('Classifiers')

for i, element in enumerate(trained_classification):

    model_name = element[0]
    params = element[1]
    pipeline = element[2]
    report = reports_classification[i]

    with mlflow.start_run(run_name=model_name) as run:
        
        # -------------------------
        # Log Hyperparameters
        # -------------------------
        mlflow.log_params(params)

        # -------------------------
        # Log Classification Metrics
        # -------------------------
        mlflow.log_metrics({
            'Accuracy': float(report[1]),
            'Precision': float(report[2]),
            'Recall': float(report[3]),
            'F1_Score': float(report[4]),
            'Roc_Auc': float(report[5])
        })

        # -------------------------
        # Log Confusion Matrix
        # -------------------------
        try:
            cm = report[6].tolist()
            cm_file = f"Confusion_Matrix_{model_name}.json"

            with open(cm_file, 'w') as f:
                json.dump(cm, f)

            mlflow.log_artifact(cm_file)
            print(f"Logged Confusion Matrix for {model_name}")

        except Exception as e:
            print(f"Error logging confusion matrix for {model_name}: {e}")

        # -------------------------
        # Log Model
        # -------------------------
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="model",
            input_example=X_train.iloc[:1]
        )

        # -------------------------
        # Run ID
        # -------------------------
        run_id = run.info.run_id
        print(f"Logged {model_name} with Run ID: {run_id}")

print("Successfully Logged Classification Pipelines, Models, Metrics, and Parameters")

2025/12/12 04:57:25 INFO mlflow.tracking.fluent: Experiment with name 'Classifiers' does not exist. Creating a new experiment.


Logged Confusion Matrix for Logistic_Regression
Logged Logistic_Regression with Run ID: 017c087f179046f19abdbf80bcec70ad
üèÉ View run Logistic_Regression at: http://127.0.0.1:5000/#/experiments/349491822039082563/runs/017c087f179046f19abdbf80bcec70ad
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/349491822039082563




Logged Confusion Matrix for Random_Forest_Classifier
Logged Random_Forest_Classifier with Run ID: eb1290511f114bf196ded9e6829ea154
üèÉ View run Random_Forest_Classifier at: http://127.0.0.1:5000/#/experiments/349491822039082563/runs/eb1290511f114bf196ded9e6829ea154
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/349491822039082563




Logged Confusion Matrix for XGBoost_Classifier
Logged XGBoost_Classifier with Run ID: 9b8224e584644923822463287b02fbf4
üèÉ View run XGBoost_Classifier at: http://127.0.0.1:5000/#/experiments/349491822039082563/runs/9b8224e584644923822463287b02fbf4
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/349491822039082563
Successfully Logged Classification Pipelines, Models, Metrics, and Parameters


# Best Model Registry

In [7]:
model_name = "Random_Forest_Classifier"
run_id = "eb1290511f114bf196ded9e6829ea154"

model_uri = f"runs:/{run_id}/model"

mlflow.register_model(model_uri=model_uri, name=model_name)

Successfully registered model 'Random_Forest_Classifier'.
2025/12/12 04:58:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random_Forest_Classifier, version 1
Created version '1' of model 'Random_Forest_Classifier'.


<ModelVersion: aliases=[], creation_timestamp=1765537093775, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1765537093775, metrics=None, model_id=None, name='Random_Forest_Classifier', params=None, run_id='eb1290511f114bf196ded9e6829ea154', run_link='', source='models:/m-2a65833a5aff44dda120f01a4158bc4b', status='READY', status_message=None, tags={}, user_id='', version='1'>

In [8]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")


classifier_model = mlflow.pyfunc.load_model("models:/Random_Forest_Classifier@challenger")

print("Model loaded successfully")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model loaded successfully


In [9]:
import pandas as pd

sample = pd.read_csv("sample_clf.csv")

prediction = classifier_model.predict(sample)
print("Prediction:", prediction)

if prediction == 1:
    print('Good Investment')
else:
    print('Bad Investment')

Prediction: [1]
Good Investment
