
# Predictive Modeling Challenge

**Objective:** Predict property prices based on cleaned and feature-engineered real estate listings.


# Imports & Load Data

In [6]:
import pandas as pd
import numpy as np

# Load deep-cleaned data with core features
df = pd.read_csv('feature_engineered_listings_v2.csv')
df.head()

Unnamed: 0,title,price,area,bathrooms,bedrooms,log_price,log_area,price_per_sqft,area_sqm,price_per_sqm,...,"loc_sumail, al dakhiliya","loc_sur, al sharqiya","loc_suwaiq, al batinah","loc_taqah, dhofar","loc_the wave (almouj), muscat","loc_wadi kabir, muscat, oman","loc_wattayyah, muscat, oman","loc_yiti, muscat","loc_yunqul, al dhahirah",bedrooms_enc
0,adv905*4bhk villa for rent in madinat illam in...,2.125476,0.063393,4.0,4.0,6.621406,5.70711,-0.268123,27.8709,-0.268123,...,False,False,False,False,False,False,False,False,False,3.0
1,*adv705** 3+1 bhk villa for rent in bousher –a...,1.642636,0.12945,4.0,3.0,6.47851,5.860786,-0.373325,32.51605,-0.373325,...,False,False,False,False,False,False,False,False,False,2.0
2,4 br + maid’s room spacious well-designed vill...,2.125476,0.247031,5.0,4.0,6.621406,6.086775,-0.397662,40.784417,-0.397662,...,False,False,False,False,False,False,False,False,False,3.0
3,2 br beautiful apartment with panoramic views ...,0.580389,-0.175733,2.0,2.0,6.066108,4.787492,-0.085909,11.055457,-0.085909,...,False,False,False,False,False,False,False,False,False,1.0
4,2 br + 1 bedroom brand new apartment in shatti...,1.039087,-0.175733,3.0,2.0,6.265301,4.787492,0.044735,11.055457,0.044735,...,False,False,False,False,False,False,False,False,False,1.0



## Data Preparation

- **Target:** `price`  
- **Features:** Numeric (`area`, `bedrooms`, `bathrooms`, `price_per_sqft` if exists) and categorical (`location`).
- Split into training and test sets.
- Handle categorical variables and scale numeric features via pipelines.


# Prepare Dataset

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Select target and features
target = 'price'
features = ['area', 'bedrooms', 'bathrooms']
if 'price_per_sqft' in df.columns:
    features.append('price_per_sqft')
if 'location' in df.columns:
    cat_features = ['location']
else:
    cat_features = []

X = df[features + cat_features]
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing pipeline
numeric_features = features
numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, cat_features)
])



## Model Implementation

Train three models:
1. Linear Regression  
2. Decision Tree Regressor  
3. Random Forest Regressor  


# Train Models

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42)
}

pipelines = {}
for name, model in models.items():
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('estimator', model)
    ])
    pipe.fit(X_train, y_train)
    pipelines[name] = pipe
    print(f"Trained {name}")

Trained LinearRegression
Trained DecisionTree
Trained RandomForest



## Model Evaluation

Evaluate using Mean Squared Error (MSE) and R² score on the test set.


# Evaluate Models

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

results = []
for name, pipe in pipelines.items():
    preds = pipe.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    results.append({'Model': name, 'MSE': mse, 'R2': r2})

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,MSE,R2
0,LinearRegression,0.738197,0.260984
1,DecisionTree,0.012551,0.987435
2,RandomForest,0.006491,0.993502
