# Predictive Modeling
This notebook trains three regression models (linear regression, decision tree, and random forest) on the engineered listings dataset and compares how well they predict property prices.

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

## 2. Load Engineered Dataset

In [2]:
# Read the engineered listings CSV; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='engineered_listings.csv')
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,title,price,location,area,bathrooms,bedrooms,price_per_sqm,total_rooms,district,district_encoded
0,ADV905*4BHK Villa for rent in Madinat Illam in...,750.0,"Qurum, Muscat•",300.0,4.0,4.0,2.5,8.0,Qurum,66
1,ADV906**4BHK Villa in a comples in shatti qurum,1900.0,"Qurum, Muscat•",300.0,4.0,4.0,6.333333,8.0,Qurum,66
2,ADC507*** Office Space in Azaiba – 440 sqm for...,2310.0,"Azaiba, Muscat•",440.0,2.0,2.0,5.25,4.0,Azaiba,30
3,*ADV705** 3+1 BHK Villa for Rent in Bousher –A...,650.0,"Bosher, Muscat•",350.0,4.0,3.0,1.857143,7.0,Bosher,39
4,4 BR + Maid’s Room Spacious Well-Designed Vill...,750.0,"Azaiba, Muscat•",439.0,5.0,4.0,1.708428,9.0,Azaiba,30


## 3. Prepare Features and Target

In [3]:
# Input columns the models are trained on, and the column they predict
features = ['area', 'total_rooms', 'district_encoded']
target = 'price'

# Feature matrix and target vector taken from the loaded listings frame
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report the size of each partition
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 2822
Testing samples: 706


## 4. Train and Evaluate Models

In [4]:
# Candidate regressors, keyed by the label used in the printed report and in
# the results records. The tree-based models get a fixed seed for repeatability.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# One record per model: {'Model', 'RMSE', 'R2'}, all measured on the held-out test split
results = []
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction can be chained
    preds = model.fit(X_train, y_train).predict(X_test)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"{name} — RMSE: {rmse:.2f}, R2: {r2:.2f}")
    results.append(dict(Model=name, RMSE=rmse, R2=r2))

Linear Regression — RMSE: 4675.03, R2: -4.06
Decision Tree — RMSE: 1988.07, R2: 0.09
Random Forest — RMSE: 1871.62, R2: 0.19


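## 5. Compare Model Performance

Lower RMSE and higher R² indicate a better fit. A negative R² means the model predicts the test set worse than a constant baseline that always outputs the mean of the test prices.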

In [5]:
# Tabulate the per-model metrics gathered in `results` (one row per model).
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.
pd.DataFrame(results)

Unnamed: 0,Model,RMSE,R2
0,Linear Regression,4675.02626,-4.055807
1,Decision Tree,1988.071695,0.085705
2,Random Forest,1871.619377,0.189678


## 6. Feature Importance (Random Forest)

In [6]:
# Pull the fitted random forest back out of the model dictionary and print the
# importance it assigned to each feature, formatted to two decimals.
rf = models['Random Forest']
importances = rf.feature_importances_
# Pair each feature name with its importance by position in `features`
for feat, imp in zip(features, importances):
    print(f"{feat}: {imp:.2f}")

area: 0.84
total_rooms: 0.06
district_encoded: 0.11
