In [47]:
#importing necessay libs
import pandas as pd
import numpy as np 
import warnings

warnings.filterwarnings("ignore")

In [48]:
#loading Datasets
dataframe=pd.read_csv(r"P:\webscraper-UAE-housing\Dataset\uae-housing_dataset.csv")
dataframe.head()

Unnamed: 0,price,bedroom,bathroom,area(sqft),country,city,address,propert_type,purpose,furnishing,completion_status,handover,project_name
0,1800000,3 beds,4 baths,"1,208 sqft",UAE,Dubai,DAMAC Hills 2 (Akoya by DAMAC),Townhouse,Sale,Unfurnished,Off-Plan,Q2 2025,Camelia Villas
1,325000,Studio,1 bath,483 sqft,UAE,Dubai,International City,Apartment,Sale,Unfurnished,Ready,Q2 2025,Camelia Villas
2,470000,Studio,1 bath,428 sqft,UAE,Dubai,Arjan,Apartment,Sale,Furnished,Ready,Q2 2025,Camelia Villas
3,1120000,1 bed,1 bath,706 sqft,UAE,Dubai,Dubai South,Apartment,Sale,Unfurnished,Off-Plan,Q3 2029,Camelia Villas
4,2127888,2 beds,2 baths,"1,149 sqft",UAE,Dubai,Dubai South,Apartment,Sale,Unfurnished,Off-Plan,Q3 2029,Camelia Villas


In [49]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5378 entries, 0 to 5377
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   price              5378 non-null   object
 1   bedroom            5378 non-null   object
 2   bathroom           5378 non-null   object
 3   area(sqft)         5378 non-null   object
 4   country            5378 non-null   object
 5   city               5378 non-null   object
 6   address            5378 non-null   object
 7   propert_type       5378 non-null   object
 8   purpose            5378 non-null   object
 9   furnishing         5378 non-null   object
 10  completion_status  5378 non-null   object
 11  handover           5378 non-null   object
 12  project_name       5378 non-null   object
dtypes: object(13)
memory usage: 546.3+ KB


In [50]:
dataframe=dataframe.drop_duplicates(keep="first",ignore_index=True)

In [51]:
def clean_bedroom(bedroom:pd.Series):
    bedroom=bedroom.apply(lambda x:x.lower())
    for i in range(len(dataframe['bedroom'])):
        if "bed" in bedroom[i]:
            bedroom[i]=bedroom[i].split()[0]
        elif "studio" in bedroom[i]:
            bedroom[i]=float(0.5)

    return bedroom    

In [52]:
def quater_to_float(handover_data):
    """this will convert your handover data into float"""
    q,y=handover_data.split()
    q=int(q[1])
    return  int(y) + (q-1)*0.25

In [53]:
dataframe['price']=dataframe['price'].str.replace(",","").astype(int)
dataframe["bedroom"]=clean_bedroom(dataframe['bedroom']).astype(float)
dataframe['bathroom']=(dataframe['bathroom'].str.split().str[0]).astype(int)
dataframe['area(sqft)']=(dataframe['area(sqft)'].str.split().str[0]).str.replace(",","").astype(float)
dataframe['handover']=dataframe['handover'].apply(quater_to_float)

In [54]:
dataframe.head()

Unnamed: 0,price,bedroom,bathroom,area(sqft),country,city,address,propert_type,purpose,furnishing,completion_status,handover,project_name
0,1800000,3.0,4,1208.0,UAE,Dubai,DAMAC Hills 2 (Akoya by DAMAC),Townhouse,Sale,Unfurnished,Off-Plan,2025.25,Camelia Villas
1,325000,0.5,1,483.0,UAE,Dubai,International City,Apartment,Sale,Unfurnished,Ready,2025.25,Camelia Villas
2,470000,0.5,1,428.0,UAE,Dubai,Arjan,Apartment,Sale,Furnished,Ready,2025.25,Camelia Villas
3,1120000,1.0,1,706.0,UAE,Dubai,Dubai South,Apartment,Sale,Unfurnished,Off-Plan,2029.5,Camelia Villas
4,2127888,2.0,2,1149.0,UAE,Dubai,Dubai South,Apartment,Sale,Unfurnished,Off-Plan,2029.5,Camelia Villas


In [55]:
#cleaing the dataset
dataframe=dataframe.drop(columns=["country","purpose",'city'])
dataframe=dataframe.drop_duplicates(keep="first")

In [56]:
#manual encoding
maping_furnishing={'Unfurnished':0,'Furnished':1}
maping_completion_status={'Off-Plan':0,'Ready':1}

dataframe['furnishing']=dataframe['furnishing'].map(maping_furnishing)
dataframe['completion_status']=dataframe['completion_status'].map(maping_completion_status)

project_maping=dataframe['project_name'].value_counts()
dataframe['project_name']=dataframe['project_name'].map(project_maping)

In [57]:
y=dataframe["price"]
X=dataframe.drop(columns="price")

In [58]:
catge=[feature for feature in X.columns if X[feature].dtype =="O"]

for feature in catge:
    X[feature]=X[feature].apply(lambda x: x.lower())

In [59]:
#log transformation..
y=np.log(y+1)
X['area(sqft)']=np.log(X['area(sqft)']+1)

In [60]:
from sklearn.model_selection import train_test_split

#spliting the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [61]:
X=X[X['propert_type']!="residential building"]
X['propert_type']=X['propert_type'].replace({"villa compound":"villa"})
X=X[X['area(sqft)']!=13149888.0]

In [62]:
X['address']=X['address'].str.replace(" ","")
vc_address=dict(X['address'].value_counts())
def rearrange_address(address):
    global vc_address

    result=vc_address.get(address)
    if result <=5:
        address ="others"
    else :
        pass
    return address

X['address']=X['address'].apply(rearrange_address)

In [63]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder


ct=ColumnTransformer([
    ("OHE",OneHotEncoder(sparse_output=True,handle_unknown="ignore"),["address","propert_type"]),
    ("minmax_scaler",MinMaxScaler(feature_range=(0,1)),["bedroom","bathroom","handover","project_name"])
],remainder="passthrough")

ct.fit(X_train)

X_train=ct.transform(X_train)
X_test=ct.transform(X_test)

X_train=X_train.toarray()
X_test=X_test.toarray()


In [64]:
from sklearn.linear_model import LinearRegression ,Ridge ,Lasso
from sklearn.tree import DecisionTreeRegressor

models={
    "linear regresson":LinearRegression(),
    "ridge":Ridge(),
    "lasso":Lasso(),
    "decision tree regressor" : DecisionTreeRegressor()
}

In [65]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

def test_acc(y_test,y_pred):
    r2score=r2_score(y_test,y_pred)
    mae=mean_absolute_error(y_test,y_pred)
    mse=mean_squared_error(y_test,y_pred)

    return r2score,mae,mse

In [66]:
def model_test(X_train,X_test,y_train,y_test,models):
    models_list = []
    accuracy_list = []
    auc= []
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) 

        # Make predictions
        y_pred = model.predict(X_test)

        r2score,mae,mse=test_acc(y_test,y_pred)
        models_list.append(list(models.keys())[i])

        accuracy_list.append(r2score)

    report=pd.DataFrame(list(zip(models_list, accuracy_list)), columns=['Model Name', 'Accuracy']).sort_values(by=['Accuracy'], ascending=False)

    return report

In [67]:
model_test(X_train,X_test,y_train,y_test,models)

Unnamed: 0,Model Name,Accuracy
1,ridge,0.8779871
3,decision tree regressor,0.8570622
2,lasso,-4.257858e-06
0,linear regresson,-1.777656e+22


In [24]:
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def test_acc(y_true, y_pred):
    return r2_score(y_true, y_pred), mean_absolute_error(y_true, y_pred), mean_squared_error(y_true, y_pred)

def model_test(X_train, X_test, y_train, y_test, models):
    model_names = []
    r2_list = []

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2score, mae, mse = test_acc(y_test, y_pred)
        model_names.append(name)
        r2_list.append(r2score)

    report = pd.DataFrame({
        'Model Name': model_names,
        'R2 Score': r2_list
    }).sort_values(by='R2 Score', ascending=False)

    return report


In [25]:
model_test(X_train,X_test,y_train,y_test,models)

Unnamed: 0,Model Name,R2 Score
1,ridge,0.8779871
3,decision tree regressor,0.8575458
2,lasso,-4.257858e-06
0,linear regresson,-1.777656e+22


In [72]:
from sklearn.preprocessing import PowerTransformer,StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge,LogisticRegression,ElasticNet,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score,mean_absolute_error,root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor,StackingRegressor



# Models to evaluate
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    RandomForestRegressor(),



]

# Metrics to evaluate
accuracy_metrics = [mean_absolute_error, root_mean_squared_error, r2_score]


for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    for metric in accuracy_metrics:
        score = metric(y_test, y_pred)
        result = {"model":model.__class__.__name__,"metric":metric.__name__,"score":score}
        print(f'The {metric.__name__} of {model.__class__.__name__} is: {score}')
    print('-----------------------------------------------------------------')
        

The mean_absolute_error of LinearRegression is: 5836499162.703476
The root_mean_squared_error of LinearRegression is: 120222387855.16335
The r2_score of LinearRegression is: -1.7776559438474733e+22
-----------------------------------------------------------------
The mean_absolute_error of DecisionTreeRegressor is: 0.2131549673061211
The root_mean_squared_error of DecisionTreeRegressor is: 0.33683143919199593
The r2_score of DecisionTreeRegressor is: 0.8604588647774938
-----------------------------------------------------------------
The mean_absolute_error of Ridge is: 0.19635098298259757
The root_mean_squared_error of Ridge is: 0.31496648029629276
The r2_score of Ridge is: 0.8779871182602517
-----------------------------------------------------------------
The mean_absolute_error of Lasso is: 0.7012380977517022
The root_mean_squared_error of Lasso is: 0.9017007265184392
The r2_score of Lasso is: -4.257858144729454e-06
-----------------------------------------------------------------


## Observation from the result:
#### *Model Seletected (Decision Tree Regressor - Ridge Regressor - Random Forest Regresson)*

## *Hyper paramnet tuning*

In [73]:
from sklearn.model_selection import GridSearchCV

### * *Decision Tree Regressor*

In [76]:
params_grid={
    "max_depth":[1,2,3,5,10,15,20,None],
    "min_samples_split":[1,2,5,10],
    "min_samples_leaf":[1,2,3,5],
    "criterion":["squared_error","absolute_error","poisson","friedman_mse"]
}

grid_search_dt=GridSearchCV(estimator=DecisionTreeRegressor(),param_grid=params_grid,cv=8)

grid_search_dt.fit(X_train,y_train)

print("Best Parameters:",grid_search_dt.best_params_)
print("Best Score:",grid_search_dt.best_score_)

Best Parameters: {'criterion': 'absolute_error', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best Score: 0.868164577219623


### * *Ridge Regressor*

In [81]:
params_grid={
    'alpha': [0.01, 0.1, 1, 10, 100, 1000]
}

grid_search_rr=GridSearchCV(estimator=Ridge(),cv=8,param_grid=params_grid,scoring="r2")

grid_search_rr.fit(X_train,y_train)

print("Best alpha:", grid_search_rr.best_params_['alpha'])
print("Best R² score:", grid_search_rr.best_score_)

Best alpha: 0.1
Best R² score: 0.9263135545995613


### * *Random Forest Regressor*

In [84]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 200, 300, 500, 800],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42),param_distributions=param_dist,n_iter=50,cv=5,verbose=1,random_state=42,n_jobs=-1,scoring='r2')

random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best R² Score:", random_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
Best R² Score: 0.916495451075779


## CHANGES

In [26]:
df=dataframe.copy()

In [27]:
df=df[df['propert_type']!="residential building"]
df['propert_type']=df['propert_type'].replace({"villa compound":"villa"})
df=df[df['area(sqft)']!=13149888.0]





In [28]:
df['address']=df['address'].str.replace(" ","")
vc_address=dict(df['address'].value_counts())
def rearrange_address(address):
    global vc_address

    result=vc_address.get(address)
    if result <=5:
        address ="others"
    else :
        pass
    return address

df['address']=df['address'].apply(rearrange_address)

In [29]:
project_maping=df['project_name'].value_counts()
df['project_name']=df['project_name'].map(project_maping)

In [30]:
df.head()

Unnamed: 0,price,bedroom,bathroom,area(sqft),address,propert_type,furnishing,completion_status,handover,project_name
0,1800000,3.0,4,1208.0,DAMACHills2(AkoyabyDAMAC),Townhouse,0,0,2025.25,203
1,325000,0.5,1,483.0,InternationalCity,Apartment,0,1,2025.25,203
2,470000,0.5,1,428.0,Arjan,Apartment,1,1,2025.25,203
3,1120000,1.0,1,706.0,DubaiSouth,Apartment,0,0,2029.5,203
4,2127888,2.0,2,1149.0,DubaiSouth,Apartment,0,0,2029.5,203


In [31]:
df['area(sqft)']=np.log(df['area(sqft)']+1)
df['price']=np.log(df['price']+1)

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

y1=df["price"]
X1=df.drop(columns="price")

#spliting the dataset
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.33, random_state=42)

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler


ct=ColumnTransformer([
    ("OHE",OneHotEncoder(sparse_output=True,handle_unknown="ignore"),["address","propert_type"]),
    ("minmax_scaler",MinMaxScaler(feature_range=(0,1)),["bedroom","bathroom","handover","project_name"])
],
                     remainder="passthrough")

ct.fit(X_train1)

X_train1=ct.transform(X_train1)
X_test1=ct.transform(X_test1)

X_train1=X_train1.toarray()
X_test1=X_test1.toarray()


In [34]:
from sklearn.linear_model import LinearRegression
lr1=LinearRegression()

lr1.fit(X_train1,y_train1)

y_pred1=lr1.predict(X_test1)

from sklearn.metrics import r2_score ,mean_absolute_error,mean_squared_error
acc11=r2_score(y_test1,y_pred1)
mse11=mean_squared_error(y_test1,y_pred1)
mae11=mean_absolute_error(y_test1,y_pred1)

print(f"r2score: {acc11}, mse: {mse11} , mae: {mae11}")

r2score: 0.9090549061742248, mse: 0.07503147263190692 , mae: 0.19746849792290078
