In [1]:
import pandas as pd 
import matplotlib as plt

## Basic Data Cleaning 

In [None]:
df_sales = pd.read_csv("dubai_valuation.csv")
df_sales

Unnamed: 0,instance_date,actual_worth,row_status_code,property_type_en,property_sub_type_en,area_name_en,actual_area
0,30-04-2005,9750000.00,COMPLETED,Land,Commercial,Al Baraha,696.77
1,16-03-2009,39883500.00,COMPLETED,Land,Commercial,Al Barsha First,2470.20
2,18-04-2004,4000000.00,COMPLETED,Land,Warehouse,Ras Al Khor Industrial Second,4455.35
3,22-03-2006,3000000.00,COMPLETED,Land,Residential,Al Barsha Third,1393.55
4,8/5/05,26208000.00,COMPLETED,Land,Industrial,Al Garhoud,4869.60
...,...,...,...,...,...,...,...
83386,15-05-2025,2144443.00,COMPLETED,Unit,Flat,Al Merkadh,111.90
83387,15-05-2025,11000000.00,COMPLETED,Land,Residential,Nad Al Shiba Fourth,2322.58
83388,15-05-2025,2799996.00,COMPLETED,Land,Residential,Mirdif,929.03
83389,15-05-2025,2109414.72,COMPLETED,Unit,Flat,Marsa Dubai,77.66


In [None]:
#We are not dealing with Land only properties so remove all rows that have property type as land

df_sales = df_sales[df_sales["property_type_en"] != "Land"].reset_index(drop=True)
df_sales.index = range(1, len(df_sales) + 1)
df_sales

Unnamed: 0,instance_date,actual_worth,row_status_code,property_type_en,property_sub_type_en,area_name_en,actual_area
1,23-03-2010,2.200000e+08,COMPLETED,Building,Building,Burj Khalifa,18481.57
2,19-04-2010,1.800000e+06,COMPLETED,Building,Villa,Wadi Al Safa 6,232.48
3,11/2/10,1.400000e+06,COMPLETED,Building,Villa,Al Thanayah Fourth,172.31
4,15-02-2010,8.500000e+05,COMPLETED,Unit,Flat,Marsa Dubai,66.33
5,25-02-2010,7.500000e+05,COMPLETED,Unit,Flat,Marsa Dubai,85.38
...,...,...,...,...,...,...,...
24579,15-05-2025,1.573635e+06,COMPLETED,Unit,Flat,Al Khairan First,71.67
24580,15-05-2025,1.539247e+06,COMPLETED,Unit,Flat,Al Thanyah Fifth,110.28
24581,15-05-2025,2.144443e+06,COMPLETED,Unit,Flat,Al Merkadh,111.90
24582,15-05-2025,2.109415e+06,COMPLETED,Unit,Flat,Marsa Dubai,77.66


In [None]:
#I want to see what are all the property types we have right now
types_properties = df_sales["property_type_en"].value_counts()
sub_types_properties = df_sales["property_sub_type_en"].value_counts()
print(f"Property_type : \n{types_properties}")
print(f"Subtype_property : \n {sub_types_properties}")

Property_type : 
property_type_en
Unit        22267
Building     2316
Name: count, dtype: int64
Subtype_property : 
 property_sub_type_en
Flat                  16025
Office                 4581
Villa                  2262
Shop                    965
Hotel Apartment         354
Hotel Rooms             259
Building                 54
Warehouse                39
Workshop                 14
Sized Partition          11
Clinic                    7
Hotel                     7
Stacked Townhouses        3
Gymnasium                 2
Name: count, dtype: int64


In [None]:
'''
Since we are looking for properties for that correlate with office-commercial, normal living apartments, 
and luxuary apartments we can remove shop, warehoue, size partition, clinic, hotel,gymnasium ect.
'''
df_sales = df_sales[df_sales["property_sub_type_en"].isin(["Flat", "Office", "Villa", "Hotel Apartment", "Hotel Rooms", "Buildings"])].reset_index(drop = True)
#check if the other subcategories excluded
sub_types_properties = df_sales["property_sub_type_en"].value_counts()
print(f"Subtype_property : \n {sub_types_properties}")

Subtype_property : 
 property_sub_type_en
Flat               16025
Office              4581
Villa               2262
Hotel Apartment      354
Hotel Rooms          259
Name: count, dtype: int64


In [None]:
df_sales.index = range(1, len(df_sales) + 1)
df_sales

Unnamed: 0,instance_date,actual_worth,row_status_code,property_type_en,property_sub_type_en,area_name_en,actual_area
1,19-04-2010,1800000.00,COMPLETED,Building,Villa,Wadi Al Safa 6,232.48
2,11/2/10,1400000.00,COMPLETED,Building,Villa,Al Thanayah Fourth,172.31
3,15-02-2010,850000.00,COMPLETED,Unit,Flat,Marsa Dubai,66.33
4,25-02-2010,750000.00,COMPLETED,Unit,Flat,Marsa Dubai,85.38
5,25-03-2010,1350000.00,COMPLETED,Building,Villa,Al Thanayah Fourth,179.90
...,...,...,...,...,...,...,...
23477,15-05-2025,1573635.31,COMPLETED,Unit,Flat,Al Khairan First,71.67
23478,15-05-2025,1539247.12,COMPLETED,Unit,Flat,Al Thanyah Fifth,110.28
23479,15-05-2025,2144443.00,COMPLETED,Unit,Flat,Al Merkadh,111.90
23480,15-05-2025,2109414.72,COMPLETED,Unit,Flat,Marsa Dubai,77.66


In [None]:
#Another major issue is the instance_data is not consistent so will convert all to mm/dd/yyyy format.
df_sales["instance_date"] = pd.to_datetime(df_sales["instance_date"],errors = "coerce",infer_datetime_format=True) #Letting pandas determine which time format it is in
df_sales["instance_date"] = df_sales["instance_date"].dt.strftime("%m/%d/%Y")
df_sales

  df["instance_date"] = pd.to_datetime(df["instance_date"],errors = "coerce",infer_datetime_format=True) #Letting pandas determine which time format it is in
  df["instance_date"] = pd.to_datetime(df["instance_date"],errors = "coerce",infer_datetime_format=True) #Letting pandas determine which time format it is in


Unnamed: 0,instance_date,actual_worth,row_status_code,property_type_en,property_sub_type_en,area_name_en,actual_area
1,04/19/2010,1800000.00,COMPLETED,Building,Villa,Wadi Al Safa 6,232.48
2,,1400000.00,COMPLETED,Building,Villa,Al Thanayah Fourth,172.31
3,02/15/2010,850000.00,COMPLETED,Unit,Flat,Marsa Dubai,66.33
4,02/25/2010,750000.00,COMPLETED,Unit,Flat,Marsa Dubai,85.38
5,03/25/2010,1350000.00,COMPLETED,Building,Villa,Al Thanayah Fourth,179.90
...,...,...,...,...,...,...,...
23477,05/15/2025,1573635.31,COMPLETED,Unit,Flat,Al Khairan First,71.67
23478,05/15/2025,1539247.12,COMPLETED,Unit,Flat,Al Thanyah Fifth,110.28
23479,05/15/2025,2144443.00,COMPLETED,Unit,Flat,Al Merkadh,111.90
23480,05/15/2025,2109414.72,COMPLETED,Unit,Flat,Marsa Dubai,77.66


In [None]:
#now lets look into status_code to see what work would be incomplete
df_sales_status_count = df_sales["row_status_code"].value_counts()
df_sales_status_count

row_status_code
COMPLETED    23463
COMMITTED       10
ENTERED          8
Name: count, dtype: int64

In [None]:
#We can see that most of the data points that are not corresponding to completed status are projects 
#initiated in recent years with a few null value (16.67%)
df_status_not_completed = df_sales[df_sales["row_status_code"].isin(["COMMITTED","ENTERED"])].reset_index(drop=True)
df_sales_status_not_completed.index = range(1, len(df_status_not_completed) + 1)
df_status_not_completed

Unnamed: 0,instance_date,actual_worth,row_status_code,property_type_en,property_sub_type_en,area_name_en,actual_area
1,,908473.0,ENTERED,Unit,Office,Business Bay,105.5
2,,2669.0,ENTERED,Unit,Office,Al Thanyah Fifth,248.05
3,11/20/2023,3650000.0,ENTERED,Unit,Flat,Al Hebiah First,488.79
4,07/22/2024,1200000.0,ENTERED,Unit,Flat,Business Bay,77.03
5,09/30/2024,2500000.0,COMMITTED,Unit,Flat,Marsa Dubai,153.59
6,12/26/2024,55000000.0,COMMITTED,Building,Villa,World Islands,1699.43
7,11/19/2023,2500000.0,COMMITTED,Unit,Flat,Zaabeel Second,111.28
8,07/15/2024,3000000.0,ENTERED,Unit,Hotel Rooms,World Islands,144.09
9,12/22/2023,2150000.0,ENTERED,Building,Villa,Al Thanayah Fourth,207.08
10,04/23/2025,35.0,ENTERED,Unit,Flat,Al Barsha South Fourth,35.77
