## Data Inspection

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw listings CSV into a DataFrame
properties = pd.read_csv(filepath_or_buffer="../raw_data.csv")

In [3]:
# Inspect dataset: (rows, columns) of the loaded frame
properties.shape

(26147, 17)

In [4]:
# Column names, non-null counts and dtypes
properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26147 entries, 0 to 26146
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   zip_code             26147 non-null  int64  
 1   commune              26147 non-null  object 
 2   province             26147 non-null  object 
 3   type_of_property     26147 non-null  int64  
 4   subtype_of_property  26147 non-null  object 
 5   price                26111 non-null  float64
 6   building_condition   19457 non-null  object 
 7   facade_number        16785 non-null  float64
 8   living_area          26147 non-null  int64  
 9   equipped_kitchen     26147 non-null  object 
 10  bedroom_nr           26147 non-null  int64  
 11  swimming_pool        26147 non-null  int64  
 12  furnished            26147 non-null  int64  
 13  open_fire            26147 non-null  int64  
 14  terrace              26147 non-null  int64  
 15  garden               26147 non-null 

In [5]:
# Number of listings per zip code, most frequent first
properties.value_counts(subset="zip_code")

zip_code
8300    918
1000    626
1070    564
9000    564
1180    548
       ... 
7024      1
7033      1
7034      1
4287      1
9970      1
Name: count, Length: 875, dtype: int64

In [6]:
# Number of listings per commune, most frequent first
properties.value_counts(subset="commune")

commune
Westkapelle     918
Antwerpen       692
Bruxelles       626
Gent            564
Anderlecht      564
               ... 
Bilstain          1
Binkom            1
Basse-Bodeux      1
Baulers           1
Zillebeke         1
Name: count, Length: 849, dtype: int64

In [7]:
# Number of listings per province, most frequent first
properties.value_counts(subset="province")

province
Bruxelles          5127
West-Vlaanderen    4964
Oost-Vlaanderen    4364
Antwerpen          3194
Hainaut            2157
Liège              2147
Vlaams Brabant     1796
Brabant Wallon      980
Luxembourg          938
Limburg             480
Name: count, dtype: int64

In [8]:
# Frequency of each distinct value in the "garden" column
properties.value_counts(subset="garden")

garden
0        20949
100        138
1          111
200        101
50          99
         ...  
634          1
11500        1
6900         1
6996         1
7600         1
Name: count, Length: 948, dtype: int64

In [9]:
# Frequency of each bedroom count
properties.value_counts(subset="bedroom_nr")

bedroom_nr
2     9304
3     7974
1     3251
4     2973
5     1025
0      890
6      427
7      143
8       76
9       26
10      18
16       9
12       8
13       8
11       6
20       3
24       2
15       1
14       1
18       1
25       1
Name: count, dtype: int64

In [10]:
# Frequency of each property-type code
properties.value_counts(subset="type_of_property")

type_of_property
0    15843
1    10304
Name: count, dtype: int64

In [11]:
# Frequency of each distinct value in the "plot_surface" column
properties.value_counts(subset="plot_surface")

plot_surface
0        16290
200         74
120         66
150         58
250         56
         ...  
23159        1
23290        1
23503        1
23912        1
24091        1
Name: count, Length: 2192, dtype: int64

In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
properties.describe()

Unnamed: 0,zip_code,type_of_property,price,facade_number,living_area,bedroom_nr,swimming_pool,furnished,open_fire,terrace,garden,plot_surface
count,26147.0,26147.0,26111.0,16785.0,26147.0,26147.0,26147.0,26147.0,26147.0,26147.0,26147.0,26147.0
mean,5085.507821,0.39408,459494.3,2.720167,143.557425,2.599648,0.019429,0.032088,0.007458,12.193942,102.666692,407.083681
std,3283.479987,0.488661,435517.3,0.847971,97.351713,1.370721,0.138029,0.176237,0.086038,43.31655,676.906691,2650.878092
min,1000.0,0.0,27500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1743.5,0.0,265000.0,2.0,88.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4480.0,0.0,350000.0,2.0,116.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0
75%,8460.0,1.0,492100.0,3.0,170.0,3.0,0.0,0.0,0.0,15.0,0.0,255.0
max,9992.0,1.0,8900000.0,15.0,992.0,25.0,1.0,1.0,1.0,3749.0,50000.0,160737.0


## Data Cleaning

### Notes on missing values and zero values:

**Missing Values:**

* "price" column: 36 entries missing                **-> Drop**
* "building_condition": 6690 entries missing        **-> Use mode? Encode as numbers and fill with average?**
* "facade_number": 9362 entries missing             **-> Use median based on subtype of property**

**Zero Values:**

* "living_area": 5 entries with 0                   **-> Drop**
* "equipped_kitchen":                               **-> Does 0 mean no?**
* "bedroom_nr": 890 entries with 0                  **-> Take average based on living_area?**
* "garden": 20949 entries with 0                    **-> approx. 20% of properties have a garden -> matches the filter result on Immoweb** 
                                                    **-> in our own dataset it was approx. 50%, but that dataset had 60% houses; this one has 39% houses**
* "plot_surface": 16290 entries with 0              **-> what was the definition?** 


**Other:**
* "terrace":                               **->0 means no terrace, and number = surface?**
* "garden":                                **->same as terrace?**

In [13]:
# Number of missing (NaN) values per column
properties.isna().sum()

zip_code                  0
commune                   0
province                  0
type_of_property          0
subtype_of_property       0
price                    36
building_condition     6690
facade_number          9362
living_area               0
equipped_kitchen          0
bedroom_nr                0
swimming_pool             0
furnished                 0
open_fire                 0
terrace                   0
garden                    0
plot_surface              0
dtype: int64

### Calculate the missing-value threshold and drop rows with missing values in columns at or below it

In [14]:
# Calculate missing value threshold: a column may have up to 5% of its rows
# missing before dropping those rows is considered too costly
threshold = len(properties) * 0.05
print(len(properties))
print(int(threshold))

# Count missing values per column once and reuse the result for both conditions
missing_counts = properties.isna().sum()

# Columns with at least one, but no more than `threshold`, missing values.
# Despite the name, these columns are kept; only ROWS missing a value in them are dropped.
cols_to_drop = properties.columns[(missing_counts <= threshold) & (missing_counts > 0)]

print(cols_to_drop)

# Drop the rows with missing values in those columns.
# Reassign instead of using inplace=True so the frame is not mutated in place.
properties = properties.dropna(subset=cols_to_drop)

26147
1307
Index(['price'], dtype='object')


### Drop zero values in "living_area"


In [None]:
# Drop rows where 'living_area' is 0.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
properties = properties[properties["living_area"] != 0].copy()

### Handling missing values in "building_condition"

Options:

* Fill with most frequent category (mode)

* Fill with a placeholder (e.g. "Unknown")

#### Fill missing values in "building_condition" with mode

In [15]:
# Most frequent building condition; .mode() returns every tied value, so take the first
condition_modes = properties["building_condition"].mode()
mode_value = condition_modes[0]
mode_value

'good'

In [16]:
# Replace missing building conditions with the most frequent category
condition_filled = properties["building_condition"].fillna(value=mode_value)
properties["building_condition"] = condition_filled

### Replace missing values in "facade_number" with median based on subtype of property

In [None]:
# Median facade number for each property subtype, as a {subtype: median} lookup
subtype_medians = properties.groupby("subtype_of_property")["facade_number"].median()
facade_dict = subtype_medians.to_dict()

# Look up each row's subtype median and use it only where facade_number is missing
facade_fill = properties["subtype_of_property"].map(facade_dict)
properties["facade_number"] = properties["facade_number"].fillna(facade_fill)

In [None]:
# Re-count missing (NaN) values per column to verify the imputation steps
properties.isna().sum()

## Check for duplicates

In [None]:
# Flag rows that exactly repeat an earlier row, then count the flags
duplicate_mask = properties.duplicated()
num_duplicates = duplicate_mask.sum()
num_duplicates

In [None]:
# Build the de-duplicated frame, keeping the first occurrence of each repeated row
properties_unique = properties.drop_duplicates(keep="first")

In [None]:
# (rows, columns) of the de-duplicated frame
properties_unique.shape

### Notes on duplicates:

* 1284 duplicates found

## Check for blank spaces