# IMMOWEB PROJECT: EXPLORATORY DATA ANALYSIS

## LIBRARIES AND SETTINGS

In [1]:
# Set the notebook to show all outputs in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from ydata_profiling import ProfileReport

# This is for reading the locality name properly. In avoiding the encoding error
import csv
import os

import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Pandas options for data wrangling and output set-up 
import pandas as pd
pd.set_option('display.max_columns', None) # display all columns
pd.set_option('display.expand_frame_repr', False) # print all columns and in the same line
pd.set_option('display.max_colwidth', None) # display the full content of each cell
pd.set_option('display.float_format', lambda x: '%.2f' %x) # floats to be displayed with 2 decimal places

## FUNCTIONS USED RECURRENTLY

In [2]:
# Function to run basic data frame description
def Descriptives(df):
    print("Looking at the shape to see the number of records:", df.shape)
    print("\nDescription of the file to check values range:")
    print(df.describe().transpose())
    print("\nAttributes and respective data types:")
    print(df.info())

def MemOptimisation(df):
    print(f"\nAmount of memory used by all attributes: {df.memory_usage(deep=True).sum()}\n")
    
    # Optimise memory usage
    for i in categoric_cols:
        df[i] = df[i].astype('category')
    for i in numeric_cols:
        df[i] = pd.to_numeric(df[i], downcast='integer')
        df[i] = pd.to_numeric(df[i], downcast='float')    

    print(df.info(memory_usage='deep'))
    print("\nAmount of memory used now by all attributes: ",df.memory_usage(deep=True).sum())
    
# Function to check the missing values (NaNs)
def Missing(df):
    try:
        assert df.notnull().all().all()
        print("Good news! There are no missing values.")
    
    except AssertionError:
        print("Count of missing values:")
        print(df.isna().sum())
        print("\nPercentage of missing values:")
        print(df.isna().mean().round(4)*100, "\n")



## START OF ANALYSIS: LOAD DATA & CLEAN THE VARIABLES

In [3]:
# Load the data (CSV file)
df = pd.read_csv("Data/immoweb-dataset.csv")

In [4]:
# Ensure no leading/trailing spaces in the column names
df.columns = [col.strip() for col in df.columns]

In [5]:
# Show information about the data
Descriptives(df)

Looking at the shape to see the number of records: (80368, 53)

Description of the file to check values range:
                            count        mean        std        min         25%         50%         75%         max
Unnamed: 0               80368.00   209681.71  105460.84       0.00   181965.75   235124.50   257006.25   446550.00
id                       80368.00 19529221.31 2897688.01 1882546.00 20350553.25 20529847.50 20614940.75 20664387.00
bedroomCount             73558.00        3.01       1.83       1.00        2.00        3.00        4.00      100.00
bathroomCount            66672.00        1.43       4.56       1.00        1.00        1.00        2.00     1146.00
postCode                 80368.00     5266.59    3045.70    1000.00     2330.00     4830.00     8420.00     9992.00
habitableSurface         67783.00      178.50     820.88       6.00       93.00      140.00      200.00   185347.00
roomCount                21948.00        7.11       5.82       1.00        1.

### TARGET VARIABLE: PRICE

In [6]:
# Check the missing values in price
Missing(df['price'])

Count of missing values:
3998

Percentage of missing values:
4.97 



In [7]:
# Price is the target variable, we cannot input a value for the missing values
# as we cannot check the reliability of the ML model againt an inputted value: model would be biased
df = df.dropna(subset=['price'])

### MISSING VALUES OVERALL

In [8]:
Missing(df)

Count of missing values:
Unnamed: 0                      0
id                              0
url                             0
type                            0
subtype                         0
bedroomCount                 2820
bathroomCount                9705
province                        0
locality                        0
postCode                        0
habitableSurface             8590
roomCount                   54423
monthlyCost                 76370
hasAttic                    63856
hasBasement                 47057
hasDressingRoom             73742
diningRoomSurface           69471
hasDiningRoom               62231
buildingCondition           18381
buildingConstructionYear    27268
facedeCount                 23136
floorCount                  38827
streetFacadeWidth           60860
hasLift                     57327
floodZoneType               33466
heatingType                 29220
hasHeatPump                 68897
hasPhotovoltaicPanels       68322
hasThermicPanels       

In [9]:
# We see there are variables with 100% missing values, so nothing to do with them: DROP
# Start in list with all variables to drop
dropVar = []
dropVar = df.columns[df.isna().sum() == len(df)].tolist()
dropVar

['monthlyCost', 'hasBalcony', 'accessibleDisabledPeople']

### UNNAMED

In [10]:
# This is an index variable, as presumabily the data comes from a dataframe that was saved as CSV keeping the index column
dropVar += ['Unnamed: 0']
dropVar

['monthlyCost', 'hasBalcony', 'accessibleDisabledPeople', 'Unnamed: 0']

### ID & URL

In [11]:
# These variables have no value for predicting price: they can be dropped
# But, first checking if there are duplicated records
df.duplicated(subset=['id','url'], keep=False).sum()

0

In [12]:
dropVar += ['id', 'url']
dropVar


['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url']

### TYPE & SUBTYPE

In [13]:
# Checking the different values in type
df.type.value_counts()

type
HOUSE        45228
APARTMENT    31142
Name: count, dtype: int64

In [14]:
ad_types = df[['type', 'subtype']]
ad_types.groupby(['type', 'subtype']).value_counts()

type       subtype             
APARTMENT  APARTMENT               23844
           DUPLEX                   1691
           FLAT_STUDIO              1381
           GROUND_FLOOR             1771
           KOT                       331
           LOFT                      259
           PENTHOUSE                1265
           SERVICE_FLAT              449
           TRIPLEX                   151
HOUSE      APARTMENT_BLOCK          2454
           BUNGALOW                  427
           CASTLE                     68
           CHALET                    166
           COUNTRY_COTTAGE           309
           EXCEPTIONAL_PROPERTY      813
           FARMHOUSE                 289
           HOUSE                   33971
           MANOR_HOUSE                46
           MANSION                   544
           MIXED_USE_BUILDING       1840
           OTHER_PROPERTY             54
           PAVILION                    1
           TOWN_HOUSE                477
           VILLA         

In [15]:
# Looks correct
# Apply sentence case to remain consistent with the rest of string variables
df[['type', 'subtype']] = df[['type', 'subtype']].apply(lambda col: col.str.strip().str.capitalize())

In [16]:
Missing(df[['type', 'subtype']])

Good news! There are no missing values.


The variable subtype shows many different values (high cardinality)
When developping ML model, we should consider grouping to reduce cardinality

### GEO VARIABLES: PROVINCE, LOCALITY, POSTCODE & REGION

In [17]:
# Create a function to assign Regions to the locality
def map_region(row):
    loc = str(row["locality"]).strip()
    prov = row["province"]
    if loc in german_towns:
        return "German-speaking Community"
    if prov == "Brussels":
        return "Brussels"
    if prov in flemish:
        return "Flanders"
    if prov in walloon:
        return "Wallonia"
    return "Unknown"

In [18]:
# Add Region column
flemish = ["Antwerp", "Limburg", "East Flanders", "Flemish Brabant", "West Flanders"]
walloon = ["Hainaut", "Liège", "Luxembourg", "Namur", "Walloon Brabant"]
german_towns = [
        "Eupen", "Kelmis", "Raeren", "Lontzen", "Bütgenbach",
        "Büllingen", "Amel", "Burg-Reuland", "St. Vith"
    ]

df["region"] = df.apply(map_region, axis=1)

In [19]:
df.region.value_counts()

region
Flanders                     46362
Wallonia                     21761
Brussels                      8124
German-speaking Community      123
Name: count, dtype: int64

In [20]:
df.postCode.value_counts()

postCode
8300    2169
8400    1219
9000    1027
1180     926
1000     866
        ... 
6986       1
3732       1
3742       1
3792       1
7521       1
Name: count, Length: 1098, dtype: int64

In [21]:
# Move "Region" column to left of "province"
region_series = df.pop("region")
province_idx = df.columns.get_loc("province")
df.insert(province_idx, "region", region_series)

# Strip whitespace in locality, region, province, postCode
df[["locality", "region", "province", "postCode"]] = df[["locality", "region", "province", "postCode"]].astype(str).apply(lambda x: x.str.strip())

In [22]:
Missing(df[["locality", "region", "province", "postCode"]])

Good news! There are no missing values.


In [23]:
# Discard locality as it is the same as postCode
dropVar += ["locality"]
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality']

### ROOMS COUNT

#### BEDROOMCOUNT & ROOMCOUNT

In [24]:
# These variables are very similar (advertisers often do not distinguish between bedroom and room)
# and roomCount present high number of missing values
Missing(df[['bedroomCount', 'roomCount']])

Count of missing values:
bedroomCount     2820
roomCount       54423
dtype: int64

Percentage of missing values:
bedroomCount    3.69
roomCount      71.26
dtype: float64 



In [25]:
# for records with missing bedroomCount, take roomCount if exists
df['bedroomCount'] = df['bedroomCount'].fillna(df['roomCount'])

In [26]:
# Nan remaining filled with No info
df['bedroomCount'] = df['bedroomCount'].fillna('No info')

In [27]:
# Then discard roomCount
dropVar += ['roomCount']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount']

#### HASDRESSINGROOM, DININGROOMSURFACE & HASDININGROOM


In [28]:
Missing(df[['hasDressingRoom', 'diningRoomSurface', 'hasDiningRoom','hasLivingRoom','livingRoomSurface']])

Count of missing values:
hasDressingRoom      73742
diningRoomSurface    69471
hasDiningRoom        62231
hasLivingRoom        33500
livingRoomSurface    47423
dtype: int64

Percentage of missing values:
hasDressingRoom     96.56
diningRoomSurface   90.97
hasDiningRoom       81.49
hasLivingRoom       43.87
livingRoomSurface   62.10
dtype: float64 



In [29]:
# Discard the variables
dropVar += ['hasDressingRoom', 'diningRoomSurface', 'hasDiningRoom','hasLivingRoom','livingRoomSurface']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface']

#### HASOFFICE

In [30]:
# hasoffice may be a nice
# but how to differenciate with nuumber of rooms and bedrooms?
# Is it counted twice?
# also shows many missing values
Missing(df['hasOffice'])

Count of missing values:
65985

Percentage of missing values:
86.4 



In [31]:
# Discard the variables
dropVar += ['hasOffice']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface',
 'hasOffice']

#### KITCHENSURFACE & KITCHEN TYPE


In [32]:
Missing(df[['kitchenSurface', 'kitchenType']])

Count of missing values:
kitchenSurface    52109
kitchenType       34458
dtype: int64

Percentage of missing values:
kitchenSurface   68.23
kitchenType      45.12
dtype: float64 



In [33]:
# kitchenType can be relevant even though it has a large number of Nan
# Recode Nan as No info category
df['kitchenType'] = df['kitchenType'].fillna('No info')


In [34]:
# Discard the variables
dropVar += ['kitchenSurface']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface',
 'hasOffice',
 'kitchenSurface']

#### BATHROOMCOUNT & TOILETCOUNT

These variables are very similar
and there is no information about separate or not toilet

In [35]:
Missing(df[['bathroomCount', 'toiletCount']])

Count of missing values:
bathroomCount     9705
toiletCount      21280
dtype: int64

Percentage of missing values:
bathroomCount   12.71
toiletCount     27.86
dtype: float64 



In [36]:
# First we assume that any property that has missing value for bathroomCount and has 1 toiletCount, has 1 bathroom
mask = df['bathroomCount'].isna() & (df['toiletCount'] == 1)
df.loc[mask, 'bathroomCount'] = 1

In [37]:
# For the moment, we keep toiletCount as may be used to fill bathroomCount
# Nan remaining as No info
df['bathroomCount'] = df['bathroomCount'].fillna('No info')
df['toiletCount'] = df['toiletCount'].fillna('No info')

In [38]:
Missing(df[['bathroomCount', 'toiletCount']])

Good news! There are no missing values.


### PARKINGCOUNTINDOOR & PARKINGCOUNTOUTDOOR


In [39]:
Missing(df[['parkingCountIndoor', 'parkingCountOutdoor']])

Count of missing values:
parkingCountIndoor     48567
parkingCountOutdoor    58275
dtype: int64

Percentage of missing values:
parkingCountIndoor    63.59
parkingCountOutdoor   76.31
dtype: float64 



In [40]:
# Parking is a valuable asset, if seller dont specify then we assume there is no parking
df['parkingCountIndoor'] = df['parkingCountIndoor'].fillna(0)
df['parkingCountOutdoor'] = df['parkingCountOutdoor'].fillna(0)

In [41]:
Missing(df[['parkingCountIndoor', 'parkingCountOutdoor']])

Good news! There are no missing values.


### HASGARDEN, GARDENSURFACE, GARDENORIENTATION & HASTERRACE, TERRACESURFACE, TERRACEORIENTATIONn


In [42]:
Missing(df[['hasGarden', 'gardenSurface', 'gardenOrientation', 'hasTerrace', 'terraceSurface', 'terraceOrientation']])

Count of missing values:
hasGarden             60413
gardenSurface         60413
gardenOrientation     70769
hasTerrace            28943
terraceSurface        47772
terraceOrientation    65108
dtype: int64

Percentage of missing values:
hasGarden            79.11
gardenSurface        79.11
gardenOrientation    92.67
hasTerrace           37.90
terraceSurface       62.55
terraceOrientation   85.25
dtype: float64 



In [43]:
print(df['hasGarden'].value_counts())
print(df['hasTerrace'].value_counts())

hasGarden
True    15957
Name: count, dtype: int64
hasTerrace
True    47427
Name: count, dtype: int64


In [44]:
# Recode to 1 (has) and O (don't)
df['hasGarden'] = df['hasGarden'].apply(lambda x: 1 if x == True else 0)
df['hasTerrace'] = df['hasTerrace'].apply(lambda x: 1 if x == True else 0)

In [45]:
# If no garden / no terrace then surface is 0
df.loc[df['hasGarden'] == 0, 'gardenSurface'] = 0
df.loc[df['hasTerrace'] == 0, 'terraceSurface'] = 0

In [46]:
# If no garden / no terrace then Orientation is empty
df.loc[df['hasGarden'] == 0, 'gardenOrientation'] = "No garden"
df.loc[df['hasTerrace'] == 0, 'terraceOrientation'] = "No terrace"



In [47]:
df['gardenSurface'] = df['gardenSurface'].fillna('No info')
df['terraceSurface'] = df['terraceSurface'].fillna('No info')
df['gardenOrientation'] = df['gardenOrientation'].fillna('No info')
df['terraceOrientation'] = df['terraceOrientation'].fillna('No info')

In [48]:
# Finally we decide to remove Surface and Orientation as they have lower importance
dropVar += ['gardenSurface', 'gardenOrientation', 'terraceSurface', 'terraceOrientation']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface',
 'hasOffice',
 'kitchenSurface',
 'gardenSurface',
 'gardenOrientation',
 'terraceSurface',
 'terraceOrientation']

In [49]:
'hasGarden', 'gardenSurface', 'gardenOrientation', 'hasTerrace', 'terraceSurface', 'terraceOrientation'

('hasGarden',
 'gardenSurface',
 'gardenOrientation',
 'hasTerrace',
 'terraceSurface',
 'terraceOrientation')

### HASSWIMMINGPOOL

In [50]:
df['hasSwimmingPool'].value_counts()

hasSwimmingPool
True    1816
Name: count, dtype: int64

In [51]:
# hasswimmingpool has many missing values,but in Belgium it is rare to have a swimming pool
# we can assume that missing values mean there is no swimming pool
# and that having swimming pool correlates with price
df['hasSwimmingPool'] = df['hasSwimmingPool'].apply(lambda x: 1 if x == True else 0)

### HASVISIOPHONE & HASARMOREDDOOR & HASAIRCONDITIONING & HASFIREPLACE


In [52]:
# These may be considered as extras but probably not one that a buyer would consider
# and shows many missing values
# and a feature that a seller would not care about informing
Missing(df[['hasArmoredDoor', 'hasVisiophone', 'hasAirConditioning', 'hasFireplace']])

Count of missing values:
hasArmoredDoor        72672
hasVisiophone         60379
hasAirConditioning    75246
hasFireplace          73326
dtype: int64

Percentage of missing values:
hasArmoredDoor       95.16
hasVisiophone        79.06
hasAirConditioning   98.53
hasFireplace         96.01
dtype: float64 



In [53]:
# Discard the variables
dropVar += ['hasArmoredDoor', 'hasVisiophone', 'hasAirConditioning', 'hasFireplace']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface',
 'hasOffice',
 'kitchenSurface',
 'gardenSurface',
 'gardenOrientation',
 'terraceSurface',
 'terraceOrientation',
 'hasArmoredDoor',
 'hasVisiophone',
 'hasAirConditioning',
 'hasFireplace']

### FLOOD ZONE TYPE


In [54]:
Missing(df['floodZoneType'])

Count of missing values:
33466

Percentage of missing values:
43.82 



In [55]:
# Whether the property is in a potential flooding area is relevant
# Recode Nan to No info
df['floodZoneType'] = df['floodZoneType'].fillna('No info')

### HASATTIC & HASBASEMENT


In [56]:
Missing(df[['hasAttic','hasBasement']])

Count of missing values:
hasAttic       63856
hasBasement    47057
dtype: int64

Percentage of missing values:
hasAttic      83.61
hasBasement   61.62
dtype: float64 



In [57]:
# There are many Nan but this is a feature a seller would highlight
# Recoding 0/1 and assuming Nan means no attic or basement
df['hasAttic'] = df['hasAttic'].apply(lambda x: 1 if x == True else 0)
df['hasBasement'] = df['hasBasement'].apply(lambda x: 1 if x == True else 0)

### HASLIFT

In [58]:
Missing(df.loc[df['type'] == 'House', 'hasLift'])

Count of missing values:
44558

Percentage of missing values:
98.52 



In [59]:
# Most houses dont have a lift
df.loc[(df['type'] == 'House') & (df['hasLift'].isna()), 'hasLift'] = 0

In [60]:
Missing(df.loc[df['type'] == 'Apartment', 'hasLift'])

Count of missing values:
12769

Percentage of missing values:
41.0 



In [61]:
# Apartments often have a lift we can assume that 41% Nan mean No lift, for 59% with lift
df.loc[(df['type'] == 'Apartment') & (df['hasLift'].isna()), 'hasLift'] = 0

### HEATING

In [62]:
Missing(df[['heatingType']])

Count of missing values:
heatingType    29220
dtype: int64

Percentage of missing values:
heatingType   38.26
dtype: float64 



In [63]:
Missing(df[['hasHeatPump', 'hasPhotovoltaicPanels', 'hasThermicPanels']])

Count of missing values:
hasHeatPump              68897
hasPhotovoltaicPanels    68322
hasThermicPanels         73258
dtype: int64

Percentage of missing values:
hasHeatPump             90.21
hasPhotovoltaicPanels   89.46
hasThermicPanels        95.93
dtype: float64 



In [64]:
# 🛠 Convertir les colonnes en 0/1 (si ce sont des floats)
for col in ["hasHeatPump", "hasPhotovoltaicPanels", "hasThermicPanels"]:
    df[col] = df[col].fillna(0).astype(int)

In [65]:
# 🔆 Créer un masque pour les lignes où au moins une source solaire est activée
solar_mask = (
    (df["hasThermicPanels"] == 1) | (df["hasPhotovoltaicPanels"] == 1)
)

In [66]:
# ✏️ Modifier uniquement les lignes où heatingType est manquant
modif_count = df.loc[solar_mask & df["heatingType"].isna(), "heatingType"].shape[0]
df.loc[solar_mask & df["heatingType"].isna(), "heatingType"] = "SOLAR"
print(f"✅ {modif_count} lignes modifiées avec 'SOLAR' dans 'heatingtype'.")

✅ 6102 lignes modifiées avec 'SOLAR' dans 'heatingtype'.


In [67]:
# heatingType fill Nan with Heat pump where True
df.loc[(df["heatingType"].isna()) & (df["hasHeatPump"] == 1), "heatingType"] = "Heat Pump"

In [68]:
df["heatingType"].value_counts()

heatingType
GAS          33210
FUELOIL       8969
SOLAR         6166
ELECTRIC      4042
Heat Pump     2773
PELLET         647
WOOD           169
CARBON          49
Name: count, dtype: int64

In [69]:
# Recode Nan to No info
df['heatingType'] = df['heatingType'].fillna('No info')

In [70]:
# Drop variables used to fill heatingType
dropVar += ["hasHeatPump", "hasPhotovoltaicPanels", "hasThermicPanels"]
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface',
 'hasOffice',
 'kitchenSurface',
 'gardenSurface',
 'gardenOrientation',
 'terraceSurface',
 'terraceOrientation',
 'hasArmoredDoor',
 'hasVisiophone',
 'hasAirConditioning',
 'hasFireplace',
 'hasHeatPump',
 'hasPhotovoltaicPanels',
 'hasThermicPanels']

### BUILDINGCONDITION & BUILDINGCONSTRUCTIONYEAR



In [71]:
Missing(df[['buildingCondition', 'buildingConstructionYear']])

Count of missing values:
buildingCondition           18381
buildingConstructionYear    27268
dtype: int64

Percentage of missing values:
buildingCondition          24.07
buildingConstructionYear   35.71
dtype: float64 



In [72]:
df['buildingCondition'] = df['buildingCondition'].fillna('No info')

In [73]:
# Discard the variables
dropVar += ['buildingConstructionYear']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface',
 'hasOffice',
 'kitchenSurface',
 'gardenSurface',
 'gardenOrientation',
 'terraceSurface',
 'terraceOrientation',
 'hasArmoredDoor',
 'hasVisiophone',
 'hasAirConditioning',
 'hasFireplace',
 'hasHeatPump',
 'hasPhotovoltaicPanels',
 'hasThermicPanels',
 'buildingConstructionYear']

### FACEDECOUNT, FLOORCOUNT, STREETFACADEWIDTH

In [74]:
Missing(df[['facedeCount', 'floorCount', 'streetFacadeWidth']])


Count of missing values:
facedeCount          23136
floorCount           38827
streetFacadeWidth    60860
dtype: int64

Percentage of missing values:
facedeCount         30.29
floorCount          50.84
streetFacadeWidth   79.69
dtype: float64 



In [75]:
# There is a high amount of Nan
# and variables less relevant
# Drop variables
dropVar += ['facedeCount', 'floorCount', 'streetFacadeWidth']
dropVar

['monthlyCost',
 'hasBalcony',
 'accessibleDisabledPeople',
 'Unnamed: 0',
 'id',
 'url',
 'locality',
 'roomCount',
 'hasDressingRoom',
 'diningRoomSurface',
 'hasDiningRoom',
 'hasLivingRoom',
 'livingRoomSurface',
 'hasOffice',
 'kitchenSurface',
 'gardenSurface',
 'gardenOrientation',
 'terraceSurface',
 'terraceOrientation',
 'hasArmoredDoor',
 'hasVisiophone',
 'hasAirConditioning',
 'hasFireplace',
 'hasHeatPump',
 'hasPhotovoltaicPanels',
 'hasThermicPanels',
 'buildingConstructionYear',
 'facedeCount',
 'floorCount',
 'streetFacadeWidth']

### HABITABLE SURFACE

In [76]:
Missing(df['habitableSurface'])

Count of missing values:
8590

Percentage of missing values:
11.25 



In [77]:
# Very relevant, recode Nan to No info
df['habitableSurface'] = df['habitableSurface'].fillna('No info')

### LAND SURFACE

In [78]:
Missing(df['landSurface'])

Count of missing values:
36833

Percentage of missing values:
48.230000000000004 



In [79]:
# Many Nan but expected as Apartments should have No land
df['landSurface'] = df['landSurface'].astype('object')
df.loc[(df['type'] == 'Apartment') & (df['landSurface'].isna()), 'landSurface'] = "Apt: no land"

# For houses, fillNA with No info on land
df.loc[(df['type'] == 'House') & (df['landSurface'].isna()), 'landSurface'] = "No info on land"

### EPC SCORE

In [80]:
Missing(df['epcScore'])

Count of missing values:
11966

Percentage of missing values:
15.67 



In [81]:
df['epcScore'].value_counts()

epcScore
B      13125
C      11920
D       9875
F       8531
A       8460
E       6488
G       4263
A+      1445
A++      276
G_C        3
F_D        3
E_C        2
C_B        2
E_D        2
F_C        2
C_A        2
G_F        2
G_E        1
D_C        1
X          1
Name: count, dtype: int64

In [82]:
# epcScore is relevant
# It has some wrong values and 15% nan: replace with No info
correct = ['A', 'A+', 'A++', 'B', 'C', 'D', 'E', 'F', 'G']
df['epcScore'] = df['epcScore'].where(df['epcScore'].isin(correct), 'No info')

## OUTPUT DATA

In [83]:
print(f"Cleaning ends by dropping {len(dropVar)} columns")

Cleaning ends by dropping 30 columns


In [84]:
# Drop the colums
df = df.drop(dropVar, axis=1)

In [85]:
# Check no missing values
Missing(df)

Good news! There are no missing values.


In [86]:
# Split the variables into numerical and categorical cols, will be useful later for the analysis
numeric_cols = df.select_dtypes(include=np.number).columns
numeric_cols

categoric_cols = df.select_dtypes(exclude=np.number).columns
categoric_cols

Index(['hasAttic', 'hasBasement', 'hasGarden', 'parkingCountIndoor',
       'parkingCountOutdoor', 'hasSwimmingPool', 'hasTerrace', 'price'],
      dtype='object')

Index(['type', 'subtype', 'bedroomCount', 'bathroomCount', 'region',
       'province', 'postCode', 'habitableSurface', 'buildingCondition',
       'hasLift', 'floodZoneType', 'heatingType', 'kitchenType', 'landSurface',
       'toiletCount', 'epcScore'],
      dtype='object')

In [87]:
# Reduce the memory used
MemOptimisation(df)


Amount of memory used by all attributes: 72194385

<class 'pandas.core.frame.DataFrame'>
Index: 76370 entries, 0 to 80367
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   type                 76370 non-null  category
 1   subtype              76370 non-null  category
 2   bedroomCount         76370 non-null  category
 3   bathroomCount        76370 non-null  category
 4   region               76370 non-null  category
 5   province             76370 non-null  category
 6   postCode             76370 non-null  category
 7   habitableSurface     76370 non-null  category
 8   hasAttic             76370 non-null  int8    
 9   hasBasement          76370 non-null  int8    
 10  buildingCondition    76370 non-null  category
 11  hasLift              76370 non-null  category
 12  floodZoneType        76370 non-null  category
 13  heatingType          76370 non-null  category
 14  kitchenType          76

In [88]:
# Display info about the data remaininig
Descriptives(df)

Looking at the shape to see the number of records: (76370, 24)

Description of the file to check values range:
                       count      mean       std     min       25%       50%       75%         max
hasAttic            76370.00      0.16      0.37    0.00      0.00      0.00      0.00        1.00
hasBasement         76370.00      0.38      0.49    0.00      0.00      0.00      1.00        1.00
hasGarden           76370.00      0.21      0.41    0.00      0.00      0.00      0.00        1.00
parkingCountIndoor  76370.00      2.39    261.48    0.00      0.00      0.00      1.00    50000.00
parkingCountOutdoor 76370.00      1.97    365.51    0.00      0.00      0.00      0.00   101010.00
hasSwimmingPool     76370.00      0.02      0.15    0.00      0.00      0.00      0.00        1.00
hasTerrace          76370.00      0.62      0.49    0.00      0.00      1.00      1.00        1.00
price               76370.00 447606.06 511564.94 3141.00 230000.00 329900.00 479000.00 15000000.0

In [89]:
# Generate the report
profile = ProfileReport(df,title="Immoweb: Data Profile")

# Save the report to .html
profile.to_file("Output/Immoweb_cleaned - data profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:01<00:00, 22.57it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [90]:
# Save the data cleaned
df.to_csv("Data/Cleaned - immoweb-dataset.csv", index=False, encoding="utf-8-sig")
print("Cleaned data is saved to: Data/Cleaned - immoweb-dataset.csv")


Cleaned data is saved to: Data/Cleaned - immoweb-dataset.csv
