# Cleaning

### Data exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from functions import *

In [3]:
# Open the raw datasets
# Immo datas
immo = pd.read_csv('../../Datasets/raw_immo_scrap.csv', sep=',', na_values=('None', 'no price'))

# localiy to commune
PC_commune = pd.read_csv('../../Datasets/liste-des-codes-postaux-belges-fr.csv', sep=';')

# Communes with polygons
commune_data = pd.read_csv('../../Datasets/communes-belges-2019.csv', sep = ';')
# https://public.opendatasoft.com/explore/dataset/communes-belges-2019/table/

In [4]:
immo.shape

(52077, 20)

In [5]:
immo

Unnamed: 0,locality,type_of_property,subtype_of_property,price,type_of_sale,number_of_rooms,house_area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,surface_of_the_plot_of_land,number_of_facades,swimming_pool,state_of_the_building,construction_year
0,1050,house,house,340000.0,for sale,6.0,203.0,1,,0,1,,0,,95.0,,2.0,0,to be done up,1901.0
1,1880,house,villa,525000.0,for sale,6.0,250.0,1,,0,1,40.0,1,430.0,826.0,,4.0,0,as new,1992.0
2,4900,house,exceptional property,550000.0,for sale,11.0,475.0,1,,0,1,,1,1400.0,1543.0,,4.0,0,good,1853.0
3,7912,house,villa,550000.0,for sale,4.0,325.0,1,,0,1,125.0,1,2333.0,3570.0,,4.0,0,good,1918.0
4,6032,house,house,550000.0,for sale,5.0,400.0,1,,0,1,80.0,1,500.0,616.0,,3.0,0,as new,1977.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52072,3120,house,house,455496.0,for sale,3.0,,0,,0,0,,0,,695.0,,,0,,
52073,1800,house,house,451650.0,for sale,3.0,,0,,0,0,,0,,550.0,,3.0,0,,
52074,2018,house,house,488000.0,for sale,3.0,145.0,0,,0,0,,1,48.0,0.0,,3.0,0,,
52075,9140,house,house,455000.0,for sale,3.0,,0,,0,0,,0,,1202.0,,4.0,0,,


In [6]:
immo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52077 entries, 0 to 52076
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   locality                     52077 non-null  int64  
 1   type_of_property             52077 non-null  object 
 2   subtype_of_property          52077 non-null  object 
 3   price                        52007 non-null  float64
 4   type_of_sale                 52077 non-null  object 
 5   number_of_rooms              52075 non-null  float64
 6   house_area                   43445 non-null  float64
 7   fully_equipped_kitchen       52077 non-null  int64  
 8   furnished                    0 non-null      float64
 9   open_fire                    52077 non-null  int64  
 10  terrace                      52077 non-null  int64  
 11  terrace_area                 17518 non-null  float64
 12  garden                       52077 non-null  int64  
 13  garden_area     

In [7]:
immo.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
locality,52077,,,,5406.22,2958.59,1000.0,2600.0,5060.0,8430.0,9992.0
type_of_property,52077,2.0,house,29353.0,,,,,,,
subtype_of_property,52077,24.0,house,22100.0,,,,,,,
price,52007,,,,305756.0,167907.0,1000.0,195000.0,267000.0,370000.0,950000.0
type_of_sale,52077,1.0,for sale,52077.0,,,,,,,
number_of_rooms,52075,,,,2.81458,2.20298,0.0,2.0,3.0,3.0,204.0
house_area,43445,,,,153.721,183.355,1.0,90.0,128.0,185.0,31700.0
fully_equipped_kitchen,52077,,,,0.652284,0.47625,0.0,0.0,1.0,1.0,1.0
furnished,0,,,,,,,,,,
open_fire,52077,,,,0.0469497,0.211533,0.0,0.0,0.0,0.0,1.0


In [8]:
immo.dtypes

locality                         int64
type_of_property                object
subtype_of_property             object
price                          float64
type_of_sale                    object
number_of_rooms                float64
house_area                     float64
fully_equipped_kitchen           int64
furnished                      float64
open_fire                        int64
terrace                          int64
terrace_area                   float64
garden                           int64
garden_area                    float64
surface_of_the_land            float64
surface_of_the_plot_of_land    float64
number_of_facades              float64
swimming_pool                    int64
state_of_the_building           object
construction_year              float64
dtype: object

In [9]:
immo.isnull().sum()

locality                           0
type_of_property                   0
subtype_of_property                0
price                             70
type_of_sale                       0
number_of_rooms                    2
house_area                      8632
fully_equipped_kitchen             0
furnished                      52077
open_fire                          0
terrace                            0
terrace_area                   34559
garden                             0
garden_area                    43624
surface_of_the_land            22724
surface_of_the_plot_of_land    52077
number_of_facades              13650
swimming_pool                      0
state_of_the_building          13586
construction_year              21369
dtype: int64

### Cleaning

###### Drop useless columns (not filled/all the same)

In [10]:
immo = immo.drop(["furnished", "surface_of_the_plot_of_land", "type_of_sale"], axis=1)

###### Drop rows with null in essential columns

In [11]:
immo.dropna(axis=0, inplace=True, subset=['house_area'])
immo.dropna(axis=0, inplace=True, subset=['price'])
immo.isna().sum()

locality                      0
type_of_property              0
subtype_of_property           0
price                         0
number_of_rooms               2
house_area                    0
fully_equipped_kitchen        0
open_fire                     0
terrace                       0
terrace_area              26488
garden                        0
garden_area               35339
surface_of_the_land       20649
number_of_facades         11064
swimming_pool                 0
state_of_the_building     10615
construction_year         16527
dtype: int64

###### Replace some non essential unknow values by 0, we will rework them after

In [12]:
immo['terrace_area'] = immo['terrace_area'].fillna(0)
immo['garden_area'] = immo['garden_area'].fillna(0)
immo['surface_of_the_land'] = immo['surface_of_the_land'].fillna(0)

###### Drop Duplicates

In [13]:
immo.shape

(43401, 17)

In [14]:
immo = immo.drop_duplicates()

In [15]:
immo.shape

(42368, 17)

###### Setting the type

In [16]:
immo.dtypes

locality                    int64
type_of_property           object
subtype_of_property        object
price                     float64
number_of_rooms           float64
house_area                float64
fully_equipped_kitchen      int64
open_fire                   int64
terrace                     int64
terrace_area              float64
garden                      int64
garden_area               float64
surface_of_the_land       float64
number_of_facades         float64
swimming_pool               int64
state_of_the_building      object
construction_year         float64
dtype: object

In [17]:
immo['price'] = immo['price'].astype('int64')
immo['number_of_rooms'] = immo['number_of_rooms'].astype('Int64')
immo['house_area'] = immo['house_area'].astype('int64')
immo['terrace_area'] = immo['terrace_area'].astype('Int64')
immo['garden_area'] = immo['garden_area'].astype('Int64')
immo['surface_of_the_land'] = immo['surface_of_the_land'].astype('Int64')
immo['number_of_facades'] = immo['number_of_facades'].astype('Int64')
immo['construction_year'] = immo['construction_year'].astype('Int64')

In [18]:
immo.dtypes

locality                   int64
type_of_property          object
subtype_of_property       object
price                      int64
number_of_rooms            Int64
house_area                 int64
fully_equipped_kitchen     int64
open_fire                  int64
terrace                    int64
terrace_area               Int64
garden                     int64
garden_area                Int64
surface_of_the_land        Int64
number_of_facades          Int64
swimming_pool              int64
state_of_the_building     object
construction_year          Int64
dtype: object

###### Rename locality as we will have another locality

In [19]:
immo.rename(columns={'locality': 'zip'}, inplace=True)

###### Decision : wich variables are essentials ?

Arbitrarily, I decided that the locality, the price, the house area, the surface of the land and the state of the building are essentials.
Others columns will be cleaned on demand only, but these 5 will always be cleaned (but some ouliers will be cut off based on other categories).

###### Locality

In [26]:
PC_commune

Unnamed: 0,Code postal,Localité,Sous-commune,Commune principale,Province
0,5670,Mazée,Oui,VIROINVAL,Namur
1,1860,MEISE,Non,MEISE,Brabant Flamand
2,9700,Melden,Oui,OUDENAARDE,Flandre-Orientale
3,1370,Mélin,Oui,JODOIGNE,Brabant Wallon
4,4520,Moha,Oui,WANZE,Liège
...,...,...,...,...,...
2820,8400,Zandvoorde,Oui,OOSTENDE,Flandre-Occidentale
2821,3800,Zepperen,Oui,SINT-TRUIDEN,Limbourg
2822,2230,HERSELT,Non,HERSELT,Anvers
2823,6987,Hodister,Oui,RENDEUX,Luxembourg


In [27]:
PC_commune.rename(columns={'Code postal' : 'zip',
                           'Localité' : 'locality',
                        'Commune principale': 'commune',
                        'Province' : 'province'}, inplace=True)

In [28]:
PC_commune = PC_commune.drop(['Sous-commune'], axis=1)

In [29]:
PC_commune.isnull().sum()

zip          0
locality     0
commune     44
province    44
dtype: int64

In [30]:
PC_commune[PC_commune['commune'].isnull()]

Unnamed: 0,zip,locality,commune,province
115,1733,HighCo DATA,,
219,5012,Parlement Wallon,,
239,1049,Union Européenne - Commission,,
240,1048,Union Européenne - Conseil,,
315,5589,Jemelle,,
739,1804,Cargovil,,
796,1934,Office Exchange Brussels Airport Remailing,,
838,1008,Chambre des Représentants,,
890,1031,Organisations Sociales Chrétiennes,,
909,1011,Vlaams parlement,,


In [31]:
PC_commune.dropna(axis=0, inplace=True, subset=['commune'])
PC_commune.reset_index(drop=True, inplace=True)

In [32]:
PC_commune['commune'].value_counts()

TOURNAI               30
NAMUR                 25
TONGEREN              19
ATH                   19
MONS                  19
                      ..
EDEGEM                 1
SCHOTEN                1
EEKLO                  1
VORSELAAR              1
RHODE-SAINT-GENÈSE     1
Name: commune, Length: 581, dtype: int64

In [33]:
PC_commune[PC_commune['locality'] == "SAINT-NICOLAS"]

Unnamed: 0,zip,locality,commune,province
688,4420,SAINT-NICOLAS,SAINT-NICOLAS,Liège


In [34]:
PC_commune.iloc[688, [0]] = 'Saint-Nicolas'

In [36]:
immo = pd.merge(immo, PC_commune, on=['zip'], how='inner')

In [37]:
immo

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province
0,1050,house,house,340000,6,203,1,0,1,0,0,0,95,2,0,to be done up,1901,IXELLES,IXELLES,Bruxelles (19 communes)
1,1050,house,mixed use building,520000,4,200,0,0,0,0,0,0,69,2,0,to renovate,1940,IXELLES,IXELLES,Bruxelles (19 communes)
2,1050,house,house,599000,4,160,1,0,1,0,1,55,100,2,0,to be done up,1898,IXELLES,IXELLES,Bruxelles (19 communes)
3,1050,house,house,599000,3,160,1,0,1,15,1,60,130,2,0,good,1953,IXELLES,IXELLES,Bruxelles (19 communes)
4,1050,house,house,575000,3,171,0,0,0,0,0,0,46,2,0,just renovated,,IXELLES,IXELLES,Bruxelles (19 communes)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121066,1472,house,villa,475000,5,215,1,0,1,0,0,0,1550,,1,good,,Vieux-Genappe,GENAPPE,Brabant Wallon
121067,1461,house,villa,499000,5,275,1,0,1,0,1,0,1561,4,0,,1983,Haut-Ittre,ITTRE,Brabant Wallon
121068,6686,house,chalet,495000,4,227,1,0,1,35,1,4400,4446,4,0,good,1983,Flamierge,BERTOGNE,Luxembourg
121069,1761,house,villa,495000,4,235,1,0,0,0,1,0,488,4,0,,2020,Borchtlombeek,ROOSDAAL,Brabant Flamand


In [38]:
immo['region'] = immo['province'].apply(get_region)

In [39]:
immo['commune'].value_counts()

KNOKKE-HEIST    2791
OUDENAARDE      2520
TONGEREN        2508
LIÈGE           2471
OOSTENDE        2271
                ... 
FLOBECQ            4
RUMES              4
AS                 3
KALMTHOUT          3
HERSTAPPE          1
Name: commune, Length: 580, dtype: int64

###### Rank the communes by mean(price)

In [40]:
Dict_city_rank_price = immo.groupby('commune')['price'].mean().sort_values().rank().to_dict()

In [41]:
Dict_city_rank_price

{'QUIÉVRAIN': 1.0,
 'DOUR': 2.0,
 'MOMIGNIES': 3.0,
 'FROIDCHAPELLE': 4.0,
 'ANTOING': 5.0,
 'COLFONTAINE': 6.0,
 'LIMBOURG': 7.0,
 'AMBLÈVE': 8.0,
 'QUAREGNON': 9.0,
 'FRAMERIES': 10.0,
 'RENDEUX': 11.0,
 'SAINT-NICOLAS': 12.0,
 'VIROINVAL': 13.0,
 'ERQUELINNES': 14.0,
 'BOUSSU': 15.0,
 'HASTIÈRE': 16.0,
 'CERFONTAINE': 17.0,
 'FARCIENNES': 18.0,
 'CHARLEROI': 19.0,
 'HENSIES': 20.0,
 'DISON': 21.0,
 'FLORENVILLE': 22.0,
 'SIVRY-RANCE': 23.0,
 'ENGIS': 24.0,
 'MERBES-LE-CHÂTEAU': 25.0,
 'CHÂTELET': 26.0,
 'AMAY': 27.0,
 'SERAING': 28.0,
 'BINCHE': 29.0,
 'BEAUMONT': 30.0,
 'FLEURUS': 31.0,
 'SAINT-HUBERT': 32.0,
 'WALCOURT': 33.0,
 'ANDERLUES': 34.0,
 'MORLANWELZ': 35.0,
 'LA LOUVIÈRE': 36.0,
 'BOUILLON': 37.0,
 'MONS': 38.0,
 'PHILIPPEVILLE': 39.0,
 'SAINT-GHISLAIN': 40.0,
 'BEAURAING': 41.0,
 'COURCELLES': 42.0,
 'ANTHISNES': 43.0,
 'CHIÈVRES': 44.0,
 'GRÂCE-HOLLOGNE': 45.0,
 'ROUVROY': 46.0,
 'TROOZ': 47.0,
 'MANAGE': 48.0,
 "FONTAINE-L'EVÊQUE": 49.0,
 'HOTTON': 50.0,
 'HEUVELLAND'

###### Subtype of property

In [42]:
# Subtype of property is related to the property's type.

print_unique_dtype(immo, 'subtype_of_property')
immo.subtype_of_property.value_counts()

24 - object


house                   54101
apartment               37592
villa                    7449
duplex                   3596
ground floor             2693
mixed use building       2264
apartment block          2074
penthouse                1810
flat studio              1429
exceptional property     1292
mansion                  1259
country cottage          1078
service flat              820
town house                795
bungalow                  681
loft                      534
chalet                    513
farmhouse                 424
manor house               235
other property            166
triplex                   152
kot                        84
castle                     27
pavilion                    3
Name: subtype_of_property, dtype: int64

In [43]:
# Drop castle & pavilion.
immo = immo[(immo['subtype_of_property'] != 'castle') & 
        (immo['subtype_of_property'] != 'pavilion') & 
        (immo['subtype_of_property'] != 'apartment block')]

print_unique_dtype(immo, 'subtype_of_property')
immo.shape

21 - object


(118967, 21)

###### Price

In [44]:
print_unique_dtype(immo, 'price')

257 - int64


In [45]:
immo.price.describe()

count    118967.000000
mean     296958.438306
std      159604.920651
min        2500.000000
25%      193000.000000
50%      260000.000000
75%      350000.000000
max      950000.000000
Name: price, dtype: float64

In [46]:
immo[immo['price'] == 2500]

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region
22343,1400,apartment,apartment,2500,3,90,1,0,1,4,...,0,0,4,0,as new,1978,Monstreux,NIVELLES,Brabant Wallon,Région flamande
22344,1400,apartment,apartment,2500,3,90,1,0,1,4,...,0,0,4,0,as new,1978,NIVELLES,NIVELLES,Brabant Wallon,Région flamande
96474,2530,house,exceptional property,2500,5,600,1,1,1,60,...,1750,1750,4,0,as new,1842,BOECHOUT,BOECHOUT,Anvers,Région flamande
109180,6960,house,other property,2500,3,154,1,1,1,18,...,3000,3000,4,0,as new,2006,Vaux-Chavanne,MANHAY,Luxembourg,Région wallonne
109181,6960,house,other property,2500,3,154,1,1,1,18,...,3000,3000,4,0,as new,2006,MANHAY,MANHAY,Luxembourg,Région wallonne
109182,6960,house,other property,2500,3,154,1,1,1,18,...,3000,3000,4,0,as new,2006,Odeigne,MANHAY,Luxembourg,Région wallonne
109183,6960,house,other property,2500,3,154,1,1,1,18,...,3000,3000,4,0,as new,2006,Harre,MANHAY,Luxembourg,Région wallonne
109184,6960,house,other property,2500,3,154,1,1,1,18,...,3000,3000,4,0,as new,2006,Dochamps,MANHAY,Luxembourg,Région wallonne
109185,6960,house,other property,2500,3,154,1,1,1,18,...,3000,3000,4,0,as new,2006,Malempré,MANHAY,Luxembourg,Région wallonne
109186,6960,house,other property,2500,3,154,1,1,1,18,...,3000,3000,4,0,as new,2006,Grandménil,MANHAY,Luxembourg,Région wallonne


In [47]:
# Remove too much lower price, taking in consideration the house area
immo = immo[(immo['price'] > 10000)]
immo = immo[(immo['price'] > 35000) | (immo['house_area'] < 80)]

In [48]:
immo.price.describe()

count    118726.000000
mean     297515.466368
std      159286.249961
min       11825.000000
25%      194500.000000
50%      260000.000000
75%      350000.000000
max      950000.000000
Name: price, dtype: float64

In [49]:
immo[immo['price'] < 30000]

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region
4068,4000,apartment,ground floor,28000,2,25,0,0,0,0,...,0,0,1,0,,,Glain,LIÈGE,Liège,Région wallonne
4069,4000,apartment,ground floor,28000,2,25,0,0,0,0,...,0,0,1,0,,,LIÈGE,LIÈGE,Liège,Région wallonne
4070,4000,apartment,ground floor,28000,2,25,0,0,0,0,...,0,0,1,0,,,Rocourt,LIÈGE,Liège,Région wallonne
23317,8301,apartment,flat studio,20000,0,42,0,0,0,0,...,0,0,2,1,to renovate,1971.0,Ramskapelle,KNOKKE-HEIST,Flandre-Occidentale,Région flamande
23318,8301,apartment,flat studio,20000,0,42,0,0,0,0,...,0,0,2,1,to renovate,1971.0,Heist-Aan-Zee,KNOKKE-HEIST,Flandre-Occidentale,Région flamande
24640,8400,apartment,apartment,25000,2,72,1,0,1,5,...,0,0,2,0,good,1994.0,Stene,OOSTENDE,Flandre-Occidentale,Région flamande
24641,8400,apartment,apartment,25000,2,72,1,0,1,5,...,0,0,2,0,good,1994.0,OOSTENDE,OOSTENDE,Flandre-Occidentale,Région flamande
24642,8400,apartment,apartment,25000,2,72,1,0,1,5,...,0,0,2,0,good,1994.0,Zandvoorde,OOSTENDE,Flandre-Occidentale,Région flamande
86546,6740,apartment,kot,14500,1,28,1,0,0,0,...,0,0,4,0,good,2015.0,ETALLE,ETALLE,Luxembourg,Région wallonne
86547,6740,apartment,kot,14500,1,28,1,0,0,0,...,0,0,4,0,good,2015.0,Villers-Sur-Semois,ETALLE,Luxembourg,Région wallonne


###### Number of rooms

In [50]:
immo.number_of_rooms.describe()

count    118723.000000
mean          2.871609
std           2.107251
min           0.000000
25%           2.000000
50%           3.000000
75%           3.000000
max         204.000000
Name: number_of_rooms, dtype: float64

In [51]:
immo.number_of_rooms.value_counts()

3      40110
2      37856
4      18216
1      10547
5       6277
6       2494
0       1654
7        721
8        363
9        137
10       120
11        79
12        31
15        31
14        22
13        18
20        12
16         9
24         7
30         5
204        5
165        4
23         4
18         1
Name: number_of_rooms, dtype: Int64

In [52]:
# Remove number_of_rooms >= 165
immo = immo[immo['number_of_rooms'] < 165]
immo.number_of_rooms.value_counts()

3     40110
2     37856
4     18216
1     10547
5      6277
6      2494
0      1654
7       721
8       363
9       137
10      120
11       79
12       31
15       31
14       22
13       18
20       12
16        9
24        7
30        5
23        4
18        1
Name: number_of_rooms, dtype: Int64

###### House Area

In [53]:
immo.house_area.value_counts()

150     2506
120     2491
100     2375
140     2293
90      2271
        ... 
613        1
710        1
551        1
488        1
1407       1
Name: house_area, Length: 676, dtype: int64

In [54]:
immo.house_area.describe()

count    118714.000000
mean        159.199884
std         138.829265
min           1.000000
25%          95.000000
50%         135.000000
75%         191.000000
max       31700.000000
Name: house_area, dtype: float64

In [55]:
immo[immo['house_area'] < 15]

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region
5445,4540,apartment,apartment,185000,2,11,0,0,1,5,...,0,0,3.0,0,as new,,Ombret,AMAY,Liège,Région wallonne
5446,4540,apartment,apartment,185000,2,11,0,0,1,5,...,0,0,3.0,0,as new,,Flône,AMAY,Liège,Région wallonne
5447,4540,apartment,apartment,185000,2,11,0,0,1,5,...,0,0,3.0,0,as new,,Ampsin,AMAY,Liège,Région wallonne
5448,4540,apartment,apartment,185000,2,11,0,0,1,5,...,0,0,3.0,0,as new,,Jehay,AMAY,Liège,Région wallonne
5449,4540,apartment,apartment,185000,2,11,0,0,1,5,...,0,0,3.0,0,as new,,AMAY,AMAY,Liège,Région wallonne
26368,8400,apartment,apartment,425000,2,1,0,0,1,0,...,0,0,2.0,0,good,1961.0,Stene,OOSTENDE,Flandre-Occidentale,Région flamande
26369,8400,apartment,apartment,425000,2,1,0,0,1,0,...,0,0,2.0,0,good,1961.0,OOSTENDE,OOSTENDE,Flandre-Occidentale,Région flamande
26370,8400,apartment,apartment,425000,2,1,0,0,1,0,...,0,0,2.0,0,good,1961.0,Zandvoorde,OOSTENDE,Flandre-Occidentale,Région flamande
40705,2800,apartment,kot,99000,1,14,0,0,0,0,...,0,0,2.0,0,,2014.0,Walem,MECHELEN,Anvers,Région flamande
40706,2800,apartment,kot,99000,1,14,0,0,0,0,...,0,0,2.0,0,,2014.0,MECHELEN,MECHELEN,Anvers,Région flamande


In [56]:
immo[immo['house_area'] > 1500]

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region
24383,9600,house,house,650000,1,1640,0,0,0,0,...,0,1640,3.0,0,good,1991.0,RENAIX,RENAIX,Flandre-Orientale,Région flamande
26623,8400,house,house,219000,4,2019,0,0,0,0,...,0,165,2.0,0,,,Stene,OOSTENDE,Flandre-Occidentale,Région flamande
26624,8400,house,house,219000,4,2019,0,0,0,0,...,0,165,2.0,0,,,OOSTENDE,OOSTENDE,Flandre-Occidentale,Région flamande
26625,8400,house,house,219000,4,2019,0,0,0,0,...,0,165,2.0,0,,,Zandvoorde,OOSTENDE,Flandre-Occidentale,Région flamande
46420,7050,house,mixed use building,175000,2,1700,0,0,0,0,...,0,1700,4.0,0,to renovate,,JURBISE,JURBISE,Hainaut,Région wallonne
46421,7050,house,mixed use building,175000,2,1700,0,0,0,0,...,0,1700,4.0,0,to renovate,,Herchies,JURBISE,Hainaut,Région wallonne
46422,7050,house,mixed use building,175000,2,1700,0,0,0,0,...,0,1700,4.0,0,to renovate,,Erbisoeul,JURBISE,Hainaut,Région wallonne
46423,7050,house,mixed use building,175000,2,1700,0,0,0,0,...,0,1700,4.0,0,to renovate,,Masnuy-Saint-Jean,JURBISE,Hainaut,Région wallonne
46424,7050,house,mixed use building,175000,2,1700,0,0,0,0,...,0,1700,4.0,0,to renovate,,Masnuy-Saint-Pierre,JURBISE,Hainaut,Région wallonne
46425,7050,house,mixed use building,175000,2,1700,0,0,0,0,...,0,1700,4.0,0,to renovate,,Erbaut,JURBISE,Hainaut,Région wallonne


In [57]:
# Drop too small and too big house_area, taking the price in consideration
immo = immo[(immo['house_area'] > 10) & 
            ((immo['house_area'] < 1000) | ((immo['house_area'] > 1000) & (immo['price'] > 390000)))]

In [58]:
immo.shape

(118646, 21)

###### Surface of the Land

In [59]:
immo.surface_of_the_land.describe()

count    118646.000000
mean        684.000877
std        4434.701449
min           0.000000
25%           0.000000
50%         118.000000
75%         563.750000
max      400000.000000
Name: surface_of_the_land, dtype: float64

In [60]:
immo[immo['surface_of_the_land'] == 0]

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region
5,1050,house,house,590000,4,225,0,0,1,0,...,0,0,2,0,to renovate,,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
6,1050,house,house,575000,4,209,1,0,0,0,...,0,0,2,0,,,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
16,1050,house,house,685000,6,280,1,0,0,0,...,0,0,2,0,,1956,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
25,1050,house,house,795000,4,240,1,0,1,8,...,59,0,2,0,good,1907,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
30,1050,house,house,795000,8,240,1,0,0,0,...,0,0,2,0,good,,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120954,5022,house,house,240000,2,230,0,0,0,0,...,0,0,3,0,as new,1885,Cognelée,NAMUR,Namur,Région wallonne
120968,6221,house,house,210000,2,131,1,0,1,0,...,0,0,3,0,good,,Saint-Amand,FLEURUS,Hainaut,Région wallonne
121001,6723,house,house,385000,3,201,0,0,1,25,...,111,0,2,0,as new,2020,Habay-La-Vieille,HABAY,Luxembourg,Région wallonne
121058,4342,house,house,425000,3,315,1,0,1,124,...,250,0,3,0,,2002,Hognoul,AWANS,Liège,Région wallonne


In [61]:
immo[(immo['surface_of_the_land']<2) & (immo['garden_area']>2)]

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region
25,1050,house,house,795000,4,240,1,0,1,8,...,59,0,2,0,good,1907,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
69,1050,apartment,flat studio,220000,0,49,1,0,0,0,...,23,0,2,0,as new,2019,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
74,1050,apartment,flat studio,220000,0,49,1,0,0,0,...,35,0,,0,as new,2019,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
149,1050,apartment,ground floor,395000,2,91,1,0,1,15,...,20,0,,0,good,1979,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
185,1050,apartment,ground floor,440000,1,85,0,0,1,0,...,70,0,,0,good,,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120309,4560,house,farmhouse,199000,4,195,1,0,0,0,...,500,0,3,0,to be done up,1899,Pailhe,CLAVIER,Liège,Région wallonne
120310,4560,house,farmhouse,199000,4,195,1,0,0,0,...,500,0,3,0,to be done up,1899,Ocquier,CLAVIER,Liège,Région wallonne
120311,4560,house,farmhouse,199000,4,195,1,0,0,0,...,500,0,3,0,to be done up,1899,Terwagne,CLAVIER,Liège,Région wallonne
121001,6723,house,house,385000,3,201,0,0,1,25,...,111,0,2,0,as new,2020,Habay-La-Vieille,HABAY,Luxembourg,Région wallonne


In [62]:
immototal = pd.DataFrame()
immototal = immo[(immo['surface_of_the_land']<2) & (immo['garden_area']>2)]
immototal['surface_of_the_land'] = immototal['house_area'] + immototal['terrace_area'] + immototal['garden_area']
immototal

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  immototal['surface_of_the_land'] = immototal['house_area'] + immototal['terrace_area'] + immototal['garden_area']


Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region
25,1050,house,house,795000,4,240,1,0,1,8,...,59,307,2,0,good,1907,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
69,1050,apartment,flat studio,220000,0,49,1,0,0,0,...,23,72,2,0,as new,2019,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
74,1050,apartment,flat studio,220000,0,49,1,0,0,0,...,35,84,,0,as new,2019,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
149,1050,apartment,ground floor,395000,2,91,1,0,1,15,...,20,126,,0,good,1979,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
185,1050,apartment,ground floor,440000,1,85,0,0,1,0,...,70,155,,0,good,,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120309,4560,house,farmhouse,199000,4,195,1,0,0,0,...,500,695,3,0,to be done up,1899,Pailhe,CLAVIER,Liège,Région wallonne
120310,4560,house,farmhouse,199000,4,195,1,0,0,0,...,500,695,3,0,to be done up,1899,Ocquier,CLAVIER,Liège,Région wallonne
120311,4560,house,farmhouse,199000,4,195,1,0,0,0,...,500,695,3,0,to be done up,1899,Terwagne,CLAVIER,Liège,Région wallonne
121001,6723,house,house,385000,3,201,0,0,1,25,...,111,337,2,0,as new,2020,Habay-La-Vieille,HABAY,Luxembourg,Région wallonne


In [63]:
# Taking off all rows with the surface of the land missing
immo = immo[immo['surface_of_the_land']>5]
immo.shape

(64669, 21)

In [64]:
immo = pd.concat([immo, immototal], axis=0)
immo.shape

(68844, 21)

###### State of the building

In [65]:
# State of the building data has 'None'
immo.state_of_the_building.value_counts()

good              19585
as new            15290
to renovate        6646
to be done up      6544
just renovated     3709
to restore          485
Name: state_of_the_building, dtype: int64

In [66]:
immo['state_of_the_building'] = immo['state_of_the_building'].fillna('unknown')
immo.state_of_the_building.value_counts()

good              19585
unknown           16585
as new            15290
to renovate        6646
to be done up      6544
just renovated     3709
to restore          485
Name: state_of_the_building, dtype: int64

In [67]:
immo.state_of_the_building.describe()

count     68844
unique        7
top        good
freq      19585
Name: state_of_the_building, dtype: object

###### Kitchen

###### Open Fire

###### Terrace

###### Terrace Area

In [68]:
immo.subtype_of_property.value_counts()

house                   50335
villa                    7258
apartment                2097
mixed use building       1922
exceptional property     1224
mansion                  1163
country cottage          1033
ground floor              836
town house                713
bungalow                  666
chalet                    494
farmhouse                 415
manor house               235
duplex                    228
other property            124
penthouse                  29
flat studio                21
loft                       20
service flat               17
triplex                    14
Name: subtype_of_property, dtype: int64

In [69]:
immo[immo['type_of_property'] == 'house'].subtype_of_property.value_counts()

house                   50335
villa                    7258
mixed use building       1922
exceptional property     1224
mansion                  1163
country cottage          1033
town house                713
bungalow                  666
chalet                    494
farmhouse                 415
manor house               235
other property            124
Name: subtype_of_property, dtype: int64

In [70]:
immo[immo['type_of_property'] != 'house'].subtype_of_property.value_counts()

apartment       2097
ground floor     836
duplex           228
penthouse         29
flat studio       21
loft              20
service flat      17
triplex           14
Name: subtype_of_property, dtype: int64

In [71]:
immo[immo['surface_of_the_land'] == 0]

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,fully_equipped_kitchen,open_fire,terrace,terrace_area,...,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,construction_year,locality,commune,province,region


###### Garden

###### Garden Area

###### Number of facades

In [72]:
immo.number_of_facades.value_counts()

4    23027
2    18511
3    17622
1      344
Name: number_of_facades, dtype: Int64

In [73]:
immo.isna().sum()

zip                           0
type_of_property              0
subtype_of_property           0
price                         0
number_of_rooms               0
house_area                    0
fully_equipped_kitchen        0
open_fire                     0
terrace                       0
terrace_area                  0
garden                        0
garden_area                   0
surface_of_the_land           0
number_of_facades          9340
swimming_pool                 0
state_of_the_building         0
construction_year         30157
locality                      0
commune                       0
province                      0
region                        0
dtype: int64

In [74]:
# filling with median value
immo['number_of_facades'] = immo['number_of_facades'].fillna(3)
immo.number_of_facades.value_counts()

3    26962
4    23027
2    18511
1      344
Name: number_of_facades, dtype: Int64

###### Swimming pool

In [75]:
immo.swimming_pool.value_counts()

0    67004
1     1840
Name: swimming_pool, dtype: int64

###### Construction year

In [76]:
immo.shape

(68844, 21)

In [77]:
immo.reset_index(drop=True, inplace=True)
immo.shape

(68844, 21)

In [78]:
immo = immo.drop(["fully_equipped_kitchen", "open_fire", "terrace", "terrace_area", 
                  "garden", "garden_area", "construction_year"], axis=1)
immo

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,locality,commune,province,region
0,1050,house,house,340000,6,203,95,2,0,to be done up,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
1,1050,house,mixed use building,520000,4,200,69,2,0,to renovate,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
2,1050,house,house,599000,4,160,100,2,0,to be done up,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
3,1050,house,house,599000,3,160,130,2,0,good,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
4,1050,house,house,575000,3,171,46,2,0,just renovated,IXELLES,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68839,4560,house,farmhouse,199000,4,195,695,3,0,to be done up,Pailhe,CLAVIER,Liège,Région wallonne
68840,4560,house,farmhouse,199000,4,195,695,3,0,to be done up,Ocquier,CLAVIER,Liège,Région wallonne
68841,4560,house,farmhouse,199000,4,195,695,3,0,to be done up,Terwagne,CLAVIER,Liège,Région wallonne
68842,6723,house,house,385000,3,201,337,2,0,as new,Habay-La-Vieille,HABAY,Luxembourg,Région wallonne


In [79]:
immo.to_csv('../../Datasets/clean_immo.csv', index=False)