In [2]:
import pandas as pd
df_init = pd.read_csv('csv-data\\scraping_results.csv')
df_init.head()

Unnamed: 0,Locality,Zip Code,Type of Property,Subtype of Property,Price,Type of Sale,Number of Rooms,Livable Space (m2),Fully Equipped Kitchen,Furnished,...,Garden,Garden Area (m2),Swimming Pool,Surface of the Land (m2),Number of Facades,Construction Year,PEB,Primary Energy Consumption (kWh/m2),State of the Building,Url
0,Gent,9000,Apartment,apartment,229000.0,for-sale,1.0,44,0,0,...,0,,0,,2.0,1918.0,B,190.0,Good,https://www.immoweb.be/en/classified/apartment...
1,Uccle,1180,House,villa,1250000.0,for-sale,4.0,584,1,0,...,1,1085.0,0,1225.0,4.0,1932.0,G,402.0,To renovate,https://www.immoweb.be/en/classified/villa/for...
2,Etterbeek,1040,House,house,875000.0,for-sale,4.0,233,1,0,...,0,,0,112.0,3.0,1929.0,G,563.0,To be done up,https://www.immoweb.be/en/classified/house/for...
3,Elversele,9140,House,villa,825000.0,for-sale,4.0,281,0,0,...,0,,1,1450.0,4.0,2001.0,B,181.0,As new,https://www.immoweb.be/en/classified/villa/for...
4,Kalmthout,2920,House,manor-house,4750000.0,for-sale,3.0,326,0,0,...,0,,0,34799.0,4.0,2015.0,A,95.0,As new,https://www.immoweb.be/en/classified/manor-hou...


In [3]:
df_init.shape

(10875, 23)

It was noticed that some properties were presented on different web pages, so we removed these duplicates by comparing the values ​​for all columns except the URL column.

In [4]:
#Count duplicates for the same properties that have different URL.
df_init.duplicated(subset=df_init.columns.difference(['Url'])).value_counts()

False    10725
True       150
Name: count, dtype: int64

In [5]:
#Remove duplicates for the same properties that have different URL.
df_init.drop_duplicates(subset=df_init.columns.difference(['Url']), inplace=True)

In [6]:
df_init = df_init.drop(columns=['Url'])
df_init = df_init.drop(columns=['Construction Year'])
df_init = df_init.drop(columns=['Type of Sale'])
df_init.shape


(10725, 20)

**How many rows and columns?**


Initially we had a dataset with 20 columns and 10725 inputs.

In [7]:
print(df_init.info())

<class 'pandas.core.frame.DataFrame'>
Index: 10725 entries, 0 to 10874
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10725 non-null  object 
 1   Zip Code                             10725 non-null  int64  
 2   Type of Property                     10725 non-null  object 
 3   Subtype of Property                  10725 non-null  object 
 4   Price                                10701 non-null  float64
 5   Number of Rooms                      10598 non-null  float64
 6   Livable Space (m2)                   10725 non-null  int64  
 7   Fully Equipped Kitchen               10725 non-null  int64  
 8   Furnished                            10725 non-null  int64  
 9   Any Fireplace ?                      10725 non-null  int64  
 10  Terrace                              10725 non-null  int64  
 11  Terrace Area (m2)                

We had some inputs that were missing price (24 cases), so we removed cases with 0 price as they would not be useful in further modeling.

In [8]:
percentage_of_missing_price = round(df_init['Price'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing values for price: {percentage_of_missing_price}%")

Percentage of missing values for price: 0.22%


In [9]:
df = df_init.dropna(subset=['Price'], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10701 entries, 0 to 10700
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10701 non-null  object 
 1   Zip Code                             10701 non-null  int64  
 2   Type of Property                     10701 non-null  object 
 3   Subtype of Property                  10701 non-null  object 
 4   Price                                10701 non-null  float64
 5   Number of Rooms                      10574 non-null  float64
 6   Livable Space (m2)                   10701 non-null  int64  
 7   Fully Equipped Kitchen               10701 non-null  int64  
 8   Furnished                            10701 non-null  int64  
 9   Any Fireplace ?                      10701 non-null  int64  
 10  Terrace                              10701 non-null  int64  
 11  Terrace Area (m2)           

We had some inputs that were missing number of rooms, so we fill these rows with 0.

In [10]:
percentage_of_missing_room_number = round(df['Number of Rooms'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing number of rooms: {percentage_of_missing_room_number}%")

Percentage of missing number of rooms: 1.18%


In [11]:

df['Number of Rooms'] = df['Number of Rooms'].fillna(0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10701 entries, 0 to 10700
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10701 non-null  object 
 1   Zip Code                             10701 non-null  int64  
 2   Type of Property                     10701 non-null  object 
 3   Subtype of Property                  10701 non-null  object 
 4   Price                                10701 non-null  float64
 5   Number of Rooms                      10701 non-null  float64
 6   Livable Space (m2)                   10701 non-null  int64  
 7   Fully Equipped Kitchen               10701 non-null  int64  
 8   Furnished                            10701 non-null  int64  
 9   Any Fireplace ?                      10701 non-null  int64  
 10  Terrace                              10701 non-null  int64  
 11  Terrace Area (m2)           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Number of Rooms'] = df['Number of Rooms'].fillna(0)


We had many missing values ​​for columns like Garden Area, Terrace Area and Surface of the land. So we filled these missing values ​​with 0 as the fact that these areas do not exist.

In [12]:
percentage_of_missing_garden_area = round(df['Garden Area (m2)'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing garden area values: {percentage_of_missing_garden_area}%")
#print(f"Percentage of missing garden area values: {percentage_of_missing_garden_area:.2f}%")

percentage_of_missing_terrace_area = round(df['Terrace Area (m2)'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing terrace area values: {percentage_of_missing_terrace_area}%")

percentage_of_missing_land_area = round(df['Surface of the Land (m2)'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing values for surface of the land: {percentage_of_missing_land_area}%")

Percentage of missing garden area values: 69.29%
Percentage of missing terrace area values: 58.32%
Percentage of missing values for surface of the land: 34.35%


In [13]:
# Replace missing values in the 'Terrace Area (m2)' columns with 0
df['Terrace Area (m2)'] = df['Terrace Area (m2)'].fillna(0)

# Replace missing values in the 'Garden Area (m2)' columns with 0
df['Garden Area (m2)'] = df['Garden Area (m2)'].fillna(0)

# Replace missing values in the 'Surface of the Land (m2)' columns with 0
df['Surface of the Land (m2)'] = df['Surface of the Land (m2)'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Terrace Area (m2)'] = df['Terrace Area (m2)'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Garden Area (m2)'] = df['Garden Area (m2)'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Surface of the Land (m2)'] = df['Surface of the Land (m2)'].fillna(0)


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10701 entries, 0 to 10700
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10701 non-null  object 
 1   Zip Code                             10701 non-null  int64  
 2   Type of Property                     10701 non-null  object 
 3   Subtype of Property                  10701 non-null  object 
 4   Price                                10701 non-null  float64
 5   Number of Rooms                      10701 non-null  float64
 6   Livable Space (m2)                   10701 non-null  int64  
 7   Fully Equipped Kitchen               10701 non-null  int64  
 8   Furnished                            10701 non-null  int64  
 9   Any Fireplace ?                      10701 non-null  int64  
 10  Terrace                              10701 non-null  int64  
 11  Terrace Area (m2)           

We had 23.15% missing values ​​for the Number of Facades column. It was decided to modify this data according to the following rule:
1. Replace missing values in the 'Number of Facades' columns with 0
2. Replace 'Number of facades' with 1 where 'Type of Property' is apartment and 'Number of facades' is 0
3. Replace 'Number of Facades' with 2 where 'Subtype of Property' is 'duplex' or 'town-house' and 'Number of Facades' is 0
4. Replace 'Number of Facades' with 4 where 'Type of Property' is House and 'Number of Facades' is 0

In [15]:
percentage_of_missing_number_facades = round(df['Number of Facades'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing values for number of facades: {percentage_of_missing_number_facades}%")

Percentage of missing values for number of facades: 23.15%


In [16]:
# Replace missing values in the 'Number of Facades' columns with 0
df['Number of Facades'] = df['Number of Facades'].fillna(0)

# Replace 'Number of facades' with 1 where 'Type of Property' is 0 (apartment) and 'Number of facades' is 0
df.loc[(df['Type of Property'] == 0) & (df['Number of Facades'] == 'Apartment'), 'Number of Facades'] = 1

# Replace 'Number of Facades' with 2 where 'Subtype of Property' is 'duplex' or 'town-house' and 'Number of Facades' is 0
df.loc[(df['Subtype of Property'].isin(['duplex', 'town-house'])) & (df['Number of Facades'] == 0), 'Number of Facades'] = 2

# Replace remaining 'Number of Facades' 0 values for remaining houses with 4
# Replace 'Number of Facades' with 4 where 'Type of Property' is 1 (House) and 'Number of Facades' is 0
df.loc[(df['Type of Property'] == 1) & (df['Number of Facades'] == 'House'), 'Number of Facades'] = 4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Number of Facades'] = df['Number of Facades'].fillna(0)


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10701 entries, 0 to 10700
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10701 non-null  object 
 1   Zip Code                             10701 non-null  int64  
 2   Type of Property                     10701 non-null  object 
 3   Subtype of Property                  10701 non-null  object 
 4   Price                                10701 non-null  float64
 5   Number of Rooms                      10701 non-null  float64
 6   Livable Space (m2)                   10701 non-null  int64  
 7   Fully Equipped Kitchen               10701 non-null  int64  
 8   Furnished                            10701 non-null  int64  
 9   Any Fireplace ?                      10701 non-null  int64  
 10  Terrace                              10701 non-null  int64  
 11  Terrace Area (m2)           

We had 5.94% and 12.28% missing values ​​for the PEB and State of the Building columns respectively, so we replaced these cases by indicating that the information was not specified.

In [18]:
percentage_of_missing_peb = round(df['PEB'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing PEB values: {percentage_of_missing_peb}%")

percentage_of_missing_building_state = round(df['State of the Building'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing values for state of the building: {percentage_of_missing_building_state}%")

Percentage of missing PEB values: 5.94%
Percentage of missing values for state of the building: 12.28%


In [19]:
df['PEB'] = df['PEB'].fillna('Not specified')
df['State of the Building'] = df['State of the Building'].fillna('Not specified')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PEB'] = df['PEB'].fillna('Not specified')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['State of the Building'] = df['State of the Building'].fillna('Not specified')


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10701 entries, 0 to 10700
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10701 non-null  object 
 1   Zip Code                             10701 non-null  int64  
 2   Type of Property                     10701 non-null  object 
 3   Subtype of Property                  10701 non-null  object 
 4   Price                                10701 non-null  float64
 5   Number of Rooms                      10701 non-null  float64
 6   Livable Space (m2)                   10701 non-null  int64  
 7   Fully Equipped Kitchen               10701 non-null  int64  
 8   Furnished                            10701 non-null  int64  
 9   Any Fireplace ?                      10701 non-null  int64  
 10  Terrace                              10701 non-null  int64  
 11  Terrace Area (m2)           

We had 10.98% missing values for the Energy consumption column. It was decided to replace them with the mean value of the entire range.

In [21]:
percentage_of_missing_energy_consumption = round(df['Primary Energy Consumption (kWh/m2)'].isna().sum() / len(df_init) * 100, 2)
print(f"Percentage of missing values for energy consumption: {percentage_of_missing_energy_consumption}%")

Percentage of missing values for energy consumption: 10.85%


In [22]:
# Replace missing values in the 'Primary Energy Consumption (kWh/m2)' columns with mean value
df['Primary Energy Consumption (kWh/m2)'] = df['Primary Energy Consumption (kWh/m2)'].fillna(round(df['Primary Energy Consumption (kWh/m2)'].mean(),0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Primary Energy Consumption (kWh/m2)'] = df['Primary Energy Consumption (kWh/m2)'].fillna(round(df['Primary Energy Consumption (kWh/m2)'].mean(),0))


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10701 entries, 0 to 10700
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10701 non-null  object 
 1   Zip Code                             10701 non-null  int64  
 2   Type of Property                     10701 non-null  object 
 3   Subtype of Property                  10701 non-null  object 
 4   Price                                10701 non-null  float64
 5   Number of Rooms                      10701 non-null  float64
 6   Livable Space (m2)                   10701 non-null  int64  
 7   Fully Equipped Kitchen               10701 non-null  int64  
 8   Furnished                            10701 non-null  int64  
 9   Any Fireplace ?                      10701 non-null  int64  
 10  Terrace                              10701 non-null  int64  
 11  Terrace Area (m2)           

In [24]:
# Find the index of the current 'Any Fireplace ?' column
fireplaces_index = df.columns.get_loc('Any Fireplace ?')

# Create the new 'Open Fire' column based on 'Any Fireplace ?'
df['Open Fire'] = df['Any Fireplace ?'].apply(lambda x: 0 if x == 0 else 1)

# Insert the new 'Open Fire' column at the same index
df.insert(fireplaces_index, 'Open Fire', df.pop('Open Fire'))
df = df.drop(columns=['Any Fireplace ?'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Open Fire'] = df['Any Fireplace ?'].apply(lambda x: 0 if x == 0 else 1)


In [25]:
df.columns

Index(['Locality', 'Zip Code', 'Type of Property', 'Subtype of Property',
       'Price', 'Number of Rooms', 'Livable Space (m2)',
       'Fully Equipped Kitchen', 'Furnished', 'Open Fire', 'Terrace',
       'Terrace Area (m2)', 'Garden', 'Garden Area (m2)', 'Swimming Pool',
       'Surface of the Land (m2)', 'Number of Facades', 'PEB',
       'Primary Energy Consumption (kWh/m2)', 'State of the Building'],
      dtype='object')

For the Fully Equipped Kitchen column it was mentioned a value of 1 for Hyper-Equipped or Installed, and 0 for the rest.

We added a Province column using a function based on the zip (postal) code values.

In [26]:
def add_province(zip):
    first_two_digits = int(str(zip)[:2])
    if 10 <= first_two_digits <= 12:
        return "Brussels-Capital Region"
    elif 13 <= first_two_digits <= 14:
        return "Province of Walloon Brabant"
    elif 15 <= first_two_digits <= 19 or 30 <= first_two_digits <= 34:
        return "Province of Flemish Brabant"
    elif 20 <= first_two_digits <= 29:
        return "Province of Antwerp"
    elif 35 <= first_two_digits <= 39:
        return "Province of Limburg"
    elif 40 <= first_two_digits <= 49:
        return "Province of Liège"
    elif 66 <= first_two_digits <= 69:
        return "Province of Luxembourg"
    elif 50 <= first_two_digits <= 56:
        return "Province of Namur"
    elif 60 <= first_two_digits <= 65 or 70 <= first_two_digits <= 79:
        return "Province of Hainaut"
    elif 80 <= first_two_digits <= 89:
        return "Province of West Flanders"
    elif 90 <= first_two_digits <= 99:
        return "Province of East Flanders"
    else:
        return "Unknown"

df['Province'] = df['Zip Code'].apply(add_province)

zip_code_index = df.columns.get_loc('Zip Code')  # Get the index of 'Zip Code'
df.insert(zip_code_index + 1, 'Province', df.pop('Province'))  # Insert 'Province' at the correct position

In [27]:
df.columns

Index(['Locality', 'Zip Code', 'Province', 'Type of Property',
       'Subtype of Property', 'Price', 'Number of Rooms', 'Livable Space (m2)',
       'Fully Equipped Kitchen', 'Furnished', 'Open Fire', 'Terrace',
       'Terrace Area (m2)', 'Garden', 'Garden Area (m2)', 'Swimming Pool',
       'Surface of the Land (m2)', 'Number of Facades', 'PEB',
       'Primary Energy Consumption (kWh/m2)', 'State of the Building'],
      dtype='object')

In [28]:
df.shape

(10701, 21)

In [29]:
df['Zip Code'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10701 entries, 0 to 10700
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Locality                             10701 non-null  object 
 1   Zip Code                             10701 non-null  int64  
 2   Province                             10701 non-null  object 
 3   Type of Property                     10701 non-null  object 
 4   Subtype of Property                  10701 non-null  object 
 5   Price                                10701 non-null  float64
 6   Number of Rooms                      10701 non-null  float64
 7   Livable Space (m2)                   10701 non-null  int64  
 8   Fully Equipped Kitchen               10701 non-null  int64  
 9   Furnished                            10701 non-null  int64  
 10  Open Fire                            10701 non-null  int64  
 11  Terrace                     

So we have:    

7 categorical columns ('Locality', 'Zip Code', 'Province', 'Type of Property', 'Subtype of Property', 'PEB', 'State of the Building'),    

8 numericals columns ('Price', 'Number of Rooms', 'Livable Space (m2)', 'Terrace Area (m2)', 'Garden Area (m2)', 'Surface of the Land (m2)', 'Number of Facades', 'Primary Energy Consumption (kWh/m2)'), 
  
6 boolean columns ('Fully Equipped Kitchen', 'Furnished', 'Open Fire', 'Terrace', 'Garden', 'Swimming Pool').

In [30]:
# Count the number of qualitative and quantitative variables
qualitative_vars = df.select_dtypes(include='object').shape[1]
quantitative_vars = df.select_dtypes(include=['int64', 'float64']).shape[1]
print(f"Qualitative variables: {qualitative_vars}")
print(f"Quantitative variables: {quantitative_vars}")

Qualitative variables: 6
Quantitative variables: 15


In [31]:
df.describe()

Unnamed: 0,Zip Code,Price,Number of Rooms,Livable Space (m2),Fully Equipped Kitchen,Furnished,Open Fire,Terrace,Terrace Area (m2),Garden,Garden Area (m2),Swimming Pool,Surface of the Land (m2),Number of Facades,Primary Energy Consumption (kWh/m2)
count,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0,10701.0
mean,4685.047192,747139.6,3.590973,251.505,0.600037,0.03523,0.090178,0.415475,15.017568,0.305579,329.392206,0.074853,1559.534062,2.295393,317.233436
std,3254.051241,849209.4,2.175236,225.919926,0.489913,0.18437,0.286451,0.492827,62.273548,0.460674,1908.529461,0.263166,6466.72267,1.475826,1390.105382
min,1000.0,27500.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1640.0,279000.0,2.0,111.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,167.0
50%,3320.0,525000.0,3.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,240.0,2.0,257.0
75%,8400.0,850000.0,4.0,315.0,1.0,0.0,0.0,1.0,15.0,1.0,60.0,0.0,1000.0,4.0,334.0
max,9992.0,10500000.0,40.0,5596.0,1.0,1.0,1.0,1.0,3749.0,1.0,95000.0,1.0,160737.0,9.0,100000.0


In [32]:
df_statistics = df.describe(include = 'all')
print(df_statistics)

         Locality      Zip Code                 Province Type of Property  \
count       10701  10701.000000                    10701            10701   
unique       1376           NaN                       11                2   
top     Antwerpen           NaN  Brussels-Capital Region            House   
freq          429           NaN                     1952             7398   
mean          NaN   4685.047192                      NaN              NaN   
std           NaN   3254.051241                      NaN              NaN   
min           NaN   1000.000000                      NaN              NaN   
25%           NaN   1640.000000                      NaN              NaN   
50%           NaN   3320.000000                      NaN              NaN   
75%           NaN   8400.000000                      NaN              NaN   
max           NaN   9992.000000                      NaN              NaN   

       Subtype of Property         Price  Number of Rooms  Livable Space (m

We should transform some categorical values as "Province", "Property Type", "PEB", "Building Condition" into quantitive values.

In [33]:
state_mapping = {
    'To restore': 1,
    'To renovate': 2,
    'To be done up': 3,
    'Good': 4,
    'Just renovated': 5,
    'As new': 6,
    'Not specified' : 7 # For 'Not specified' a new group
}

df['State of the Building_Numeric'] = df['State of the Building'].replace(state_mapping)

  df['State of the Building_Numeric'] = df['State of the Building'].replace(state_mapping)


In [34]:
PEB_mapping = {
    'Not specified' : 8,
    'G': 7,
    'F': 6,
    'E': 5,
    'D': 4,
    'C': 3,
    'B': 2,
    'A': 1
}

df['PEB_Numeric'] = df['PEB'].replace(PEB_mapping)

  df['PEB_Numeric'] = df['PEB'].replace(PEB_mapping)


In [35]:
df['Province'].value_counts()

Province
Brussels-Capital Region        1952
Province of Antwerp            1762
Province of East Flanders      1719
Province of West Flanders      1645
Province of Flemish Brabant    1093
Province of Liège               865
Province of Walloon Brabant     589
Province of Luxembourg          331
Province of Hainaut             325
Province of Namur               244
Province of Limburg             176
Name: count, dtype: int64

In [36]:
province_mapping = {
    'Brussels-Capital Region': 1,
    'Province of Antwerp': 2,
    'Province of East Flanders': 3,
    'Province of West Flanders': 4,
    'Province of Flemish Brabant': 5,
    'Province of Liège': 6,
    'Province of Walloon Brabant': 7,
    'Province of Luxembourg': 8,
    'Province of Hainaut': 9,
    'Province of Namur': 10,
    'Province of Limburg': 11,
}

df['Province_Numeric'] = df['Province'].replace(province_mapping)

  df['Province_Numeric'] = df['Province'].replace(province_mapping)


In [37]:
df['Type of Property_Numeric'] = df['Type of Property'].replace({'House': 1, 'Apartment': 0}).infer_objects(copy=False)

  df['Type of Property_Numeric'] = df['Type of Property'].replace({'House': 1, 'Apartment': 0}).infer_objects(copy=False)


In [38]:
df['Subtype of Property'].value_counts()

subtype_group_mapping = {
    'house': 'Houses',
    'villa': 'Luxury Properties',
    'town-house': 'Houses',
    'bungalow': 'Houses',
    'farmhouse': 'Houses',
    'country-cottage': 'Houses',
    'chalet': 'Luxury Properties',
    'apartment': 'Apartments',
    'apartment-block': 'Specialized Properties',
    'duplex': 'Apartments',
    'penthouse': 'Apartments',
    'ground-floor': 'Apartments',
    'flat-studio': 'Apartments',
    'triplex': 'Apartments',
    'service-flat': 'Apartments',
    'mansion': 'Luxury Properties',
    'exceptional-property': 'Luxury Properties',
    'castle': 'Luxury Properties',
    'manor-house': 'Luxury Properties',
    'mixed-use-building': 'Specialized Properties',
    'loft': 'Specialized Properties',
    'kot': 'Apartments',
    'other-property': 'Specialized Properties'
}

df['Subtype of Property_Grouped'] = df['Subtype of Property'].replace(subtype_group_mapping)

df['Subtype of Property_Grouped'].value_counts()

Subtype of Property_Grouped
Houses                    4622
Apartments                3254
Luxury Properties         2016
Specialized Properties     809
Name: count, dtype: int64

In [39]:
subtype_mapping = {
    'Apartments' : 1,
    'Houses' : 2,
    'Specialized Properties' : 3,
    'Luxury Properties' : 4
}

df['Subtype of Property_Grouped_Numeric'] = df['Subtype of Property_Grouped'].replace(subtype_mapping)

  df['Subtype of Property_Grouped_Numeric'] = df['Subtype of Property_Grouped'].replace(subtype_mapping)


In [40]:
df.shape

(10701, 27)

In [42]:
df.columns

Index(['Locality', 'Zip Code', 'Province', 'Type of Property',
       'Subtype of Property', 'Price', 'Number of Rooms', 'Livable Space (m2)',
       'Fully Equipped Kitchen', 'Furnished', 'Open Fire', 'Terrace',
       'Terrace Area (m2)', 'Garden', 'Garden Area (m2)', 'Swimming Pool',
       'Surface of the Land (m2)', 'Number of Facades', 'PEB',
       'Primary Energy Consumption (kWh/m2)', 'State of the Building',
       'State of the Building_Numeric', 'PEB_Numeric', 'Province_Numeric',
       'Type of Property_Numeric', 'Subtype of Property_Grouped',
       'Subtype of Property_Grouped_Numeric'],
      dtype='object')

In [41]:
df.to_csv('csv-data\\cleaned_dataset_analysis.csv', index=False)