# Workflow

In [157]:
# imports
import os

import numpy as np
import pandas as pd

In [158]:
# Paths to the files
filepath = os.getcwd()

price_app_filepath = filepath + "/datasets/announced-prices-apartments-luxembourg-city.xlsx"
price_house_filepath = filepath + "/datasets/announced-prices-houses-luxembourg-city.xlsx"
rent_ap_filepath = filepath + "/datasets/announced-rent-apartments-luxembourg-city.xlsx"
reg_price_filepath = filepath + "/datasets/registered-prices-apartements-by-commune.xlsx"

We open the dataset and start working on the indexing, organizing the data by Quarter and Year.
Since the 3 datasets are similar we can create a function to avoid repeating code.

In [159]:
def clean_index(dataset):
    # ordering the data by quarter and year, creating a multi-index
    arrays = [[*dataset.Quarter], [*dataset.Year]]

    tuples = list(zip(*arrays))

    index =pd.MultiIndex.from_tuples(tuples, names=['Quarter', 'Year'])

    dataset.set_index(index, inplace=True)

    # we don't need the quarter and year since they are part  of the index
    dataset.drop(columns=['Quarter', 'Year'], inplace=True)

    dataset.sort_index(inplace=True)

    dataset.drop(index='Luxembourg City', inplace=True)
    dataset.drop(index='National Average', inplace=True)

We start checking the datatype and we start handling the NaN.

In [160]:
def check_type_missing(dataset, missing, rent=None):
    col1 = 'Number of offers'
    col2 = 'Average announced price in €'
    col3 = 'Average announced price per squared meter in €'

    if rent:
        col2 = 'Average announced rent in €'
        col3 = 'Average announced rent per squared meter in €'

    # easy handling of missing data, may change for better modeling
    val = missing
    dataset.replace('*', val, inplace=True)
    dataset[col1] = dataset[col1].astype('int64')
    dataset[col2] = dataset[col2].astype('float64').round(2)
    dataset[col3] = dataset[col3].astype('float64').round(2)

    # type check
    print(f"{col1 + ':':<50} \
        {str(dataset[col1].dtype)}")
    print(f"{col2 + ':':<50} \
        {str(dataset[col2].dtype)}")
    print(f"{col3 + ':':<50} \
        {str(dataset[col3].dtype)}")

In [161]:
# acquiring the data
price_ap_data = pd.read_excel(price_app_filepath)

# cleaning the indexing
clean_index(price_ap_data)

price_ap_data


Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced price in €,Average announced price per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,495.0,364878,4222
Beggen,2010,508.0,404496,4542
Beggen,2011,372.0,422256,5019
Beggen,2012,160.0,477997,5141
Beggen,2013,183.0,500915,5537
...,...,...,...,...
Weimerskirch,2017,29.0,*,*
Weimerskirch,2018,33.0,961181.82,8335.125
Weimerskirch,2019,5.0,*,*
Weimerskirch,2020,85.0,1005641.4,15153.99


In [162]:
# checking the types and handling missing values
check_type_missing(price_ap_data, 0)
price_ap_data

Number of offers:                                          int64
Average announced price in €:                              float64
Average announced price per squared meter in €:            float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced price in €,Average announced price per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,495,364878.00,4222.00
Beggen,2010,508,404496.00,4542.00
Beggen,2011,372,422256.00,5019.00
Beggen,2012,160,477997.00,5141.00
Beggen,2013,183,500915.00,5537.00
...,...,...,...,...
Weimerskirch,2017,29,0.00,0.00
Weimerskirch,2018,33,961181.82,8335.12
Weimerskirch,2019,5,0.00,0.00
Weimerskirch,2020,85,1005641.40,15153.99


In [163]:
# acquiring the data
price_hous_data = pd.read_excel(price_house_filepath)

# clening the indexing
clean_index(price_hous_data)

price_hous_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced price in €,Average announced price per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,74.0,588378.0,3515.0
Beggen,2010,96.0,618539.0,3761.0
Beggen,2011,76.0,639310.0,3873.0
Beggen,2012,26.0,688769.2307692308,3896.1231837816417
Beggen,2013,28.0,763142.8571428572,4170.1772762621595


In [164]:
# checking types and handling missing data
check_type_missing(price_hous_data, 0)
price_hous_data

Number of offers:                                          int64
Average announced price in €:                              float64
Average announced price per squared meter in €:            float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced price in €,Average announced price per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,74,588378.00,3515.00
Beggen,2010,96,618539.00,3761.00
Beggen,2011,76,639310.00,3873.00
Beggen,2012,26,688769.23,3896.12
Beggen,2013,28,763142.86,4170.18
...,...,...,...,...
Weimerskirch,2017,35,1290428.57,7023.61
Weimerskirch,2018,55,1283943.30,7432.01
Weimerskirch,2019,26,0.00,0.00
Weimerskirch,2020,24,0.00,0.00


In [165]:
# acquiring the data
rent_ap_data = pd.read_excel(rent_ap_filepath)

# cleaning the indexing
clean_index(rent_ap_data)

rent_ap_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced rent in €,Average announced rent per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,231.0,1202,15.19
Beggen,2010,324.0,1093,15.84
Beggen,2011,196.0,1284,16.42
Beggen,2012,107.0,1261,17.02
Beggen,2013,65.0,1186,19.24


In [166]:
check_type_missing(rent_ap_data, 0, rent=True)
rent_ap_data

Number of offers:                                          int64
Average announced rent in €:                               float64
Average announced rent per squared meter in €:             float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced rent in €,Average announced rent per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,231,1202.00,15.19
Beggen,2010,324,1093.00,15.84
Beggen,2011,196,1284.00,16.42
Beggen,2012,107,1261.00,17.02
Beggen,2013,65,1186.00,19.24
...,...,...,...,...
Weimerskirch,2017,26,0.00,0.00
Weimerskirch,2018,54,1500.19,26.76
Weimerskirch,2019,87,1415.40,40.99
Weimerskirch,2020,96,1429.38,39.01


In [167]:
reg_price_data = pd.read_excel(reg_price_filepath)
reg_price_data

Unnamed: 0.1,Unnamed: 0,Sales of already constructed apartments,Unnamed: 2,Unnamed: 3,Sales of apartments still under construction (Ventes en Etat Futur D'Achevement [VEFA]),Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Commune,Number of sales,Average registered price per squared meter in €,Price range for price per squared meter,Number of sales,Average registered price per squared meter in €,Price range for price per squared meter,Year
1,Beaufort,3,*,*,0,*,*,2009
2,Bech,0,*,*,0,*,*,2009
3,Beckerich,3,*,*,0,*,*,2009
4,Berdorf,3,*,*,1,*,*,2009
...,...,...,...,...,...,...,...,...
1368,Wiltz,26,4318.5469999999996,2904 € - 6132 €,39,4745.8869999999997,3753 € - 5622 €,2021
1369,Wincrange,5,*,*,4,*,*,2021
1370,Winseler,6,*,*,6,*,*,2021
1371,Wormeldange,18,5798.1880000000001,4003 € - 7510 €,8,*,*,2021


In [168]:
tuples1 = []
for el in reg_price_data.iloc[0][1:4]:
    tuples1.append(("Constructed", el))
for el in reg_price_data.iloc[0][1:4]:
    tuples1.append(("VEFA", el))

print(tuples1)

new_header = reg_price_data.iloc[0]
reg_price_data.columns = new_header
reg_price_data = reg_price_data[1:]

reg_price_data

[('Constructed', 'Number of sales'), ('Constructed', 'Average registered price per squared meter in €'), ('Constructed', 'Price range for price per squared meter'), ('VEFA', 'Number of sales'), ('VEFA', 'Average registered price per squared meter in €'), ('VEFA', 'Price range for price per squared meter')]


Unnamed: 0,Commune,Number of sales,Average registered price per squared meter in €,Price range for price per squared meter,Number of sales.1,Average registered price per squared meter in €.1,Price range for price per squared meter.1,Year
1,Beaufort,3,*,*,0,*,*,2009
2,Bech,0,*,*,0,*,*,2009
3,Beckerich,3,*,*,0,*,*,2009
4,Berdorf,3,*,*,1,*,*,2009
5,Bertrange,27,4050.6390000000001,2478 € - 5653 €,23,5224.1819999999998,4027 € - 5745 €,2009
...,...,...,...,...,...,...,...,...
1368,Wiltz,26,4318.5469999999996,2904 € - 6132 €,39,4745.8869999999997,3753 € - 5622 €,2021
1369,Wincrange,5,*,*,4,*,*,2021
1370,Winseler,6,*,*,6,*,*,2021
1371,Wormeldange,18,5798.1880000000001,4003 € - 7510 €,8,*,*,2021


In [169]:
# ordering the data by quarter and year, creating a multi-index
arrays = [[*reg_price_data['Commune']], [*reg_price_data.Year]]
tuples = list(zip(*arrays))
index =pd.MultiIndex.from_tuples(tuples, names=['Commune', 'Year'])
reg_price_data.set_index(index, inplace=True)

# we don't need the Commune and year since they are part  of the index
reg_price_data.drop(columns=['Commune', 'Year'], inplace=True)

reg_price_data.sort_index(inplace=True)
reg_price_data.drop(index='National Average', inplace=True)

index = pd.MultiIndex.from_tuples(tuples1, names=["Construction State", "Detail"])
reg_price_data.columns = index

reg_price_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().sort_index(


Unnamed: 0_level_0,Construction State,Constructed,Constructed,Constructed,VEFA,VEFA,VEFA
Unnamed: 0_level_1,Detail,Number of sales,Average registered price per squared meter in €,Price range for price per squared meter,Number of sales,Average registered price per squared meter in €,Price range for price per squared meter
Commune,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Beaufort,2009,3,*,*,0,*,*
Beaufort,2010,2,*,*,0,*,*
Beaufort,2011,4,*,*,0,*,*
Beaufort,2012,8,*,*,0,*,*
Beaufort,2013,1,*,*,0,*,*
...,...,...,...,...,...,...,...
Wormeldange,2017,9,*,*,0,*,*
Wormeldange,2018,11,4120.3959999999997,3136 € - 4943 €,0,*,*
Wormeldange,2019,13,5201.6409999999996,3246 € - 8498 €,13,5693.9480000000003,5014 € - 6660 €
Wormeldange,2020,14,5848.607,4003 € - 7510 €,13,7283.857,5934 € - 8671 €


In [173]:
reg_price_data.loc[:]["Constructed", "Price range for price per squared meter"]
# TODO parse the range constructed
# TODO parse the range VEFA

Commune      Year
Beaufort     2009                  *
             2010                  *
             2011                  *
             2012                  *
             2013                  *
                          ...       
Wormeldange  2017                  *
             2018    3136 € - 4943 €
             2019    3246 € - 8498 €
             2020    4003 € - 7510 €
             2021    4003 € - 7510 €
Name: (Constructed, Price range for price per squared meter), Length: 1359, dtype: object