# Workflow

In [90]:
# imports
import os

import numpy as np
import pandas as pd

In [91]:
# Paths to the files
filepath = os.getcwd()

price_app_filepath = filepath + "/datasets/announced-prices-apartments-luxembourg-city.xlsx"
price_house_filepath = filepath + "/datasets/announced-prices-houses-luxembourg-city.xlsx"
rent_ap_filepath = filepath + "/datasets/announced-rent-apartments-luxembourg-city.xlsx"
reg_price_filepath = filepath + "/datasets/registered-prices-apartements-by-commune.xlsx"

We open the dataset and start working on the indexing, organizing the data by Quarter and Year.
Since the first 3 datasets are similar we can create a function to avoid repeating code.

In [92]:
def clean_index(dataset):
    # ordering the data by quarter and year, creating a multi-index
    arrays = [[*dataset.Quarter], [*dataset.Year]]

    tuples = list(zip(*arrays))

    index =pd.MultiIndex.from_tuples(tuples, names=['Quarter', 'Year'])

    dataset.set_index(index, inplace=True)

    # we don't need the quarter and year since they are part  of the index
    dataset.drop(columns=['Quarter', 'Year'], inplace=True)

    dataset.sort_index(inplace=True)

    # we don't want to waste any data so we will return these info
    ret = [dataset.loc['Luxembourg City'], dataset.loc['National Average']]

    dataset.drop(index='Luxembourg City', inplace=True)
    dataset.drop(index='National Average', inplace=True)

    # returning this data will be usefull to handle the missing data
    return ret

At this point we make a function to handle the missing data. I will fill the missing data with the average on the city of Luxembourg for the given year.

It is also possible to use the national average using instead of `missing[0]` the other index `missing[1]`.

In [93]:
def check_type_missing(dataset, missing, rent=None):
    col1 = 'Number of offers'
    col2 = 'Average announced price in €'
    col3 = 'Average announced price per squared meter in €'

    if rent:
        col2 = 'Average announced rent in €'
        col3 = 'Average announced rent per squared meter in €'

    # We handle the missing data replacing it with the average in the city of luxemburg
    # in the specific year
    for (commune, year) in dataset.index:
        if dataset.loc[(commune, year), col2] == '*':
            dataset.loc[(commune, year), col2] = missing[0].loc[year, col2]
        if dataset.loc[(commune, year), col3] == '*':
            dataset.loc[(commune, year), col3] = missing[0].loc[year, col3]


    # casting the data type properly
    dataset.loc[:,col1] = dataset.loc[:,col1].astype('int64')
    dataset.loc[:,col2] = dataset.loc[:,col2].astype('float64').round(2)
    dataset.loc[:,col3] = dataset.loc[:,col3].astype('float64').round(2)

    # printing the new types
    print(f"{col1 + ':':<50} \
        {str(dataset.loc[:,col1].dtype)}")
    print(f"{col2 + ':':<50} \
        {str(dataset.loc[:,col2].dtype)}")
    print(f"{col3 + ':':<50} \
        {str(dataset.loc[:,col3].dtype)}")

We apply the functions we've defined on the first 3 datasets.

In [94]:
# acquiring the data
price_ap_data = pd.read_excel(price_app_filepath)

price_ap_data

Unnamed: 0,Quarter,Number of offers,Average announced price in €,Average announced price per squared meter in €,Year
0,Beggen,495.0,364878,4222,2009
1,Belair,711.0,519909,5675,2009
2,Bonnevoie,804.0,323130,4124,2009
3,Cents,141.0,487993,5110,2009
4,Cessange,425.0,430093,4575,2009
...,...,...,...,...,...
320,Rollingergrund,152.0,960414.79,11234.22,2021
321,Ville-Haute,207.0,1000227.6,11740.79,2021
322,Weimerskirch,164.0,956075.03,16124.5,2021
323,Luxembourg City,,1003203.5,12576.54,2021


In [95]:
# cleaning the indexing
temp =  clean_index(price_ap_data)

In [96]:
# checking the types and handling missing values
check_type_missing(price_ap_data, temp)
price_ap_data

Number of offers:                                          int64
Average announced price in €:                              float64
Average announced price per squared meter in €:            float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced price in €,Average announced price per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,495,364878.00,4222.00
Beggen,2010,508,404496.00,4542.00
Beggen,2011,372,422256.00,5019.00
Beggen,2012,160,477997.00,5141.00
Beggen,2013,183,500915.00,5537.00
...,...,...,...,...
Weimerskirch,2017,29,677440.51,8236.53
Weimerskirch,2018,33,961181.82,8335.12
Weimerskirch,2019,5,839313.40,11257.35
Weimerskirch,2020,85,1005641.40,15153.99


In [97]:
# acquiring the data
price_hous_data = pd.read_excel(price_house_filepath)

# cleaning the indexing
temp = clean_index(price_hous_data)

price_hous_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced price in €,Average announced price per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,74.0,588378,3515
Beggen,2010,96.0,618539,3761
Beggen,2011,76.0,639310,3873
Beggen,2012,26.0,688769.23076923075,3896.1231837816422
Beggen,2013,28.0,763142.85714285716,4170.1772762621586
...,...,...,...,...
Weimerskirch,2017,35.0,1290428.571,7023.6099539999996
Weimerskirch,2018,55.0,1283943.3,7432.0079999999998
Weimerskirch,2019,26.0,*,*
Weimerskirch,2020,24.0,*,*


In [98]:
# checking types and handling missing data
check_type_missing(price_hous_data, temp)
price_hous_data

Number of offers:                                          int64
Average announced price in €:                              float64
Average announced price per squared meter in €:            float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced price in €,Average announced price per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,74,588378.00,3515.00
Beggen,2010,96,618539.00,3761.00
Beggen,2011,76,639310.00,3873.00
Beggen,2012,26,688769.23,3896.12
Beggen,2013,28,763142.86,4170.18
...,...,...,...,...
Weimerskirch,2017,35,1290428.57,7023.61
Weimerskirch,2018,55,1283943.30,7432.01
Weimerskirch,2019,26,1576213.60,8078.17
Weimerskirch,2020,24,1606490.20,9527.81


In [99]:
# acquiring the data
rent_ap_data = pd.read_excel(rent_ap_filepath)

# cleaning the indexing
temp = clean_index(rent_ap_data)

rent_ap_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced rent in €,Average announced rent per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,231.0,1202,15.19
Beggen,2010,324.0,1093,15.84
Beggen,2011,196.0,1284,16.420000000000002
Beggen,2012,107.0,1261,17.02
Beggen,2013,65.0,1186,19.239999999999998
...,...,...,...,...
Weimerskirch,2017,26.0,*,*
Weimerskirch,2018,54.0,1500.1851999999999,26.764150000000001
Weimerskirch,2019,87.0,1415.4023,40.986260000000001
Weimerskirch,2020,96.0,1429.375,39.013120000000001


In [100]:
check_type_missing(rent_ap_data, temp, rent=True)
rent_ap_data

Number of offers:                                          int64
Average announced rent in €:                               float64
Average announced rent per squared meter in €:             float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Number of offers,Average announced rent in €,Average announced rent per squared meter in €
Quarter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Beggen,2009,231,1202.00,15.19
Beggen,2010,324,1093.00,15.84
Beggen,2011,196,1284.00,16.42
Beggen,2012,107,1261.00,17.02
Beggen,2013,65,1186.00,19.24
...,...,...,...,...
Weimerskirch,2017,26,1705.54,24.72
Weimerskirch,2018,54,1500.19,26.76
Weimerskirch,2019,87,1415.40,40.99
Weimerskirch,2020,96,1429.38,39.01


The last dataset is a bit different, so we can't handle it we the functions we used before.
We chose also a different organizations for the columns applying somethins similar to what we did before with the index.

In [101]:
reg_price_data = pd.read_excel(reg_price_filepath)
reg_price_data

Unnamed: 0.1,Unnamed: 0,Sales of already constructed apartments,Unnamed: 2,Unnamed: 3,Sales of apartments still under construction (Ventes en Etat Futur D'Achevement [VEFA]),Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Commune,Number of sales,Average registered price per squared meter in €,Price range for price per squared meter,Number of sales,Average registered price per squared meter in €,Price range for price per squared meter,Year
1,Beaufort,3,*,*,0,*,*,2009
2,Bech,0,*,*,0,*,*,2009
3,Beckerich,3,*,*,0,*,*,2009
4,Berdorf,3,*,*,1,*,*,2009
...,...,...,...,...,...,...,...,...
1368,Wiltz,26,4318.5469999999996,2904 € - 6132 €,39,4745.8869999999997,3753 € - 5622 €,2021
1369,Wincrange,5,*,*,4,*,*,2021
1370,Winseler,6,*,*,6,*,*,2021
1371,Wormeldange,18,5798.1880000000001,4003 € - 7510 €,8,*,*,2021


In [102]:
tuples1 = []
# Sales of already constructed apartments
constructed = "Constructed"
# Sales of apartments still under construction (Ventes en Etat Futur D'Achevement [VEFA])
to_be_done = "VEFA"
# Price range for price per squared meter
r_min = "min range"
r_max = "max range"

for el in reg_price_data.iloc[0, 1:3]:
    tuples1.append((constructed, el))
tuples1.append((constructed, r_min))
#tuples1.append((constructed, r_max))

for el in reg_price_data.iloc[0, 1:3]:
    tuples1.append((to_be_done, el))
tuples1.append((to_be_done, r_min))
#tuples1.append((to_be_done, r_max))

print(tuples1)

new_header = reg_price_data.iloc[0]
reg_price_data.columns = new_header
reg_price_data = reg_price_data.iloc[1:]

[('Constructed', 'Number of sales'), ('Constructed', 'Average registered price per squared meter in €'), ('Constructed', 'min range'), ('VEFA', 'Number of sales'), ('VEFA', 'Average registered price per squared meter in €'), ('VEFA', 'min range')]


In [103]:
# ordering the data by Commune and year, creating a multi-index
arrays = [[*reg_price_data.loc[:,'Commune']], [*reg_price_data.loc[:,'Year']]]
tuples = list(zip(*arrays))

index =pd.MultiIndex.from_tuples(tuples, names=['Commune', 'Year'])
reg_price_data.set_index(index, inplace=True)

# we don't need the Commune and year since they are part  of the index
reg_price_data.drop(columns=['Commune', 'Year'], inplace=True)

reg_price_data.sort_index(inplace=True)

new_columns = pd.MultiIndex.from_tuples(tuples1, names=["Construction State", "Detail"])
reg_price_data.columns = new_columns

reg_price_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().sort_index(


Unnamed: 0_level_0,Construction State,Constructed,Constructed,Constructed,VEFA,VEFA,VEFA
Unnamed: 0_level_1,Detail,Number of sales,Average registered price per squared meter in €,min range,Number of sales,Average registered price per squared meter in €,min range
Commune,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Beaufort,2009,3,*,*,0,*,*
Beaufort,2010,2,*,*,0,*,*
Beaufort,2011,4,*,*,0,*,*
Beaufort,2012,8,*,*,0,*,*
Beaufort,2013,1,*,*,0,*,*
...,...,...,...,...,...,...,...
Wormeldange,2017,9,*,*,0,*,*
Wormeldange,2018,11,4120.3959999999997,3136 € - 4943 €,0,*,*
Wormeldange,2019,13,5201.6409999999996,3246 € - 8498 €,13,5693.9480000000003,5014 € - 6660 €
Wormeldange,2020,14,5848.607,4003 € - 7510 €,13,7283.857,5934 € - 8671 €


In [104]:
# function to clean separete correctly the min and max range in Price range for price per squared meter
def organize_range(ind1, ind2, to_insert, col_loc):
    aux = reg_price_data.loc[:,(ind1, ind2)].str.split(" - ", expand=True)
    col1_val = aux.iloc[:,0].map(lambda s: s.replace(" €", "") if s != None else "*")
    col2_val = aux.iloc[:,1].map(lambda s: s.replace(" €", "") if s != None else "*")

    reg_price_data.loc[:,(ind1, ind2)] = col1_val
    reg_price_data.insert(col_loc, (ind1, to_insert), col2_val)

In [105]:
# splitting the constructed range
organize_range(constructed, r_min, r_max, 3)

# splitting the VEFA range
organize_range(to_be_done, r_min, r_max, 7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [106]:
# TODO #6 change type

#reg_price_data.replace("*", "", inplace=True)

#reg_price_data.replace("*", None, inplace=True)
col1 = (constructed, "Number of sales")
col2 = (constructed, "Average registered price per squared meter in €")
col3 = (constructed, r_min)

print(f"{str(col1) + ':':<50} \
    {str(reg_price_data.loc[:,col1].dtype)}")
print(f"{str(col2) + ':':<50} \
    {str(reg_price_data.loc[:,col2].dtype)}")
print(f"{str(col3) + ':':<50} \
    {str(reg_price_data.loc[:,col3].dtype)}")

('Constructed', 'Number of sales'):                    object
('Constructed', 'Average registered price per squared meter in €'):     object
('Constructed', 'min range'):                          object


In [107]:
nat_avg = reg_price_data.loc['National Average']

reg_price_data.drop(index='National Average', inplace=True)

nat_avg

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Construction State,Constructed,Constructed,Constructed,Constructed,VEFA,VEFA,VEFA,VEFA
Detail,Number of sales,Average registered price per squared meter in €,min range,max range,Number of sales,Average registered price per squared meter in €,min range,max range
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2009,,3488.124,2301,4852,,4264.941,2894,6203
2010,,3663.585,2438,5142,,4480.081,3155,6374
2011,,3858.319,2533,5487,,4633.802,3232,6448
2012,,3998.736,2543,5933,,4845.979,3221,6853
2013,,4160.0,2 624,6 176,,5191.0,3 382,7 502
2014,,4323.188,2754,6487,,5283.35,3444,7897
2015,,4470.578,2782,6803,,5589.387,3683,7772
2016,,4795.104,2952,7641,,6050.853,3852,8995
2017,,5102.931,3032,8096,,6351.156,3921,9691
2018,,5576.048,3203,9210,,6331.764,3933,9894


In [109]:
reg_price_data.to_csv(filepath + "/reg.csv")