In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the validation features and the validation target from the split_data folder.
X_val = pd.read_csv("split_data/val_features.csv")
y_val = pd.read_csv("split_data/val_target.csv")

In [None]:
# Put features and target side by side in one frame (column-wise concatenation).
df_val = pd.concat((X_val, y_val), axis='columns')

In [None]:
# Dimensions (rows, columns) of the combined validation frame.
df_val.shape

In [None]:
# df_val['row_prop_missing'] = df_val.isna().mean(axis=1) # Not ok to already calculate here - some are missing by design (e.g. sticker)

In [None]:
# Preview the first rows of the combined validation frame.
df_val.head()

In [None]:
# Build a ydata-profiling report on the raw validation frame.
profile = ProfileReport(df_val)

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()

## Rows with many NAs

In [None]:
# display(df_val[df_val['row_prop_missing'] > .10])

## Duplicate rows

## Subtype

Observations / remarks:
- Missing values are partly houses and partly apartments -> can be assigned accordingly
- There are synonyms -> can be grouped together
- 'Andere' is never an apartment
- Group infrequent levels together? Reduces dimensionality (but: not really necessary for a decision-tree-based approach?)
- Perform clustering (among others on price) to reduce the number of levels?

In [None]:
# Cross-tabulate subtype against the is_appartment flag; dropna=False is passed so
# missing values are not silently left out of the table.
pd.crosstab(df_val['subtype'], df_val['is_appartment'], dropna=False)

In [None]:
# Load the collection of infrequent subtypes from the intermediate_data folder.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted artefacts.
with open('intermediate_data/map_infrequent_subtypes.pkl', 'rb') as file:
    map_infrequent_subtypes = pickle.load(file)


def _regroup_infrequent(subtype):
    """Return 'Andere' when the subtype is listed as infrequent, else the subtype unchanged."""
    if subtype in map_infrequent_subtypes:
        return 'Andere'
    return subtype


df_val['subtype_regrouped'] = df_val['subtype'].apply(_regroup_infrequent)  # remove this step?

In [None]:
# Synonym / regrouping table: raw subtype -> regrouped subtype, organised by target level.
# Previously considered alternatives that are NOT active:
#   'Herenwoning': 'Herenhuis', 'Dakappartement': 'Penthouse', 'Rijwoning': 'Woning'
map_synonyms = {
    # -> Serviceflat
    'Assistentie-appartement': 'Serviceflat',
    # -> Studio
    'Studio met slaaphoek': 'Studio',
    # -> Villa
    'Villa-landhuis': 'Villa',
    'Moderne villa': 'Villa',
    'Uitzonderlijke woning': 'Villa',
    'Herenwoning': 'Villa',
    'Herenhuis': 'Villa',
    'Hoeve': 'Villa',
    # -> Woning
    'Eengezinswoning': 'Woning',
    'Burgerswoning': 'Woning',
    'Koppelwoning': 'Woning',
    'Bungalow': 'Woning',
    'Fermette': 'Woning',
    'Bel-étage': 'Woning',
    'Hoekwoning': 'Woning',
    'Pastorijwoning': 'Woning',
    'Arbeiderswoning': 'Woning',
    # -> Appartement
    'Gelijkvloers app.': 'Appartement',
    'Duplex': 'Appartement',
    'Triplex': 'Appartement',
    'Dakappartement': 'Appartement',
    # -> Loft Penthouse
    'Loft': 'Loft Penthouse',
    'Penthouse': 'Loft Penthouse',
    # -> Andere
    # To do: implement solution for new building types not seen in training data -> should become Other
    'Chalet': 'Andere',
    'Cottage': 'Andere',
    'Vakantiewoning': 'Andere',
    'Gemengd gebruik': 'Andere',
    'Woonboot': 'Andere',
}

# Apply the synonym table, then send still-missing subtypes to 'Andere'.
df_val['subtype_regrouped'] = (
    df_val['subtype_regrouped']
    .replace(map_synonyms)
    .fillna('Andere')
)

df_val['subtype_regrouped'].value_counts()

In [None]:
# Load the subtype -> median price lookup from the intermediate_data folder.
# NOTE(review): presumably produced on the training data - confirm.
with open('intermediate_data/subtype_median_price.pkl', 'rb') as file:
    subtype_median_price = pickle.load(file)

In [None]:
# A regrouped subtype that has no entry in the lookup table would get no match in the
# left join below and leave subtype_median_price empty for that row. Fold such unseen
# levels into 'Andere' first.
# reset_index() makes the key reachable whether it is stored as a column or as the index.
# NOTE(review): assumes 'Andere' itself is present in the lookup table - confirm.
known_subtypes = set(subtype_median_price.reset_index()['subtype_regrouped'])
df_val['subtype_regrouped'] = df_val['subtype_regrouped'].where(
    df_val['subtype_regrouped'].isin(known_subtypes), 'Andere'
)

# Attach the median price per regrouped subtype (left join keeps every validation row).
df_val = pd.merge(df_val, subtype_median_price, how = 'left', on = 'subtype_regrouped')
display(df_val)

## Area

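Impute area with the median value per subtype (regrouped) and number of bedrooms; fall back to the median per subtype (regrouped) when that combination is not available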

In [None]:
# 0/1 indicator: 1 where the original area is missing.
area_is_missing = df_val['area'].isna()
df_val['area_missing'] = area_is_missing.astype(int)

In [None]:
# Lookup used below with the regrouped subtype as key.
with open('intermediate_data/median_area.pkl', 'rb') as file:
    median_area = pickle.load(file)

# Lookup used below with (regrouped subtype, bedrooms) as key.
with open('intermediate_data/median_area_bedrooms.pkl', 'rb') as file:
    median_area_bedrooms = pickle.load(file)

In [None]:
# First choice: look up the area by (regrouped subtype, bedrooms).
# Rows with a known area keep it; a failed lookup leaves the value missing.
df_val['area_imputed1'] = df_val.apply(
    lambda row: median_area_bedrooms.get((row['subtype_regrouped'], row['bedrooms']), row['area']) if pd.isna(row['area']) else row['area'],
    axis=1
)
# Fallback: look up the area by regrouped subtype only.
df_val['area_imputed2'] = df_val.apply(
    lambda row: median_area.get((row['subtype_regrouped']), row['area']) if pd.isna(row['area']) else row['area'],
    axis=1
)

# Prefer the finer-grained imputation, fall back to the subtype-only one.
df_val['area_imputed'] = df_val['area_imputed1'].combine_first(df_val['area_imputed2'])

# drop() returns a new frame; assign it back so the helper columns are actually removed.
df_val = df_val.drop(['area_imputed1', 'area_imputed2'], axis = 1)

In [None]:
# Imputed area divided by (bedrooms + 1); the +1 avoids dividing by zero bedrooms.
rooms_plus_one = df_val['bedrooms'].add(1)
df_val['area_rel_to_bedrooms'] = df_val['area_imputed'].div(rooms_plus_one)

## Energy value

In [None]:
# Keep only first letter of energy label, except for 'a+'
# NOTE(review): the earlier one-liner tested `x != 'a+'` twice; possibly a second
# variant of the label was meant to be excluded as well - confirm.


def _first_letter_of_label(label):
    """Collapse an energy label to its first character; 'a+' and non-strings pass through."""
    if not isinstance(label, str) or label == 'a+':
        return label
    return label[0]


df_val['energy_label_regrouped'] = df_val['energy_label'].apply(_first_letter_of_label)

In [None]:
# Cross-tabulate the regrouped energy label against the new_building flag.
pd.crosstab(df_val['energy_label_regrouped'], df_val['new_building'], dropna=False)

For f and g not ok: normally label A corresponds to values 0-100, B to 101-200, etc.

In [None]:
# 0/1 indicator: 1 where the original energy value is missing.
energy_is_missing = df_val['energy_value'].isna()
df_val['energy_value_missing'] = energy_is_missing.astype(int)

In [None]:
# Lookup keyed by (regrouped energy label, new_building, regrouped subtype).
with open('intermediate_data/median_energy.pkl', 'rb') as file:
    median_energy = pickle.load(file)


def _impute_energy_with_label(row):
    """Return the observed energy value, or the lookup value for the row's key when it is missing.

    A key that is absent from the lookup leaves the value missing.
    """
    observed = row['energy_value']
    if not pd.isna(observed):
        return observed
    lookup_key = (row['energy_label_regrouped'], row['new_building'], row['subtype_regrouped'])
    return median_energy.get(lookup_key, observed)


df_val['energy_value_imputed'] = df_val.apply(_impute_energy_with_label, axis=1)

In [None]:
# Now all rows except for the ones where the energy label is also missing have an energy value
# (dropna = False keeps the NaN count visible in the output)

df_val['energy_value_imputed'].value_counts(dropna = False)

In [None]:
# Exploratory only: median observed energy value per (regrouped subtype, new_building)
# computed on the validation data itself; the result is not stored.
df_val.groupby(['subtype_regrouped', 'new_building'])['energy_value'].median()

In [None]:
# Lookup keyed by (new_building, regrouped subtype), used when the label-based lookup gave nothing.
with open('intermediate_data/median_energy_wo_label.pkl', 'rb') as file:
    median_energy_wo_label = pickle.load(file)


def _impute_energy_without_label(row):
    """Keep an already imputed energy value; otherwise fall back to the label-free lookup."""
    already_imputed = row['energy_value_imputed']
    if not pd.isna(already_imputed):
        return already_imputed
    lookup_key = (row['new_building'], row['subtype_regrouped'])
    return median_energy_wo_label.get(lookup_key, row['energy_value'])


df_val['energy_value_imputed'] = df_val.apply(_impute_energy_without_label, axis=1)

In [None]:
# All NAs imputed
# (check: no NaN entry should appear below; a row whose key is absent from the
# lookup would still be NaN, because the lookup default is the original value)

df_val['energy_value_imputed'].value_counts(dropna = False)

## Advertiser

High number of categories - apply something similar to Weights Of Evidence (= for classification problems)

Note: some advertisers occur only once and seem to have a person's name (not a real estate agency) -> also informative

Make bins of advertisers based on how often they occur (only once (category 5: person) vs. more (categories 1-4: agency)) and their median price.
Category 5: only occurs once, so no relevant information on 'typical' pricing; this advertiser will (normally) not occur in the test set either
Categories 1-4: occurs more than once, categorize based on median price

In [None]:
# The 50 most frequent advertisers in the validation data (missing values counted too).
df_val['advertiser'].value_counts(dropna = False).head(50)

In [None]:
# Per-advertiser lookup table; it is joined on 'advertiser' in the next cell.
with open('intermediate_data/median_price_advertiser.pkl', 'rb') as file:
    median_price_advertiser = pickle.load(file)

In [None]:
# Left join keeps every validation row; advertisers absent from the lookup get NaN in the joined columns.
df_val = df_val.merge(median_price_advertiser, how='left', on='advertiser')
display(df_val)

In [None]:
# Advertisers without a match in the lookup have no count after the left join; set it to 1.
df_val['advertiser_count'] = df_val['advertiser_count'].fillna(1)

In [None]:
# Distribution of advertiser bins before filling the unmatched rows (NaN included).
df_val['advertiser_bin'].value_counts(dropna = False)

In [None]:
# NOTE(review): identical to the previous cell - looks like a leftover duplicate.
df_val['advertiser_bin'].value_counts(dropna = False)

In [None]:
# Rows without an advertiser bin (no match in the lookup) are put in bin 5.
bin_is_missing = df_val['advertiser_bin'].isna()
df_val['advertiser_bin'] = np.where(bin_is_missing, 5, df_val['advertiser_bin'])

In [None]:
# Distribution of advertiser bins after assigning bin 5 to the unmatched rows.
df_val['advertiser_bin'].value_counts(dropna = False)

In [None]:
# Per-bin lookup table; it is joined on 'advertiser_bin' in the next cell.
with open('intermediate_data/median_price_per_advertiser_bin.pkl', 'rb') as file:
    median_price_per_advertiser_bin = pickle.load(file)

In [None]:
# Attach the per-bin information to every row via a left join on the advertiser bin.
df_val = df_val.merge(median_price_per_advertiser_bin, how='left', on='advertiser_bin')
df_val.head()

## Regional prices - Statbel

In [None]:
# Mapping table joined on zip_code in the next cell; it supplies the nis_code used further down.
with open('intermediate_data/map_nis_deduplicated.pkl', 'rb') as file:
    df_map_nis_deduplicated = pickle.load(file)

In [None]:
# Join the NIS mapping onto the listings: postcode on the left, zip_code on the right.
df_val = df_val.merge(
    df_map_nis_deduplicated,
    how='left',
    left_on='postcode',
    right_on='zip_code',
)
display(df_val)

In [None]:
# Statbel price table; joined below on (CD_REFNIS, F_APPARTMENT).
with open('intermediate_data/statbel_prices_selection.pkl', 'rb') as file:
    df_statbel_prices_selection = pickle.load(file)

In [None]:
# Join the Statbel prices by NIS code and apartment flag (left join keeps all listings).
df_val = df_val.merge(
    df_statbel_prices_selection,
    how='left',
    left_on=['nis_code', 'is_appartment'],
    right_on=['CD_REFNIS', 'F_APPARTMENT'],
)

In [None]:
# Inspect the frame after the Statbel join.
display(df_val)

In [None]:
# Still quite some missings, whereas profiling below suggests it's a highly relevant feature
# (number of rows without a Statbel median price after the join)

df_val['MS_P_50_median'].isna().sum()

In [None]:
# Fallback price lookup keyed by (first two postcode digits, is_appartment), as used below.
with open('intermediate_data/price_lookup_zipcode.pkl', 'rb') as file:
    price_lookup_zipcode = pickle.load(file)

In [None]:
# First two characters of the postcode, used as the regional key.
df_val["zip_code_first2"] = df_val["postcode"].astype(str).str[:2]


def _impute_price_by_zip2(row):
    """Keep the Statbel median price; when missing, look it up by (2-digit zip prefix, apartment flag)."""
    statbel_median = row["MS_P_50_median"]
    if pd.isna(statbel_median):
        lookup_key = (row["zip_code_first2"], row["is_appartment"])
        return price_lookup_zipcode.get(lookup_key, statbel_median)
    return statbel_median


df_val["MS_P_50_median_imputed"] = df_val.apply(_impute_price_by_zip2, axis=1)

In [None]:
# Coarser fallback price lookup keyed by (first postcode digit, is_appartment), as used below.
with open('intermediate_data/price_lookup_province.pkl', 'rb') as file:
    price_lookup_province = pickle.load(file)

In [None]:
# First character of the postcode, used as the coarsest regional key.
df_val["zip_code_first1"] = df_val["postcode"].astype(str).str[:1]


def _impute_price_by_zip1(row):
    """Keep the value imputed so far; when still missing, look it up by (1-digit zip prefix, apartment flag)."""
    imputed_so_far = row["MS_P_50_median_imputed"]
    if pd.isna(imputed_so_far):
        lookup_key = (row["zip_code_first1"], row["is_appartment"])
        return price_lookup_province.get(lookup_key, row["MS_P_50_median"])
    return imputed_so_far


df_val["MS_P_50_median_imputed"] = df_val.apply(_impute_price_by_zip1, axis=1)

In [None]:
# Rows that still have no median price after both fallbacks.
df_val[df_val['MS_P_50_median_imputed'].isna()]

## Regional prices - Price per area 

In [None]:
# Price-per-area table joined below on zip_code_first2.
with open('intermediate_data/price_per_area_per_region.pkl', 'rb') as file:
    price_per_area_per_region = pickle.load(file)

In [None]:
# Price-per-area table joined below on zip_code_first3.
with open('intermediate_data/price_per_area_per_region_2.pkl', 'rb') as file:
    price_per_area_per_region_2 = pickle.load(file)

In [None]:
# Regional price per area at the 2-digit postcode level.
df_val = df_val.merge(price_per_area_per_region, how='left', on='zip_code_first2')

# Same at the 3-digit postcode level; the join key is derived first.
df_val["zip_code_first3"] = df_val["postcode"].astype(str).str[:3]
df_val = df_val.merge(price_per_area_per_region_2, how='left', on='zip_code_first3')

## Regional prices - replace Statbel

In [None]:
# Four price-per-area tables at decreasing postcode granularity (full postcode,
# first 3 digits, first 2 digits, first digit); each is joined together with is_appartment below.
with open('intermediate_data/price_per_area_type_zipcode.pkl', 'rb') as file:
    price_per_area_type_zipcode = pickle.load(file)

with open('intermediate_data/price_per_area_type_zipcodefirst3.pkl', 'rb') as file:
    price_per_area_type_zipcodefirst3 = pickle.load(file)

with open('intermediate_data/price_per_area_type_zipcodefirst2.pkl', 'rb') as file:
    price_per_area_type_zipcodefirst2 = pickle.load(file)

with open('intermediate_data/price_per_area_type_zipcodefirst1.pkl', 'rb') as file:
    price_per_area_type_zipcodefirst1 = pickle.load(file)

In [None]:
# Postcode prefixes used as join keys.
postcode_text = df_val["postcode"].astype(str)
df_val["postcode_first3"] = postcode_text.str[:3]
df_val["postcode_first2"] = postcode_text.str[:2]
df_val["postcode_first1"] = postcode_text.str[:1]

# Join the four tables from finest to coarsest granularity, always together with the apartment flag.
for lookup_table, join_keys in (
    (price_per_area_type_zipcode, ['postcode', 'is_appartment']),
    (price_per_area_type_zipcodefirst3, ['postcode_first3', 'is_appartment']),
    (price_per_area_type_zipcodefirst2, ['postcode_first2', 'is_appartment']),
    (price_per_area_type_zipcodefirst1, ['postcode_first1', 'is_appartment']),
):
    df_val = df_val.merge(lookup_table, how='left', on=join_keys)

# Take the finest-grained value that is available for each row.
df_val['price_per_area_type_region'] = (
    df_val['price_per_area_type_zipcode']
    .combine_first(df_val['price_per_area_type_zipcodefirst3'])
    .combine_first(df_val['price_per_area_type_zipcodefirst2'])
    .combine_first(df_val['price_per_area_type_zipcodefirst1'])
)

# Rows without a value at full-postcode level (these relied on a coarser level).
display(df_val[df_val['price_per_area_type_zipcode'].isna()])

## Lat / Lon

In [None]:
# 0/1 indicators for missing coordinates, computed before imputation.
lat_is_missing = df_val['lat'].isna()
lon_is_missing = df_val['lon'].isna()
df_val['lat_missing'] = lat_is_missing.astype(int)
df_val['lon_missing'] = lon_is_missing.astype(int)

In [None]:
# Latitude / longitude lookups, queried by province in the imputation below.
with open('intermediate_data/mean_lat.pkl', 'rb') as file:
    mean_lat = pickle.load(file)
with open('intermediate_data/mean_lon.pkl', 'rb') as file:
    mean_lon = pickle.load(file)

In [None]:
def _fill_from_province_lookup(frame, column, province_lookup):
    """Return `column` with missing entries replaced by the lookup value for the row's province.

    Known values are kept; a province absent from the lookup leaves the entry missing.
    """
    return frame.apply(
        lambda row: province_lookup.get(row['province'], row[column]) if pd.isna(row[column]) else row[column],
        axis=1,
    )


df_val['lat_imputed'] = _fill_from_province_lookup(df_val, 'lat', mean_lat)
df_val['lon_imputed'] = _fill_from_province_lookup(df_val, 'lon', mean_lon)

## Price drop flag

In [None]:
# 1 when a price drop date is present, 0 when it is missing.
df_val['price_dropped'] = df_val['price_drop_date'].notna().astype(int)
display(df_val)

## Zipcode last digits

In [None]:
# NOTE(review): assumes postcode converts to a plain 4-character string (e.g. '9000');
# a float column would yield '9000.0' - confirm the dtype.
postcode_digits = df_val["postcode"].astype(str)
df_val["postcode_last3"] = postcode_digits.str[1:]
df_val["postcode_last2"] = postcode_digits.str[2:]

ends_in_000 = df_val["postcode_last3"] == '000'
ends_in_00 = df_val["postcode_last2"] == '00'

# Flag for postcodes ending in '000', and for those ending in '00' but not in '000'.
df_val["postcode_last3_0"] = ends_in_000.astype('int')
df_val["postcode_last2_0"] = (~ends_in_000 & ends_in_00).astype('int')

## Profile again

In [None]:
# Leave out percentiles 25 and 75 for prices per NIS code because of multicollinearity
# Leave out row count because it's not available when making a single prediction and has low predictive power (it's usually just 1)

# df_val_sel = df_val[['bedrooms', 'new_building', 'foto_amount', 'province', 'subtype_regrouped', 'area_missing', 'area_imputed', 'area_rel_to_bedrooms', 'energy_value_missing', 'energy_value_imputed', 'advertiser_count', 'median_price_advertiser_bin', 'MS_TOTAL_TRANSACTIONS', 'MS_P_50_median_imputed', 'price']].drop_duplicates()

In [None]:
# Columns kept for the first model variant (province-based), target included; exact duplicate rows are removed.
selected_columns = [
    'new_building',
    'foto_amount',
    'province',
    'subtype_median_price',
    'area_missing',
    'area_imputed',
    'energy_value_missing',
    'energy_value_imputed',
    'advertiser_count',
    'median_price_advertiser_bin',
    'MS_P_50_median_imputed',
    'median_price_per_area',
    'price',
]
df_val_sel = df_val[selected_columns].drop_duplicates()

In [None]:
# Profile only the selected (engineered) columns.
profile_sel = ProfileReport(df_val_sel)

In [None]:
# Render the second profiling report inline.
profile_sel.to_notebook_iframe()

## Final variable selection and encoding

In [None]:
# Inspect the selected columns before encoding.
display(df_val_sel)

In [None]:
# df_val_sel = df_val_sel[df_val_sel['price'] < 500000]

In [None]:
# X_val_preprocessed = df_val_sel.drop('price', axis = 1)
# X_val_preprocessed = pd.get_dummies(X_val_preprocessed, columns=['subtype_regrouped', 'province'], drop_first=True)

In [None]:
# Features = selection without the target; province is one-hot encoded with the first level dropped.
# NOTE(review): the dummy columns depend on the provinces present in this data set -
# verify that they line up with the columns produced for the training data.
X_val_preprocessed = pd.get_dummies(
    df_val_sel.drop(columns='price'),
    columns=['province'],
    drop_first=True,
)

In [None]:
# Inspect the encoded feature matrix.
display(X_val_preprocessed)

In [None]:
# Target vector, taken from the same de-duplicated selection as the features.
y_val_preprocessed = df_val_sel['price']

In [None]:
# Split the preprocessed data 50/50 into a calibration part and a validation part
# (random_state fixes the shuffle). X_val_preprocessed / y_val_preprocessed are rebound
# to the validation half, so re-running this cell on its own would split that half again.
X_calib_preprocessed, X_val_preprocessed, y_calib_preprocessed, y_val_preprocessed = train_test_split(X_val_preprocessed, y_val_preprocessed, test_size=.5, random_state=16)

In [None]:
# Size of the calibration features.
X_calib_preprocessed.shape

In [None]:
# Size of the calibration target.
y_calib_preprocessed.shape

In [None]:
# Size of the validation features after the split.
X_val_preprocessed.shape

In [None]:
# Size of the validation target after the split.
y_val_preprocessed.shape

In [None]:
# Persist the calibration half ...
X_calib_preprocessed.to_csv("split_data/calib_features_preprocessed.csv", index=False)
y_calib_preprocessed.to_csv("split_data/calib_target_preprocessed.csv", index=False)

# ... and the validation half (written after the split, so the two files do not share rows).
X_val_preprocessed.to_csv("split_data/val_features_preprocessed.csv", index=False)
y_val_preprocessed.to_csv("split_data/val_target_preprocessed.csv", index=False)

## Final variable selection and encoding - lat/lon instead of province

In [None]:
# Second variant: latitude / longitude (with missing flags) instead of province; duplicates removed.
selected_columns_2 = [
    'new_building',
    'foto_amount',
    'lat_missing',
    'lat_imputed',
    'lon_missing',
    'lon_imputed',
    'subtype_median_price',
    'area_missing',
    'area_imputed',
    'energy_value_missing',
    'energy_value_imputed',
    'advertiser_count',
    'median_price_advertiser_bin',
    'MS_P_50_median_imputed',
    'median_price_per_area',
    'price',
]
df_val_sel_2 = df_val[selected_columns_2].drop_duplicates()

In [None]:
# Features of the second variant: everything except the target.
X_val_preprocessed_2 = df_val_sel_2.drop(columns='price')

In [None]:
# Target of the second variant.
y_val_preprocessed_2 = df_val_sel_2['price']

In [None]:
# 50/50 calibration / validation split with a fixed seed; the *_val_*_2 names are rebound to the validation half.
X_calib_preprocessed_2, X_val_preprocessed_2, y_calib_preprocessed_2, y_val_preprocessed_2 = train_test_split(X_val_preprocessed_2, y_val_preprocessed_2, test_size=.5, random_state=16)

In [None]:
# Persist the calibration half of the second variant ...
X_calib_preprocessed_2.to_csv("split_data/calib_features_preprocessed_2.csv", index=False)
y_calib_preprocessed_2.to_csv("split_data/calib_target_preprocessed_2.csv", index=False)

# ... and its validation half (written after the split).
X_val_preprocessed_2.to_csv("split_data/val_features_preprocessed_2.csv", index=False)
y_val_preprocessed_2.to_csv("split_data/val_target_preprocessed_2.csv", index=False)

## Final variable selection and encoding - attempt 3

In [None]:
# Third variant: price_per_area_type_region replaces the Statbel median price, price_dropped is added; duplicates removed.
selected_columns_3 = [
    'new_building',
    'foto_amount',
    'lat_missing',
    'lat_imputed',
    'lon_missing',
    'lon_imputed',
    'subtype_median_price',
    'area_missing',
    'area_imputed',
    'energy_value_missing',
    'energy_value_imputed',
    'advertiser_count',
    'median_price_advertiser_bin',
    'price_per_area_type_region',
    'median_price_per_area',
    'price_dropped',
    'price',
]
df_val_sel_3 = df_val[selected_columns_3].drop_duplicates()

In [None]:
# Features / target of the third variant.
X_val_preprocessed_3 = df_val_sel_3.drop('price', axis = 1)
y_val_preprocessed_3 = df_val_sel_3['price']

# NOTE(review): these files are written before the calibration split that follows,
# so at this point they contain all rows, including those later used for calibration.
X_val_preprocessed_3.to_csv("split_data/val_features_preprocessed_3.csv", index=False)
y_val_preprocessed_3.to_csv("split_data/val_target_preprocessed_3.csv", index=False)

In [None]:
# 50/50 calibration / validation split with a fixed seed, following the pattern of attempts 1 and 2.
# The held-out half is bound to the attempt-3 names (it was previously bound to *_3b names,
# which belong to the next section, and was never saved).
X_calib_preprocessed_3, X_val_preprocessed_3, y_calib_preprocessed_3, y_val_preprocessed_3 = train_test_split(
    X_val_preprocessed_3, y_val_preprocessed_3, test_size=.5, random_state=16
)

# Write the held-out half to the validation files so that they no longer contain the
# rows used for calibration (the files written before the split held every row).
X_val_preprocessed_3.to_csv("split_data/val_features_preprocessed_3.csv", index=False)
y_val_preprocessed_3.to_csv("split_data/val_target_preprocessed_3.csv", index=False)

In [None]:
# Persist the calibration half of the third variant.
X_calib_preprocessed_3.to_csv("split_data/calib_features_preprocessed_3.csv", index=False)
y_calib_preprocessed_3.to_csv("split_data/calib_target_preprocessed_3.csv", index=False)

## Final variable selection and encoding - attempt 3b

In [None]:
# Variant 3b: Statbel median price again, median_price_per_area_2 and the postcode flags added.
# NOTE(review): unlike the other selections, no drop_duplicates() is applied here and
# 'lon_missing' is not part of the list - confirm that both are intentional.
df_val_sel_3b = df_val[['new_building', 'foto_amount', 'lat_missing', 'lat_imputed', 'lon_imputed', 'subtype_median_price', 'area_missing', 'area_imputed', 'energy_value_missing', 'energy_value_imputed', 'advertiser_count', 'median_price_advertiser_bin', 'MS_P_50_median_imputed', 'median_price_per_area_2', 'postcode_last2_0', 'postcode_last3_0', 'price_dropped', 'price']]

In [None]:
# Features / target of variant 3b.
X_val_preprocessed_3b = df_val_sel_3b.drop('price', axis = 1)
y_val_preprocessed_3b = df_val_sel_3b['price']

# NOTE(review): these files are written before the calibration split that follows,
# so at this point they contain all rows, including those later used for calibration.
X_val_preprocessed_3b.to_csv("split_data/val_features_preprocessed_3b.csv", index=False)
y_val_preprocessed_3b.to_csv("split_data/val_target_preprocessed_3b.csv", index=False)

In [None]:
# 50/50 calibration / validation split with a fixed seed; the *_val_*_3b names are rebound to the validation half.
X_calib_preprocessed_3b, X_val_preprocessed_3b, y_calib_preprocessed_3b, y_val_preprocessed_3b = train_test_split(X_val_preprocessed_3b, y_val_preprocessed_3b, test_size=.5, random_state=16)

In [None]:
# Persist the calibration half of variant 3b ...
X_calib_preprocessed_3b.to_csv("split_data/calib_features_preprocessed_3b.csv", index=False)
y_calib_preprocessed_3b.to_csv("split_data/calib_target_preprocessed_3b.csv", index=False)

# ... and, as in attempts 1 and 2, the validation half. After the split the *_val_*_3b
# names hold only the held-out rows; overwriting the validation files here ensures they
# no longer contain the rows used for calibration.
X_val_preprocessed_3b.to_csv("split_data/val_features_preprocessed_3b.csv", index=False)
y_val_preprocessed_3b.to_csv("split_data/val_target_preprocessed_3b.csv", index=False)