In [None]:
import pandas as pd
import pickle
from ydata_profiling import ProfileReport

In [None]:
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [None]:
# Load the training split: features and target are stored in separate CSV files.
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ("split_data/train_features.csv", "split_data/train_target.csv")
)

In [None]:
# Put features and target side by side (column-wise) in one frame.
df_train = pd.concat((X_train, y_train), axis="columns")

In [None]:
# df_train['row_prop_missing'] = df_train.isna().mean(axis=1) # Not ok to already calculate here - some are missing by design (e.g. sticker)

In [None]:
# Build a ydata-profiling report for the combined training frame (features + target).
profile = ProfileReport(df_train)

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()

Observations / remarks:
- id: make sure not to include
- is_appartment: multicollinearity with subtype -> don't include both
- area: high correlation with price! impute missing values based on median per subtype, add column to indicate original was missing
- added_time: number of minutes / seconds since the property was added? Can be relevant (very expensive properties might take longer before being sold)
- bedrooms: high correlation with area -> price!
- new_building: no remarks
- postcode: no remarks
- lat / lon: impute missings? calculate distance to railway track, airport, highway (noise)?
- advertiser: impute missings or treat as separate category? > 2000 unique values: reduce? 
- foto_amount: no remarks
- is_promoted: constant -> ignore
- subtype: limited number of missings, can be partly imputed based on is_appartment
- sticker (new / price drop): only price drop could be relevant for price? recode missing values to third category 'no sticker'? ! perfect correlation with energy_value?
- price drop date: relevance?
- energy_value: many missings! can be partly imputed using new_building and/or energy_label? note: a missing energy value usually means it's a bad one
- energy_label: many missings! multicollinearity with energy_value - value is more precise 
- province: maybe cross-check with postal code (data quality check); use Statbel average price per house / apartment data?
- price: maximum 999999?
- Outliers: Treat or not? DT-based methods can handle them

## Rows with many NAs

In [None]:
# display(df_train[df_train['row_prop_missing'] > .10])

## Subtype

Observations / remarks:
- Missing values are partly houses and partly apartments -> can be assigned accordingly
- There are synonyms -> can be grouped together
- 'Andere' is never an apartment
- Group infrequent levels together? Reduce dimensionality (but: not really necessary for DT-based approach?)

In [None]:
# Listing counts per subtype, split by the apartment flag.
pd.crosstab(
    index=df_train['subtype'],
    columns=df_train['is_appartment'],
    dropna=False,
)

In [None]:
# Subtypes that occur fewer than `min_subtype_count` times are folded into 'Andere'.
min_subtype_count = 10
subtype_counts = df_train['subtype'].value_counts()
map_infrequent_subtypes = subtype_counts.loc[subtype_counts.lt(min_subtype_count)].index
df_train['subtype_regrouped'] = df_train['subtype'].mask(
    df_train['subtype'].isin(map_infrequent_subtypes), 'Andere'
)

In [None]:
# Synonymous subtype labels and the label each one is merged into.
map_synonyms = {
    'Assistentie-appartement': 'Serviceflat',
    'Villa-landhuis': 'Villa',
    'Moderne villa': 'Villa',
    'Eengezinswoning': 'Woning',
    'Herenwoning': 'Herenhuis',
    'Dakappartement': 'Penthouse',
    'Studio met slaaphoek': 'Studio',
}

# Merge synonyms on the raw labels *before* collapsing rare levels.
# If rare levels are collapsed first, a synonym with fewer than
# `min_subtype_count` rows has already become 'Andere' and can no longer be
# mapped to its target (e.g. 'Studio met slaaphoek' -> 'Studio'). Merging first
# also lets the merged counts decide what is rare.
# `min_subtype_count` is the threshold defined in the preceding cell.
# NOTE(review): `map_infrequent_subtypes` from the preceding cell is still based
# on the raw labels; check any later code that reuses it.
subtype_merged = df_train['subtype'].replace(map_synonyms)
merged_subtype_counts = subtype_merged.value_counts()
infrequent_merged_subtypes = merged_subtype_counts.loc[
    merged_subtype_counts.lt(min_subtype_count)
].index
df_train['subtype_regrouped'] = subtype_merged.mask(
    subtype_merged.isin(infrequent_merged_subtypes), 'Andere'
)

df_train['subtype_regrouped'].value_counts()

## Area

Impute area with median value for subtype (regrouped) and province

In [None]:
# 0/1 flag recording which rows had no area before imputation.
df_train['area_missing'] = pd.isna(df_train['area']).astype(int)

In [None]:
# Median area per (regrouped subtype, province) combination.
median_area = (
    df_train
    .groupby(['subtype_regrouped', 'province'])['area']
    .agg('median')
)

# Save the lookup table to a pickle file.
median_area.to_pickle('intermediate_data/median_area.pkl')

In [None]:
# Fill missing areas with the median of the row's (subtype_regrouped, province)
# group, looked up in `median_area`. A single left join replaces the row-wise
# `apply`, which called a Python lambda for every row.
# Rows whose group is absent from `median_area`, or whose group median is
# itself NaN, stay NaN, as before.
area_group_median = df_train[['subtype_regrouped', 'province']].join(
    median_area.rename('area_group_median'),
    on=['subtype_regrouped', 'province'],
)['area_group_median']
df_train['area_imputed'] = df_train['area'].fillna(area_group_median)

In [None]:
# df_train['area_imputed'] = df_train['area'].fillna(df_train.groupby(['subtype_regrouped', 'province'])['area'].transform('median'))

In [None]:
# Show the first ten rows, including the newly added area columns.
display(df_train.head(10))

## Energy value

In [None]:
# Frequency of each energy label; missing labels are counted too (dropna=False).
df_train['energy_label'].value_counts(dropna = False)

New buildings with energy label d, e, f, g -> probably a data quality (DQ) issue?

In [None]:
# Energy label counts, split by the new-building flag.
pd.crosstab(
    index=df_train['energy_label'],
    columns=df_train['new_building'],
    dropna=False,
)

Not ok: normally label A corresponds to values 0-100, B to 101-200, etc.

In [None]:
# Labels that keep their '+' suffix instead of being reduced to one letter.
# The original condition tested `x != 'a+'` twice; the second test is assumed
# to have been meant for 'a++'.
# NOTE(review): confirm that 'a++' occurs in the data. If it does not, the
# result is identical to the previous behaviour.
ENERGY_LABELS_KEPT_AS_IS = {'a+', 'a++'}


def regroup_energy_label(label):
    """Reduce an energy label to its first character.

    Non-string values (such as NaN) and the labels in
    ``ENERGY_LABELS_KEPT_AS_IS`` are returned unchanged.
    """
    if not isinstance(label, str) or label in ENERGY_LABELS_KEPT_AS_IS:
        return label
    # Slicing, unlike label[0], does not raise on an empty string.
    return label[:1]


df_train['energy_label_regrouped'] = df_train['energy_label'].apply(regroup_energy_label)

In [None]:
# Median energy value per (regrouped energy label, new-building flag) combination.
median_energy = (
    df_train
    .groupby(['energy_label_regrouped', 'new_building'])['energy_value']
    .agg('median')
)

# Save the lookup table to a pickle file.
median_energy.to_pickle('intermediate_data/median_energy.pkl')

In [None]:
# Inspect the per-group medians that are used to impute energy_value.
display(median_energy)

In [None]:
# Rows for which no energy value was recorded.
df_train.loc[df_train['energy_value'].isna()]

In [None]:
# Among rows without an energy value: regrouped label versus new-building flag.
rows_without_energy_value = df_train.loc[df_train['energy_value'].isna()]
pd.crosstab(
    rows_without_energy_value['energy_label_regrouped'],
    rows_without_energy_value['new_building'],
    dropna=False,
)

In [None]:
# 0/1 flag recording which rows had no energy value before imputation.
df_train['energy_value_missing'] = pd.isna(df_train['energy_value']).astype(int)

In [None]:
# Fill missing energy values with the median of the row's
# (energy_label_regrouped, new_building) group, looked up in `median_energy`.
# A single left join replaces the row-wise `apply`, which called a Python
# lambda for every row.
# Rows whose group is absent from `median_energy`, or whose group median is
# itself NaN, stay NaN, as before.
energy_group_median = df_train[['energy_label_regrouped', 'new_building']].join(
    median_energy.rename('energy_group_median'),
    on=['energy_label_regrouped', 'new_building'],
)['energy_group_median']
df_train['energy_value_imputed'] = df_train['energy_value'].fillna(energy_group_median)

In [None]:
# Distribution of the raw energy values, NaN included, for comparison with the imputed column.
df_train['energy_value'].value_counts(dropna = False)

In [None]:
# Distribution after imputation; any remaining NaN count shows rows that could not be imputed.
df_train['energy_value_imputed'].value_counts(dropna = False)

## Advertiser

High number of categories - leave out at first? 

Note: some advertisers occur only once and seem to have a person's name (not a real estate agency) -> also informative

In [None]:
# The 50 most frequent advertisers by listing count (missing advertisers are counted too).
# TODO: include these counts as a feature (proxy of how big the agency is).
df_train['advertiser'].value_counts(dropna = False).head(50)

In [None]:
# Advertisers with fewer than `min_advertiser_count` listings, and missing
# advertisers, are folded into 'Andere'.
min_advertiser_count = 10
advertiser_counts = df_train['advertiser'].value_counts()
map_infrequent_advertisers = advertiser_counts.loc[
    advertiser_counts.lt(min_advertiser_count)
].index
is_rare_or_missing = (
    df_train['advertiser'].isin(map_infrequent_advertisers)
    | df_train['advertiser'].isna()
)
df_train['advertiser_regrouped'] = df_train['advertiser'].mask(is_rare_or_missing, 'Andere')

In [None]:
# Listing count per regrouped advertiser level.
df_train['advertiser_regrouped'].value_counts(dropna = False)

In [None]:
# Number of distinct advertisers before regrouping.
df_train['advertiser'].nunique()

In [None]:
# Number of distinct advertiser levels after regrouping.
df_train['advertiser_regrouped'].nunique()

## Regional prices

In [None]:
# External lookup table of cities; its columns are renamed and joined on zip code further down.
df_map_nis = pd.read_csv("external_data/cities.csv")

In [None]:
# Inspect the raw lookup table before renaming its columns.
display(df_map_nis)

In [None]:
# Drop the `main` column and rename the rest so they do not clash with the
# training frame's own `province` column.
nis_column_names = {
    'name': 'municipality',
    'province': 'province_nis',
    'zipCode': 'zip_code',
    'nisCode': 'nis_code',
}
df_map_nis_renamed = df_map_nis.drop(columns='main').rename(columns=nis_column_names)

In [None]:
# Attach municipality / NIS information to every listing via its postcode.
# The lookup must hold at most one row per zip code. Otherwise the left merge
# would emit one copy of a listing per matching lookup row and silently
# inflate the training set.
# NOTE(review): cities.csv presumably lists several names per zip code (it
# ships a `main` column) - confirm, and decide whether the first row per zip
# code is the right one to keep.
nis_lookup = df_map_nis_renamed.drop_duplicates(subset='zip_code', keep='first')

n_rows_before_merge = len(df_train)
df_train = pd.merge(
    df_train,
    nis_lookup,
    how='left',
    left_on='postcode',
    right_on='zip_code',
    validate='many_to_one',  # raises if the lookup still has duplicate zip codes
)
assert len(df_train) == n_rows_before_merge, "NIS merge changed the number of training rows"

In [None]:
# Inspect the training frame after the NIS lookup columns were merged in.
display(df_train)

In [None]:
# Compare the listing's own province with the province from the NIS lookup.
pd.crosstab(
    index=df_train['province'],
    columns=df_train['province_nis'],
    dropna=False,
)

In [None]:
# External Excel workbook read into `df_statbel_houseprices`.
# NOTE(review): presumably the Statbel real-estate price data referred to in the remarks - confirm.
df_statbel_houseprices = pd.read_excel("external_data/vastgoed_2010_9999.xlsx")