In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime

from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the cleaned housing data into the working DataFrame.
input_csv = "../data/cleaned_house_data.csv"
df = pd.read_csv(input_csv)

In [3]:
# Display the loaded frame to inspect its columns and a sample of rows.
df

Unnamed: 0,energy,heating,price,area,rooms,fee,zipcode,construction_year,level,price_per_area,heating_raw,heating_clean,energy_clean
0,Öl,Zentralheizung,249000,64.00,3,3.57,13505,1971,2,3890.625000,Zentralheizung,zentralheizung,öl
1,Gas,"Fußbodenheizung, offener",1295000,136.06,4,3.57,10405,1900,5,9517.859768,"Fußbodenheizung, offener",fußbodenheizung,gas
2,Gas,Etagenheizung,770000,120.00,4,3.57,12161,1900,5,6416.666667,Etagenheizung,etagenheizung,gas
3,Gas,Zentralheizung,349000,158.00,5,3.57,12103,1956,1,2208.860759,Zentralheizung,zentralheizung,gas
4,Gas,Etagenheizung,494990,126.00,4,3.57,12157,1909,3,3928.492063,Etagenheizung,etagenheizung,gas
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4319,Fernwärme,"Fußbodenheizung,",429000,78.00,3,3.57,13595,2000,1,5500.000000,"Fußbodenheizung,",fußbodenheizung,fernwärme
4320,Gas,Zentralheizung,729000,150.00,4,3.57,13467,1990,1,4860.000000,Zentralheizung,zentralheizung,gas
4321,Gas,Etagenheizung,695600,94.00,3,3.57,10115,1900,1,7400.000000,Etagenheizung,etagenheizung,gas
4322,Fernwärme,Minergie zertifiziert,839251,107.00,3,3.57,13591,1894,1,7843.467290,Minergie zertifiziert,minergie,fernwärme


In [4]:
# House age: difference between the year in which the notebook is executed
# and each listing's construction year.
current_year = datetime.date.today().year
df = df.assign(house_age=current_year - df['construction_year'])

In [5]:
# Price per unit of area: `price` divided by `area`.
# The loaded data already carries a `price_per_area` column; this recomputes
# it from the two source columns and overwrites it in place.
df = df.assign(price_per_area=df['price'].div(df['area']))

In [6]:
# Room density: number of rooms divided by `area`.
df = df.assign(rooms_per_area=df['rooms'].div(df['area']))

In [7]:
# One-hot encode the categorical columns.
# drop_first=True omits the first category of every encoded column, so that
# category is represented by all of its dummy columns being zero.
# NOTE(review): the recorded column listing contains `level_2023`, which looks
# like a data-entry error in `level` - confirm and clean it upstream.
categorical_cols = [
    'heating_clean',
    'energy_clean',
    'level',
]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

In [8]:
# List the column names after one-hot encoding to check the generated dummy columns.
print(df.columns)

Index(['energy', 'heating', 'price', 'area', 'rooms', 'fee', 'zipcode',
       'construction_year', 'price_per_area', 'heating_raw', 'house_age',
       'rooms_per_area', 'heating_clean_fußbodenheizung',
       'heating_clean_minergie', 'heating_clean_missing',
       'heating_clean_other', 'heating_clean_wärmepumpe',
       'heating_clean_zentralheizung', 'energy_clean_gas',
       'energy_clean_missing', 'energy_clean_other', 'energy_clean_öl',
       'level_2', 'level_3', 'level_4', 'level_5', 'level_6', 'level_7',
       'level_8', 'level_9', 'level_10', 'level_11', 'level_12', 'level_13',
       'level_14', 'level_15', 'level_17', 'level_21', 'level_23', 'level_24',
       'level_2023'],
      dtype='object')


In [9]:
# Log-transform skewed numeric columns
#
# np.log1p computes log(1 + x), which is well defined at x == 0 (result 0.0).
# Zero values therefore need no masking: a fee of 0 stays a valid observation
# instead of being turned into NaN, and all three columns are treated alike.
df['log_price'] = np.log1p(df['price'])
df['log_area'] = np.log1p(df['area'])
df['log_fee'] = np.log1p(df['fee'])

In [10]:
# Binning house_age and rooms_per_area
#
# Age buckets are right-closed intervals:
#   (-inf, 10] -> new, (10, 30] -> recent, (30, 50] -> old, (50, inf) -> very_old
# The outer edges are open-ended on purpose. With finite edges [0, ..., 100]
# every house older than 100 years, and every house with age 0 (the lowest
# edge is exclusive), fell outside all bins and received NaN.
age_edges = [-np.inf, 10, 30, 50, np.inf]
age_labels = ['new', 'recent', 'old', 'very_old']
df['age_bin'] = pd.cut(df['house_age'], bins=age_edges, labels=age_labels)

# Quartile of rooms_per_area; labels=False returns the integer bin index
# instead of an interval label.
df['rooms_per_area_bin'] = pd.qcut(df['rooms_per_area'], q=4, labels=False)

In [29]:
# Frequency-encode `zipcode`: look up each row's zipcode in the table of
# relative frequencies (normalize=True yields proportions, not raw counts).
zipcode_freq = df['zipcode'].value_counts(normalize=True)
df = df.assign(zipcode_freq=df['zipcode'].map(zipcode_freq))

In [31]:
# List the final column names, including the engineered features added above.
print(df.columns)

Index(['energy', 'heating', 'price', 'area', 'rooms', 'fee', 'zipcode',
       'construction_year', 'price_per_area', 'heating_raw', 'house_age',
       'rooms_per_area', 'heating_clean_fußbodenheizung',
       'heating_clean_minergie', 'heating_clean_missing',
       'heating_clean_other', 'heating_clean_wärmepumpe',
       'heating_clean_zentralheizung', 'energy_clean_gas',
       'energy_clean_missing', 'energy_clean_other', 'energy_clean_öl',
       'level_2', 'level_3', 'level_4', 'level_5', 'level_6', 'level_7',
       'level_8', 'level_9', 'level_10', 'level_11', 'level_12', 'level_13',
       'level_14', 'level_15', 'level_17', 'level_21', 'level_23', 'level_24',
       'level_2023', 'log_price', 'log_area', 'log_fee', 'age_bin',
       'rooms_per_area_bin', 'zipcode_freq'],
      dtype='object')


In [33]:
# Persist the engineered feature table; index=False keeps the row index out of the file.
output_csv = 'FE_housing.csv'
df.to_csv(output_csv, index=False)