In [1]:
import functools

import numpy as np
import pandas as pd

In [2]:
def print_shape(func):
    """Decorate ``func`` so the shape of its first argument is printed around the call.

    The tracked object is the first positional argument, or the first keyword
    argument when the call is made purely by keyword. Its ``.shape`` is
    printed once before and once after ``func`` runs, which makes the effect
    of in-place operations visible.

    Parameters
    ----------
    func : callable
        Function whose first argument exposes a ``.shape`` attribute.

    Returns
    -------
    callable
        A wrapper with the same name/docstring as ``func`` that forwards all
        arguments and returns whatever ``func`` returns.
    """
    @functools.wraps(func)  # keep the wrapped function's __name__ / __doc__
    def wrapper(*args, **kwargs):
        # Fall back to the first keyword value only for keyword-only calls;
        # a call without any argument still fails on args[0] as it did before.
        tracked = next(iter(kwargs.values())) if (kwargs and not args) else args[0]
        print(f"Before: {tracked.shape}")
        result = func(*args, **kwargs)
        print(f"After: {tracked.shape}")
        # Hand the result back instead of silently dropping it.
        return result
    return wrapper

@print_shape
def drop_duplicates(df):
    """Remove duplicated rows from ``df`` in place.

    The frame passed in is modified directly. The value handed back is
    whatever the in-place pandas call returns (``None`` for in-place calls).
    """
    in_place_result = df.drop_duplicates(inplace=True)
    return in_place_result

In [3]:
# Load the cleaned listings CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="immo-eliza-goats/Data/cleaned/clean_set.csv")

In [4]:
# Cleaning pipeline for the loaded frame: prune columns and rows, normalise
# column names, map coded columns to labels, blank out implausible values,
# then assign ids, shuffle and reorder the columns.

# tunable thresholds and seed
MIN_PRICE = 75000                # listings at or below this price are dropped
MIN_CONSTRUCTION_YEAR = 1750     # construction years outside this range become nan
MAX_CONSTRUCTION_YEAR = 2024
RANDOM_SEED = 42                 # makes the ids and the shuffle reproducible

# drop unnecessary columns
df = df.drop(columns=["Unnamed: 0", "price_square"])

# remove duplicates
drop_duplicates(df)  # helper function with decorator around df.drop_duplicates(inplace=True)

# remove properties with no known property_type
df = df[df["property_type"].notna()]

# remove properties with no known price or a too low price
df = df[(df["price"] > MIN_PRICE) & (df["price"].notna())]

# rename a couple of columns
# (reassigning instead of inplace=True avoids mutating a filtered slice)
df = df.rename(columns={
    "floodZoneType": "fl_floodzone",
    "primaryEnergyConsumptionPerSqm": "primary_energy_consumption_sqm",
    "total_area_m2": "total_area_sqm",
    "furnished": "fl_furnished",
    "open_fire": "fl_open_fire",
    "terrace": "fl_terrace",
    "garden": "fl_garden",
    "surface_land": "surface_land_sqm",
    "swimming_pool": "fl_swimming_pool",
    "Double_Glazing": "fl_double_glazing",
    "Number_of_frontages": "nbr_frontages",
    "bedroom_count": "nbr_bedrooms"
})

# set all columns to lower case
df.columns = df.columns.str.lower()

# remap some columns to their respective categories
# The dictionaries are written label -> code and inverted below, so the
# replacement goes code -> label.
# NOTE(review): assumes the CSV stores these columns as the numeric codes - confirm.
epc_mapping = {"A++": 9, "A+": 8, "A": 7, "B": 6, "C": 5, "D": 4, "E": 3, "F": 2, "G": 1}
state_mapping = {"JUST_RENOVATED": 6, "AS_NEW": 5, "GOOD": 4,
                 "TO_BE_DONE_UP": 3, "TO_RENOVATE": 2, "TO_RESTORE": 1}
property_type = {"APARTMENT": 1, "HOUSE": 0}

df = df.replace({
    "epc": {code: label for label, code in epc_mapping.items()},
    "state_building": {code: label for label, code in state_mapping.items()},
    "property_type": {code: label for label, code in property_type.items()}
})

# replace nan with "MISSING" for object columns
# Assign the filled column back: `df[col].fillna(..., inplace=True)` works on a
# temporary object and is not guaranteed to update `df` (it has no effect under
# pandas Copy-on-Write).
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].fillna("MISSING")

# replace 0 with nan for cadastral_income
df["cadastral_income"] = df["cadastral_income"].mask(df["cadastral_income"] == 0)

# replace impossible construction year values with nan
# (`between` is inclusive on both ends, so the two boundary years are kept)
construction_year = df["construction_year"]
df["construction_year"] = construction_year.where(
    construction_year.between(MIN_CONSTRUCTION_YEAR, MAX_CONSTRUCTION_YEAR)
)

# add a random property id and resample
# A permutation gives every row a distinct id; independent random draws would
# repeat values, which defeats the purpose of an identifier.
rng = np.random.default_rng(RANDOM_SEED)
df["id"] = rng.permutation(len(df)) * 1000
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# reorder columns
cols = [
    "id",
    "price",
    "property_type",
    "subproperty_type",
    "region", "province", "locality", "zip_code", "latitude", "longitude",
    "construction_year",
    "total_area_sqm", "surface_land_sqm",
    "nbr_frontages",
    "nbr_bedrooms",
    "equipped_kitchen",
    "fl_furnished",
    "fl_open_fire",
    "fl_terrace", "terrace_sqm",
    "fl_garden", "garden_sqm",
    "fl_swimming_pool",
    "fl_floodzone",
    "state_building",
    "primary_energy_consumption_sqm", "epc", "heating_type",
    "fl_double_glazing",
    "cadastral_income"
]
df = df[cols]

Before: (85528, 29)
After: (80404, 29)
