In [1]:
import pandas as pd

In [2]:
# Load the raw input data that the cleaning steps below start from.
raw_csv_path = 'raw_data.csv'
df = pd.read_csv(raw_csv_path)

In [2]:
# The cleaned frame `dfc` is built from `df` in the next cell, so it is not
# loaded here. Reading 'cleaned-data.csv' at this point made a fresh run depend
# on the file this notebook itself writes in its last cell (FileNotFoundError
# when it does not exist yet), and the loaded frame was overwritten before
# anything read it.

In [3]:
# Remove rows that have no "price" (36 entries) and renumber the remaining rows
# from 0; drop=True discards the old index instead of keeping it as a column.
dfc = df.dropna(subset=['price'], how='all').reset_index(drop=True)

In [4]:
# Drop rows where 'living_area' is 0 (5 entries).
# .copy() makes dfc an independent frame rather than a slice of the previous
# one, so the in-place column assignments in the following cells do not raise
# SettingWithCopyWarning.
dfc = dfc[dfc["living_area"] != 0].copy()

In [5]:
# Fill missing "facade_number" values (9362) with the median facade number of
# the row's property subtype.

# Median facade number per subtype, kept as a plain {subtype: median} dict
subtype_medians = dfc.groupby("subtype_of_property")["facade_number"].median()
facade_dict = subtype_medians.to_dict()

# Look up each row's subtype median and use it only where the value is missing
fallback_facades = dfc["subtype_of_property"].map(facade_dict)
dfc["facade_number"] = dfc["facade_number"].fillna(fallback_facades)

In [6]:
# Normalise the kitchen status to three values:
# 'installed', 'not installed' and 'equipped'.
kitchen = dfc['equipped_kitchen']

# '0' and 'usa uninstalled' both mean there is no installed kitchen
no_kitchen = kitchen.isin(['0', 'usa uninstalled'])
kitchen = kitchen.mask(no_kitchen, 'not installed')

# Every remaining value other than the two "installed" states becomes 'equipped'.
# NOTE(review): missing values (NaN) also fall into this branch and end up as
# 'equipped' - confirm that is intended.
recognised = kitchen.isin(['installed', 'not installed'])
dfc['equipped_kitchen'] = kitchen.where(recognised, 'equipped')

In [7]:
# Normalise the building condition: merge near-synonyms and label missing values.
condition = dfc['building_condition']

# 'as new' and 'just renovated' are folded into 'good'
condition = condition.mask(condition.isin(['as new', 'just renovated']), 'good')

# 'to be done up' is treated the same as 'to renovate'
condition = condition.mask(condition == 'to be done up', 'to renovate')

# Rows without any condition get an explicit 'no info' label
dfc['building_condition'] = condition.fillna('no info')

In [8]:
# Remove ' unit' from the subtype of property, so 'apartment' and 'apartment unit'
# become one category, and so on
def remove_unit(subtype_of_property):
    """Return the subtype with every ' unit' occurrence removed.

    Parameters
    ----------
    subtype_of_property : str or missing value
        A property subtype label such as 'apartment unit'.

    Returns
    -------
    The label without ' unit' (e.g. 'apartment'). Non-string input such as
    NaN or None is returned unchanged, so missing subtypes do not raise a
    TypeError when this function is applied to a whole column.
    """
    if not isinstance(subtype_of_property, str):
        return subtype_of_property
    # str.replace is a no-op when ' unit' is absent, so no membership check is needed
    return subtype_of_property.replace(' unit', '')

In [9]:
# Run remove_unit over every value of the subtype column and store the result
# back in the same column
dfc['subtype_of_property'] = dfc['subtype_of_property'].map(remove_unit)

In [None]:
# Fix a small number of misattributed property types: rows whose subtype belongs
# to the "houses" group but whose 'type_of_property' is 0 are switched to 1.
# The house subtypes are matched directly on 'subtype_of_property' because the
# 'sub_property_group_encoded' column is only created in a later cell; filtering
# on it here fails with a KeyError when the notebook is run top to bottom
# (unless the raw data already carries that column).
# Keep this list in sync with the 'houses' entries of property_group_mapping.
# NOTE(review): presumably 0 = apartment and 1 = house in 'type_of_property' - confirm.
house_subtypes = ['house', 'bungalow', 'town house', 'triplex', 'duplex']
is_house_subtype = dfc['subtype_of_property'].isin(house_subtypes)
dfc.loc[is_house_subtype & (dfc['type_of_property'] == 0), 'type_of_property'] = 1

Grouping Similar Subtypes into Categories

Grouping used by the mapping in the next cell (group label in backticks):
* 1 (Luxury Properties, `luxury_properties`): 'mansion', 'castle', 'exceptional property', 'villa', 'manor house'
* 2 (Houses, `houses`): 'house', 'bungalow', 'town house', 'duplex', 'triplex'
* 3 (Apartments, `apartments`): 'apartment', 'flat studio', 'penthouse', 'ground floor', 'loft', 'kot'
* 4 (Rural, `rural`): 'country cottage', 'farmhouse', 'chalet'
* 5 (Other, `other`): 'other property', 'mixed use building', 'service flat', 'apartment block'

In [11]:
# Subtypes listed per group; each subtype is assigned its group's label
subtypes_by_group = {
    'apartments': ['apartment', 'flat studio', 'penthouse', 'ground floor', 'loft', 'kot'],
    'houses': ['house', 'bungalow', 'town house', 'triplex', 'duplex'],
    'luxury_properties': ['villa', 'manor house', 'mansion', 'castle', 'exceptional property'],
    'rural': ['country cottage', 'farmhouse', 'chalet'],
    'other': ['mixed use building', 'service flat', 'other property', 'apartment block'],
}

# Invert to the {subtype: group label} lookup used for the mapping
property_group_mapping = {
    subtype: group
    for group, subtypes in subtypes_by_group.items()
    for subtype in subtypes
}

# Label every row with the group of its subtype; subtypes that are not in the
# mapping get NaN
dfc['sub_property_group_encoded'] = dfc['subtype_of_property'].map(property_group_mapping)

In [6]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
cleaned_csv_path = 'cleaned-data.csv'
dfc.to_csv(cleaned_csv_path, index=False)