In [1]:
import re
import os
import scipy.stats
import numpy as np
import pandas as pd
from tests import *
from matplotlib import pyplot as plt
from helpers import *

# Reading Data
Keeping only the announcements present in December but not in November.

In [2]:
# NOTE(review): unpickling can execute arbitrary code; only load these files
# when they come from a trusted source.
dict_of_announcements = pd.read_pickle(r'parsed_announcements_december.pkl')
dict_of_announcements_old = pd.read_pickle(r'parsed_announcements_november.pkl')
old_announcements = [val['url'] for val in dict_of_announcements_old]
# Membership tests against a set are constant-time; checking every December
# url against the list made this filter quadratic in the number of listings.
old_urls = set(old_announcements)
# Keep only the announcements whose url did not already appear in November.
new_announcements = [val for val in dict_of_announcements if val['url'] not in old_urls]
df = pd.DataFrame(new_announcements)

# Creating the dependent variable
- removing listings meant as rentals
- standardizing the prices:
    - using the same currency (EUR)
    - adding the VAT where necessary (5% for properties up to 140000€ & 19% for properties over that)

In [3]:
# Listings whose price text contains "/ lună" are removed (rentals, per the
# section notes above).
is_rental = df['price'].apply(lambda price_text: '/ lună' in price_text)
df = df[~is_rental].reset_index(drop=True)

# Convert every remaining price to EUR with the project helper.
df['eur_price'] = df['price'].apply(get_eur_price)

# Prices whose text mentions "TVA" get VAT added on top:
# 5% up to and including 140000 EUR, 19% above that.
mentions_vat = df['price'].apply(lambda price_text: 'TVA' in price_text)
df['eur_price'] = np.select(
    [
        (df['eur_price'] <= 140000) & mentions_vat,
        (df['eur_price'] > 140000) & mentions_vat,
    ],
    [
        df['eur_price'] * 1.05,
        df['eur_price'] * 1.19,
    ],
    default=df['eur_price'],
)

In [4]:
# Sanity checks on the cleaned frame. The helpers are not defined in this
# notebook (presumably they come from the star-import of `tests` — confirm);
# judging by their names they check that rentals are gone and that
# `eur_price` has no missing values.
test_rent_removed(df)
test_column_has_no_null_values(df, 'eur_price')

Tests passed!
Tests passed!


# Creating independent variables

### Renaming columns
- translating from Romanian to English
- using a pythonic convention

In [5]:
# Romanian attribute label -> English snake_case column name.
column_translations = {
    'Nr. camere': 'rooms',
    'Suprafaţă utilă': 'surface_area',
    'Compartimentare': 'partitioning_type',
    'Confort': 'comfort_type',
    'Etaj': 'floors',
    'Nr. bucătării': 'kitchens',
    'Nr. băi': 'bathrooms',
    'An construcţie': 'building_year',
    'Structură rezistenţă': 'building_structure',
    'Tip imobil': 'building_type',
    'Nr. balcoane': 'balconies',
    'Nr. locuri parcare': 'parking_spots',
    'Nr. garaje': 'garages',
}
# The two parallel lists are kept under their original names.
original_cols = list(column_translations)
new_cols = list(column_translations.values())
df.rename(columns=column_translations, inplace=True)

### Cleaning critical columns
- dropping null/invalid values
- changing to a numeric data type

In [6]:
# A listing is unusable without a room count, a surface area, a building year
# and a floor written as "x/y"; drop every row that lacks one of them.
# The isinstance guard makes a missing (NaN) floor count as invalid; the bare
# `'/' not in x` check raised TypeError on such non-string values.
has_valid_floor = df['floors'].apply(lambda x: isinstance(x, str) and '/' in x)
is_incomplete = (
    df['rooms'].isnull()
    | df['surface_area'].isnull()
    | df['building_year'].isnull()
    | ~has_valid_floor
)
df.drop(df[is_incomplete].index, inplace=True)
df['rooms'] = df['rooms'].astype(int)
# Keep the text before the first space and swap the decimal comma for a dot
# so the value parses as a float.
df['surface_area'] = df['surface_area'].apply(lambda x: x.split(' ')[0].strip().replace(',', '.')).astype(float)
df.reset_index(drop=True, inplace=True)

In [7]:
# Run the no-null check on both cleaned columns, in the original order.
for cleaned_column in ('rooms', 'surface_area'):
    test_column_has_no_null_values(df, cleaned_column)

Tests passed!
Tests passed!


### Transforming partitioning_type
The value "decomandat" is by far the most common. To keep things simple, two columns will be created:
- **partition_decomandat**
- **partition_other**

In [8]:
# Frequency of each partitioning type; dropna=False also counts missing values.
df['partitioning_type'].value_counts(dropna=False)

decomandat        668
semidecomandat    425
NaN                14
circular            3
nedecomandat        2
Name: partitioning_type, dtype: int64

In [9]:
# Two complementary 0/1 flags. A missing partitioning type is not equal to
# 'decomandat', so it lands in `partition_other`.
is_decomandat = df['partitioning_type'].eq('decomandat')
df['partition_decomandat'] = is_decomandat.astype(int)
df['partition_other'] = (~is_decomandat).astype(int)

In [10]:
# Run the no-null check on both partition flags, in the original order.
for partition_column in ('partition_decomandat', 'partition_other'):
    test_column_has_no_null_values(df, partition_column)

Tests passed!
Tests passed!


### Transforming comfort_type
Keeping the following values:
- comfort_1
- lux
- comfort_other

In [11]:
# Frequency of each comfort type; dropna=False also counts missing values.
df['comfort_type'].value_counts(dropna=False)

1      768
lux    287
NaN     38
2       17
3        2
Name: comfort_type, dtype: int64

In [12]:
# Three mutually exclusive 0/1 flags: comfort "1", "lux", and everything else
# (other comfort levels and missing values).
is_comfort_1 = df['comfort_type'].eq('1')
is_comfort_lux = df['comfort_type'].eq('lux')
df['comfort_other'] = (~(is_comfort_1 | is_comfort_lux)).astype(int)
df['comfort_1'] = is_comfort_1.astype(int)
df['comfort_lux'] = is_comfort_lux.astype(int)

In [13]:
# Run the no-null check on each comfort flag, in the original order.
for comfort_column in ('comfort_other', 'comfort_1', 'comfort_lux'):
    test_column_has_no_null_values(df, comfort_column)

Tests passed!
Tests passed!
Tests passed!


### Transforming apartment floor into dummies. Final values: first_floor, last_floor, middle_floor
- splitting the x/y values into floor & max_floor
- comparing floor with max_floor to get values for dummies
- creating dummies

In [14]:
# `floors` is "<floor>/<top floor>": the text before the first slash is the
# apartment's floor, the text after the last slash is the building's top floor.
floor_text = df['floors'].apply(lambda raw: raw.split('/')[0]).reset_index(drop=True)
max_floor_series = (
    df['floors']
    .apply(lambda raw: raw.split('/')[-1].strip())
    .astype(int)
    .reset_index(drop=True)
)


def _mentions_any(text, keywords):
    """Return True when at least one keyword occurs in the lower-cased text."""
    lowered = text.lower()
    return any([keyword in lowered for keyword in keywords])


is_ground_level = floor_text.apply(lambda text: _mentions_any(text, ['parter', 'demi']))
is_top_level = floor_text.apply(lambda text: _mentions_any(text, ['ultim', 'mansard']))

# Ground-level labels become '0' and top-level labels '99' (a stand-in that is
# later compared against the building's top floor). The ground-level rule is
# applied last so it wins when both keyword groups match.
floor_series = floor_text.mask(is_top_level, '99').mask(is_ground_level, '0')
# The first run of digits in each label is the numeric floor.
floor_series = floor_series.apply(lambda label: re.findall(r'\d+', label)[0]).astype(int)

# Floor 0 is the first floor; anything at or above the building's top floor
# is the last floor; the rest are middle floors.
df['floor'] = np.select(
    [floor_series == 0, floor_series >= max_floor_series],
    ['first_floor', 'last_floor'],
    default='middle_floor',
)
df['max_floor'] = max_floor_series

# One dummy column per entry in FLOOR_CATEGORIES, appended to the frame.
floor_dtype = pd.CategoricalDtype(categories=FLOOR_CATEGORIES)
df = pd.concat([df, pd.get_dummies(df['floor'].astype(floor_dtype))], axis=1)

In [15]:
# No-null checks first (same order as before), then the accepted-values
# check on the floor label.
for floor_column in ('floor', 'max_floor'):
    test_column_has_no_null_values(df, floor_column)
test_column_has_only_accepted_values(df, 'floor', FLOOR_CATEGORIES)

Tests passed!
Tests passed!
Tests passed!


### Transforming bathrooms
- replacing NaNs with the most common value, 1 bathroom
- changing the data type to int
- adding the ratio between the number of bathrooms & rooms as a column

In [16]:
# Frequency of each bathroom count; dropna=False also counts missing values.
df['bathrooms'].value_counts(dropna=False)

1      668
2      355
3       52
NaN     17
4       16
5        4
Name: bathrooms, dtype: int64

In [17]:
# Missing bathroom counts are filled with '1' (the most common value, per the
# counts above) before converting to int.
# The result is assigned back rather than using
# `df['bathrooms'].fillna(..., inplace=True)`: under pandas Copy-on-Write that
# chained in-place call modifies a temporary and leaves the NaNs in `df`.
df['bathrooms'] = df['bathrooms'].fillna('1').astype(int)
# Bathrooms per room.
df['bathrooms_ratio'] = df['bathrooms'] / df['rooms']

In [18]:
# Run the no-null check on both bathroom features, in the original order.
for bathroom_column in ('bathrooms', 'bathrooms_ratio'):
    test_column_has_no_null_values(df, bathroom_column)

Tests passed!
Tests passed!


### Transforming building_year
- translating categories into English
- creating **not_finished** & **not_started** categories for new buildings
- assigning each year to its category
- creating dummies

In [19]:
def _normalize_building_year(raw_year):
    """Label unfinished/planned buildings and strip the '(finalizata)' marker."""
    if 'constructie' in raw_year:
        raw_year = 'not_finished'
    if 'proiect' in raw_year:
        raw_year = 'not_started'
    return raw_year.replace('(finalizata)', '').strip()


# Normalise the raw text, then let the project helpers translate it and map
# it to a building-period category.
df['building_year'] = (
    df['building_year']
    .apply(_normalize_building_year)
    .apply(translate_building_year_values)
    .apply(get_building_year_category)
)

# One dummy column per entry in BUILDING_PERIODS, appended to the frame.
building_period_dtype = pd.CategoricalDtype(categories=BUILDING_PERIODS)
df = pd.concat([df, pd.get_dummies(df['building_year'].astype(building_period_dtype))], axis=1)

In [20]:
# Sanity checks on `building_year`. The helpers are not defined in this
# notebook (presumably from the `tests` star-import — confirm); judging by
# their names they check for missing values and that every value is one of
# BUILDING_PERIODS.
test_column_has_no_null_values(df, 'building_year')
test_column_has_only_accepted_values(df, 'building_year', BUILDING_PERIODS)

Tests passed!
Tests passed!


### Transforming building_structure
- fill missing values with distinct category
- translate the most common value into English
- unify all other categories into single category
- create dummies

In [21]:
# Frequency of each building structure; dropna=False also counts missing values.
df['building_structure'].value_counts(dropna=False)

beton       526
NaN         348
altele      188
caramida     50
Name: building_structure, dtype: int64

In [22]:
# Missing values become their own category and 'beton' is translated.
# Results are assigned back rather than calling fillna/replace with
# inplace=True on `df['building_structure']`: under pandas Copy-on-Write those
# chained in-place calls modify a temporary and leave `df` untouched.
df['building_structure'] = (
    df['building_structure']
    .fillna('unknown_building_structure')
    .replace('beton', 'concrete_building_structure')
)
# Every remaining value that is neither concrete nor unknown.
other_building_structures = [
    val for val in df['building_structure'].unique()
    if val not in ['concrete_building_structure', 'unknown_building_structure']
]
# Collapse those remaining values into a single "other" category.
df['building_structure'] = df['building_structure'].replace(
    other_building_structures, 'other_building_structure'
)
# One dummy column per entry in BUILDING_STRUCTURES, appended to the frame.
df = pd.concat([df, pd.get_dummies(df['building_structure'].astype(pd.CategoricalDtype(categories=BUILDING_STRUCTURES)))], axis=1)

In [23]:
# Sanity checks on `building_structure`. The helpers are not defined in this
# notebook (presumably from the `tests` star-import — confirm); judging by
# their names they check for missing values and that every value is one of
# BUILDING_STRUCTURES.
test_column_has_no_null_values(df, 'building_structure')
test_column_has_only_accepted_values(df, 'building_structure', BUILDING_STRUCTURES)

Tests passed!
Tests passed!


### Transforming balconies
- treat null values as 0 (no balcony listed)
- create new binary variable to signify the presence of at least one balcony

In [24]:
# Frequency of each balcony value; dropna=False also counts missing values.
df['balconies'].value_counts(dropna=False)

1              535
NaN            308
2              124
1 (închise)    102
2 (închise)     19
3               13
4                5
5                3
3 (închise)      2
11               1
Name: balconies, dtype: int64

In [25]:
# 1 when the listing states any balcony value, 0 when the field is missing.
df['has_balconies'] = df['balconies'].notna().astype(int)

In [26]:
# Sanity check on the new flag. The helper is not defined in this notebook;
# by its name it checks that the column has no missing values.
test_column_has_no_null_values(df, 'has_balconies')

Tests passed!


### Transforming parking_spots & garages
- treat null values as 0 (no parking spot or garage listed)
- create new binary variable to signify the presence of at least one parking spot or garage

In [27]:
# Frequency of each parking-spot value; dropna=False also counts missing values.
df['parking_spots'].value_counts(dropna=False)

NaN                    927
1                      139
2                       27
1 - Inclus în preț      12
2 - Incluse în preț      4
5                        1
99                       1
3                        1
Name: parking_spots, dtype: int64

In [28]:
# Frequency of each garage count; dropna=False also counts missing values.
df['garages'].value_counts(dropna=False)

NaN    1063
1        39
2         9
5         1
Name: garages, dtype: int64

In [29]:
# 1 when the listing states a value for garages or for parking spots,
# 0 when both fields are missing.
df['has_parking_spots_or_garages'] = (
    df[['garages', 'parking_spots']].notna().any(axis=1).astype(int)
)

In [30]:
# Sanity check on the new flag. The helper is not defined in this notebook;
# by its name it checks that the column has no missing values.
test_column_has_no_null_values(df, 'has_parking_spots_or_garages')

Tests passed!


### Transforming neighborhood
- creating the `area` (neighborhood) column with values taken from each url
- creating dummies

In [31]:
def _area_from_url(url):
    """Build the area label from the second-to-last '/'-separated url segment."""
    segment = url.split('/')[-2]
    return segment.replace('-', '_') + '_area'


df['area'] = df['url'].apply(_area_from_url)

# One dummy column per entry in BUCHAREST_AREAS, appended to the frame.
area_dtype = pd.CategoricalDtype(categories=BUCHAREST_AREAS)
df = pd.concat([df, pd.get_dummies(df['area'].astype(area_dtype))], axis=1)

In [32]:
# Sanity check on the new column. The helper is not defined in this notebook;
# by its name it checks that the column has no missing values.
test_column_has_no_null_values(df, 'area')

Tests passed!


### Transforming specifications
Based on the free text available with all listings, the following features can be easily added:
- the availability of floor heating
- exclusive reliance on district heating
- the availability of video surveillance in the building

In [33]:
def _spec_mentions(specs, *phrases):
    """Return True if any entry of `specs`, lower-cased, contains one of `phrases`."""
    return any([phrase in entry.lower() for entry in specs for phrase in phrases])


# Floor heating is mentioned somewhere in the specifications.
df['has_floor_heating'] = df['specifications'].apply(
    lambda specs: _spec_mentions(specs, 'incalzire prin pardoseala')
).astype(int)
# Neither 'centrala proprie' nor 'centrala imobil' is mentioned.
df['only_district_heating'] = df['specifications'].apply(
    lambda specs: not _spec_mentions(specs, 'centrala proprie', 'centrala imobil')
).astype(int)
# Video surveillance is mentioned somewhere in the specifications.
df['building_with_video_surveillance'] = df['specifications'].apply(
    lambda specs: _spec_mentions(specs, 'supraveghere video')
).astype(int)

In [34]:
# Run the no-null check on each specification flag, in the original order.
for specification_column in (
    'has_floor_heating',
    'only_district_heating',
    'building_with_video_surveillance',
):
    test_column_has_no_null_values(df, specification_column)

Tests passed!
Tests passed!
Tests passed!


In [35]:
# Columns written to the clean dataset, in output order: the price, the
# numeric features and every dummy/flag column built above.
export_columns = [
    'eur_price', 'rooms', 'surface_area', 'bathrooms',
    'partition_decomandat', 'partition_other',
    'comfort_other', 'comfort_1', 'comfort_lux',
    'max_floor', 'first_floor', 'last_floor', 'middle_floor',
    'bathrooms_ratio',
    'after_2010', 'before_1941', 'between_1941_1977',
    'between_1977_1990', 'between_1990_2000', 'between_2000_2010',
    'not_finished', 'not_started',
    'concrete_building_structure', 'other_building_structure', 'unknown_building_structure',
    'has_balconies', 'has_parking_spots_or_garages',
    'has_floor_heating', 'only_district_heating', 'building_with_video_surveillance',
    '1_mai_area', 'agronomie_area', 'aviatiei_area', 'aviatorilor_area',
    'banu_manta_area', 'chibrit_area', 'domenii_area', 'dristor_area',
    'stefan_cel_mare_area', 'titulescu_area', 'turda_area',
]
# Create the output folder first: to_csv does not create missing directories
# and would otherwise fail when 'data' does not exist yet.
os.makedirs('data', exist_ok=True)
df[export_columns].to_csv(os.path.join('data', 'clean_data_december.csv'), index=False)