In [1]:
import ast
import glob

import numpy as np
import pandas as pd
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# Grouping scraped datasets

In [2]:
path = '../scraping/datasets_scraped'
#glob returns files in arbitrary order; sorting keeps the concatenation order
#(and therefore which duplicate row survives and the resulting index labels) reproducible
all_files = sorted(glob.glob(path + "/*.csv"))

#reading every scraped csv into its own dataframe
frames = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

#concatenating all datasets and removing duplicate rows (there might be duplicate posts)
df = pd.concat(frames, axis=0, ignore_index=True).drop_duplicates()

df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,,6.0,
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"['Piscina', 'Elevador', 'Academia', 'Churrasqu...",3.5,
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"['Piscina', 'Elevador', 'Academia', 'Condomíni...",5.0,R$ 1.232
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"['Piscina', 'Elevador', 'Academia', 'Playgroun...",3.2,
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"['Elevador', 'Ar-condicionado', 'Salão de fest...",6.0,


In [3]:
df.shape

(3689, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3689 entries, 0 to 15950
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3689 non-null   object 
 1   address         3689 non-null   object 
 2   area            3689 non-null   int64  
 3   bathrooms       3689 non-null   object 
 4   bedrooms        3689 non-null   int64  
 5   parking_spots   3689 non-null   object 
 6   extra_contents  3130 non-null   object 
 7   rent            3689 non-null   float64
 8   fee             1130 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 288.2+ KB


# Data cleaning

In [5]:
#checking null values in columns
df.isna().sum()

title                0
address              0
area                 0
bathrooms            0
bedrooms             0
parking_spots        0
extra_contents     559
rent                 0
fee               2559
dtype: int64

Here we'll have two different approaches.

- For the *extra_contents* column we'll fill the null values with an empty list. That way, in the EDA, we'll be able to turn those lists into columns.

- For the *fee* column we'll fill the NAs with 0, since most of the posts that do not contain a fee description also say that the value is already included in the rent.

In [6]:
#removing currency symbols and thousands separators, filling NAs with 0 and turning column into integer
#regex=False keeps '.' and '$' literal (as regular expressions they mean "any character" and "end of string")
df['fee'] = (
    df['fee']
    .str.replace('.', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.replace('R', '', regex=False)
    .str.strip()
    .astype(float)
    .fillna(0)
    .astype(int)
)

#turning into integers columns and replacing the '--' placeholder with 0
#casting to str first so values that were already parsed as numbers are not lost by the .str accessor
for column in ['bathrooms', 'parking_spots']:
    df[column] = df[column].astype(str).str.replace('--', '0', regex=False).astype(int)

#filling nas with an empty list and turning string list into list
#ast.literal_eval only parses Python literals, so scraped text can never execute code (unlike eval)
#assigning the result back (instead of fillna(inplace=True) on the attribute) guarantees the column is updated
df['extra_contents'] = df['extra_contents'].fillna('[]').apply(ast.literal_eval)

#correcting the decimal places in the rent column and turning it into integer
#rounding before the cast avoids truncating float artefacts (e.g. 1.001 * 1000 == 1000.9999999999999)
df['rent'] = (df['rent'] * 1000).round().astype(int)

  


In [7]:
df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,[],6000,0
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"[Piscina, Elevador, Academia, Churrasqueira, P...",3500,0
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"[Piscina, Elevador, Academia, Condomínio fecha...",5000,1232
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"[Piscina, Elevador, Academia, Playground, Salã...",3200,0
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"[Elevador, Ar-condicionado, Salão de festas, A...",6000,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3689 entries, 0 to 15950
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           3689 non-null   object
 1   address         3689 non-null   object
 2   area            3689 non-null   int64 
 3   bathrooms       3689 non-null   int32 
 4   bedrooms        3689 non-null   int64 
 5   parking_spots   3689 non-null   int32 
 6   extra_contents  3689 non-null   object
 7   rent            3689 non-null   int32 
 8   fee             3689 non-null   int32 
dtypes: int32(4), int64(2), object(3)
memory usage: 230.6+ KB


In [9]:
#checking if all nulls were removed
df.isna().sum()

title             0
address           0
area              0
bathrooms         0
bedrooms          0
parking_spots     0
extra_contents    0
rent              0
fee               0
dtype: int64

# Feature engineering

In [10]:
#the first word of the title tells whether the property is a house or an apartment
df['property_type'] = df['title'].map(lambda title: title.split()[0].strip().lower())

#translating the property types from Brazilian Portuguese to English
property_type_translation = {'apartamento': 'apartment', 'casa': 'house'}
df['property_type'] = df['property_type'].replace(property_type_translation)

In [11]:
#creating geopy objects and function to avoid timeout

locator = Nominatim(user_agent = 'myGeocoder')
#rate-limited wrapper: waits at least 1 second between consecutive requests.
#max_retries=0 and swallow_exceptions=False make timeouts propagate to do_geocode,
#which owns the retry logic (by default RateLimiter would retry and then return None silently)
geocode = RateLimiter(locator.geocode, min_delay_seconds=1, max_retries=0, swallow_exceptions=False)

#adapted from https://gis.stackexchange.com/questions/173569/avoid-time-out-error-nominatim-geopy-openstreetmap
def do_geocode(address, attempt=1, max_attempts=5):
    """Geocode an address through the rate-limited geocoder, retrying on timeouts.

    Parameters
    ----------
    address : str
        Address passed to the geocoder.
    attempt : int, default 1
        Number of the first attempt; retries count up from this value.
    max_attempts : int, default 5
        Highest attempt number that is tried before giving up.

    Returns
    -------
    The value returned by the geocoder for the address (requested with
    addressdetails=True); the notebook later filters NA results, i.e.
    addresses the geocoder could not resolve.

    Raises
    ------
    GeocoderTimedOut
        If the request still times out on the last allowed attempt.
    """
    while True:
        try:
            #going through `geocode` (not locator.geocode) so the 1 second delay is actually enforced
            return geocode(address, addressdetails=True)
        except GeocoderTimedOut:
            if attempt >= max_attempts:
                raise
            attempt += 1

In [12]:
#getting location data from geocode
#every request is a slow network call, so each distinct address is geocoded only once
#and listings that share an address reuse the same result
locations_by_address = {address: do_geocode(address) for address in df['address'].unique()}
df['location'] = df['address'].map(locations_by_address.get)

In [13]:
df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,property_type,location
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,[],6000,0,apartment,"(215, Rua dos Navegantes, Boa Viagem, Recife, ..."
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"[Piscina, Elevador, Academia, Churrasqueira, P...",3500,0,apartment,"(260, Rua Baltazar Passos, Boa Viagem, Recife,..."
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"[Piscina, Elevador, Academia, Condomínio fecha...",5000,1232,apartment,"(5822, Avenida Boa Viagem, Boa Viagem, Recife,..."
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"[Piscina, Elevador, Academia, Playground, Salã...",3200,0,apartment,"(Tamarineira, Recife, Região Geográfica Imedia..."
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"[Elevador, Ar-condicionado, Salão de festas, A...",6000,0,apartment,"(278, Rua Setúbal, Boa Viagem, Recife, Região ..."


In [14]:
df.isna().sum()

title               0
address             0
area                0
bathrooms           0
bedrooms            0
parking_spots       0
extra_contents      0
rent                0
fee                 0
property_type       0
location          188
dtype: int64

A few posts could not have their location identified by geopy. Since we'll need latitude and longitude for our future model, we'll remove these posts.

In [15]:
#filtering out NAs in location column
#.copy() gives an independent dataframe, so the columns assigned in the following cells
#are not written to a slice of the old one (which raises SettingWithCopyWarning)
df = df[df['location'].notna()].copy()

In [16]:
df.shape

(3501, 11)

In [17]:
#function to safely read one field of the geocoded address stored in the location column
def get_key(x, key):
    """Return x.raw['address'][key], or 'unknown' when that lookup raises KeyError.

    x is any object exposing a `raw` mapping (the notebook passes the values
    of the location column); key is the name of the address field to read.
    """
    try:
        return x.raw['address'][key]
    except KeyError:
        return 'unknown'

In [18]:
#getting neighborhood and actual city from the geocoded address
for column, address_key in [('neighborhood', 'suburb'), ('city', 'city')]:
    df[column] = df['location'].apply(get_key, args=(address_key,))

#getting latitude and longitude (the point also carries an altitude, discarded below)
df['point'] = df['location'].map(lambda location: tuple(location.point) if location else None)
coordinates = pd.DataFrame(df['point'].tolist(), index=df.index)
df[['latitude', 'longitude', 'altitude']] = coordinates

#dropping columns that are not needed anymore
unnecessary_columns = ['location', 'point', 'altitude', 'title', 'address']
df = df.drop(columns=unnecessary_columns)



Note that this project is built using Brazilian real estate market data, which means that "neighborhood" is an important piece of information when it comes to addresses. We also took advantage of the geocoding to get the "neighborhood" column, since our "address" column had some imperfections that would have made it harder to extract this variable directly.

We'll remove cities that are different from Recife, which is the main city for this project. By doing that we'll also be removing neighborhoods classified as "unknown", since those are neighborhoods from other cities wrongly classified as Recife's.

In [28]:
#checking "unknown" neighborhoods
df[df.neighborhood == 'unknown']

Unnamed: 0,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,property_type,neighborhood,city,latitude,longitude


In [23]:
#checking cities in dataset
df.city.unique()

array(['Recife', 'unknown', 'Cabo de Santo Agostinho',
       'Jaboatão dos Guararapes', 'Vitória de Santo Antão', 'Olinda',
       'Igarassu'], dtype=object)

In [24]:
#filtering only Recife
#.copy() gives an independent dataframe, so the in-place column drop done in the next cell
#does not operate on a slice of the unfiltered one
df = df[df['city'] == 'Recife'].copy()
df.city.unique()

array(['Recife'], dtype=object)

In [30]:
#every remaining row is from Recife, so the city column carries no information anymore
df = df.drop(columns='city')

In [None]:
#checking if there are still "unknown" neighborhoods
df[df.neighborhood == 'unknown']

In [31]:
df.shape

(3490, 11)

In [33]:
df.head()

Unnamed: 0,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,property_type,neighborhood,latitude,longitude
0,228,4,3,1,[],6000,0,apartment,Boa Viagem,-8.130101,-34.900182
1,112,4,4,2,"[Piscina, Elevador, Academia, Churrasqueira, P...",3500,0,apartment,Boa Viagem,-8.143875,-34.907153
2,160,5,4,3,"[Piscina, Elevador, Academia, Condomínio fecha...",5000,1232,apartment,Boa Viagem,-8.139654,-34.903205
3,75,3,3,2,"[Piscina, Elevador, Academia, Playground, Salã...",3200,0,apartment,Tamarineira,-8.032416,-34.901019
4,200,5,5,3,"[Elevador, Ar-condicionado, Salão de festas, A...",6000,0,apartment,Boa Viagem,-8.13353,-34.902099


In [34]:
#taking values inside lists on "extra_contents" and turning into dummy variables


#code found on https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
def to_1D(series):
    """Flatten a Series of lists into one Series holding every item, in order (repeats included)."""
    flattened = []
    for items in series:
        flattened.extend(items)
    return pd.Series(flattened)

def boolean_df(item_lists, unique_items):
    """Build a boolean dataframe telling which items each list contains.

    Parameters
    ----------
    item_lists : pd.Series
        Series whose values are lists (or other containers) of items.
    unique_items : iterable
        Items to create a column for. Repeated items are tolerated: each one
        yields a single column, placed where the item first appears.

    Returns
    -------
    pd.DataFrame
        One column per distinct item, indexed like `item_lists`; a cell is
        True when the item is present in that row's list.
    """
    # dict.fromkeys drops repeated items while keeping first-seen order, so
    # the membership mask of each item is computed only once
    distinct_items = dict.fromkeys(unique_items)

    bool_dict = {
        item: item_lists.apply(lambda items, item=item: item in items)
        for item in distinct_items
    }

    # Return the results as a dataframe
    return pd.DataFrame(bool_dict)


#creating dummy columns for each variable inside the extra_contents column
#.unique() keeps one entry per item (in order of first appearance), so each dummy column is computed only once
unique_extra_contents = to_1D(df['extra_contents']).unique()
extra_contents_df = boolean_df(df['extra_contents'], unique_extra_contents).astype(int)

#using pandas concat to add the new columns to our dataset and removing the original list column
df = pd.concat([df, extra_contents_df], axis=1).drop(columns=['extra_contents'])

In [35]:
df.head()

Unnamed: 0,area,bathrooms,bedrooms,parking_spots,rent,fee,property_type,neighborhood,latitude,longitude,...,Sala de jantar,Janela de alumínio,Piscina para adulto,Bicicletário,Estacionamento,Acesso para deficientes,Escada,Rampa,Ventilação natural,Imóvel de esquina
0,228,4,3,1,6000,0,apartment,Boa Viagem,-8.130101,-34.900182,...,0,0,0,0,0,0,0,0,0,0
1,112,4,4,2,3500,0,apartment,Boa Viagem,-8.143875,-34.907153,...,0,0,0,0,0,0,0,0,0,0
2,160,5,4,3,5000,1232,apartment,Boa Viagem,-8.139654,-34.903205,...,0,0,0,0,0,0,0,0,0,0
3,75,3,3,2,3200,0,apartment,Tamarineira,-8.032416,-34.901019,...,0,0,0,0,0,0,0,0,0,0
4,200,5,5,3,6000,0,apartment,Boa Viagem,-8.13353,-34.902099,...,0,0,0,0,0,0,0,0,0,0


In [36]:
df.columns

Index(['area', 'bathrooms', 'bedrooms', 'parking_spots', 'rent', 'fee',
       'property_type', 'neighborhood', 'latitude', 'longitude', 'Piscina',
       'Elevador', 'Academia', 'Churrasqueira', 'Playground',
       'Condomínio fechado', 'Salão de festas', 'Ar-condicionado',
       'Aceita animais', 'Jardim', 'Mobiliado', 'Varanda', 'Cozinha americana',
       'Portaria 24h', 'Garagem', 'Cozinha', 'Área de serviço',
       'Quadra poliesportiva', 'Armário na cozinha', 'Interfone',
       'Circuito de segurança', 'Mais de um andar', 'Segurança 24h',
       'Vista para o mar', 'Quintal', 'Sauna', 'TV a cabo',
       'Conexão à internet', 'Espaço verde / Parque', 'Salão de jogos',
       'Vigia', 'Espaço gourmet', 'Lavanderia', 'Sistema de alarme',
       'Gerador elétrico', 'Varanda gourmet', 'Depósito', 'Armário embutido',
       'Recepção', 'Copa', 'Vista exterior', 'Móvel planejado',
       'Perto de vias de acesso', 'Cinema', 'Escritório', 'Quadra de tênis',
       'Closet', 'Pista 

We've now added new dummy columns to our dataset using "extra_contents" column content in order to better understand the data.

In [37]:
#adding the fee to the rent, then discarding the fee column
df = df.assign(rent=df['rent'] + df['fee']).drop(columns='fee')

In most cases where the fee column is 0, the fee is already included in the rent column. That's why we've summed both columns.

In [38]:
#columns that come from the original dataset (everything except rent and the new dummy columns)
original_columns = [
    'property_type',
    'area',
    'bathrooms',
    'bedrooms',
    'parking_spots',
    'neighborhood',
    'latitude',
    'longitude',
]

Now we'll check the correlation of the new dummy columns with the dependent variable (rent) and try to reduce the number of columns that we'll use.

In [39]:
#absolute correlation of rent with every non-original column, strongest first
is_new_column = ~df.columns.isin(original_columns)
new_columns_corr = (
    df.loc[:, is_new_column]
    .corr()['rent']
    .abs()
    .sort_values(ascending=False)
)
new_columns_corr

rent                  1.000000
Cozinha               0.247878
Piscina               0.197260
Área de serviço       0.180409
Elevador              0.165110
                        ...   
Escada                0.003155
Salão de jogos        0.003136
Rampa                 0.002960
Ventilação natural    0.002960
Imóvel de esquina     0.002960
Name: rent, Length: 66, dtype: float64

In [40]:
print('Number of new columns:', len(new_columns_corr) - 1)

Number of new columns: 65


In [41]:
#Filtering the ones that have correlation above 0.15
new_columns_corr[new_columns_corr > 0.15]


rent               1.000000
Cozinha            0.247878
Piscina            0.197260
Área de serviço    0.180409
Elevador           0.165110
Name: rent, dtype: float64

In [42]:
#names of the columns whose absolute correlation with rent is above 0.15 (rent itself included)
selected_new_columns = new_columns_corr.loc[new_columns_corr > 0.15].index.tolist()

Now we have 4 new columns that seem to have the best correlation with our dependent variable (rent) out of 65 columns. By doing this we keep our future model from having too many features, which would increase the chance of overfitting.

In [43]:
#original columns first, followed by rent and the selected dummy columns
selected_columns = [*original_columns, *selected_new_columns]

#keeping only the chosen columns
df = df.loc[:, selected_columns]
df

Unnamed: 0,property_type,area,bathrooms,bedrooms,parking_spots,neighborhood,latitude,longitude,rent,Cozinha,Piscina,Área de serviço,Elevador
0,apartment,228,4,3,1,Boa Viagem,-8.130101,-34.900182,6000,0,0,0,0
1,apartment,112,4,4,2,Boa Viagem,-8.143875,-34.907153,3500,0,1,0,1
2,apartment,160,5,4,3,Boa Viagem,-8.139654,-34.903205,6232,0,1,0,1
3,apartment,75,3,3,2,Tamarineira,-8.032416,-34.901019,3200,0,1,0,1
4,apartment,200,5,5,3,Boa Viagem,-8.133530,-34.902099,6000,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15229,house,272,3,3,4,Espinheiro,-8.041593,-34.887360,4000,1,0,1,0
15764,house,189,1,4,2,Boa Viagem,-8.138110,-34.913419,12000,1,1,0,0
15795,house,480,4,6,5,Graças,-8.038453,-34.900730,12000,0,0,0,0
15946,apartment,40,1,1,1,Boa Viagem,-8.110973,-34.893948,2120,0,1,0,1


Since this is a project written in English, we'll rename the columns from Brazilian Portuguese to English.

In [45]:
#current columns
df.columns

Index(['property_type', 'area', 'bathrooms', 'bedrooms', 'parking_spots',
       'neighborhood', 'latitude', 'longitude', 'rent', 'Cozinha', 'Piscina',
       'Área de serviço', 'Elevador'],
      dtype='object')

In [46]:
#renaming columns
#renaming by name (instead of assigning a positional list to df.columns) keeps every label attached
#to the right column even if the correlation ranking, and therefore the column order, changes
column_translation = {
    'Cozinha': 'kitchen',
    'Piscina': 'pool',
    'Área de serviço': 'service_area',
    'Elevador': 'elevator',
}
df = df.rename(columns=column_translation)

In [47]:
df.head()

Unnamed: 0,property_type,area,bathrooms,bedrooms,parking_spots,neighborhood,latitude,longitude,rent,kitchen,pool,service_area,elevator
0,apartment,228,4,3,1,Boa Viagem,-8.130101,-34.900182,6000,0,0,0,0
1,apartment,112,4,4,2,Boa Viagem,-8.143875,-34.907153,3500,0,1,0,1
2,apartment,160,5,4,3,Boa Viagem,-8.139654,-34.903205,6232,0,1,0,1
3,apartment,75,3,3,2,Tamarineira,-8.032416,-34.901019,3200,0,1,0,1
4,apartment,200,5,5,3,Boa Viagem,-8.13353,-34.902099,6000,0,0,0,1
