In [34]:
import pandas as pd
import numpy as np
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import glob

In [35]:
path = '../scraping/datasets_scraped'
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    frame = pd.read_csv(filename, index_col=None, header=0)
    li.append(frame)

#concatenating all datasets and removing duplicate rows (there might be duplicate posts)
df = pd.concat(li, axis=0, ignore_index=True)
df.drop_duplicates(inplace = True)

df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228.0,4,3.0,1,,6.0,
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112.0,4,4.0,2,"['Piscina', 'Elevador', 'Academia', 'Churrasqu...",3.5,
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160.0,5,4.0,3,"['Piscina', 'Elevador', 'Academia', 'Condomíni...",5.0,R$ 1.232
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75.0,3,3.0,2,"['Piscina', 'Elevador', 'Academia', 'Playgroun...",3.2,
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200.0,5,5.0,3,"['Elevador', 'Ar-condicionado', 'Salão de fest...",6.0,


In [36]:
df.shape

(6083, 9)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6083 entries, 0 to 6350
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6079 non-null   object 
 1   address         6079 non-null   object 
 2   area            6079 non-null   float64
 3   bathrooms       6079 non-null   object 
 4   bedrooms        6079 non-null   float64
 5   parking_spots   6079 non-null   object 
 6   extra_contents  5250 non-null   object 
 7   rent            6079 non-null   float64
 8   fee             1940 non-null   object 
dtypes: float64(3), object(6)
memory usage: 475.2+ KB


# Data cleaning

In [None]:
#checking null values in columns
df.isna().sum()

Here we'll have two different approaches.

- For the *extra_contents* column we've already filled the null values with a empty list. That way in the EDA we'll be able to turn those lists into columns. 

- For the *fee* column we'll fill the NAs with 0, most of the posts that do not contain a fee description also say that the value is already contained in rent.

In [None]:
#removing currency simbols, filling NAs with 0 and turning column into integer
df['fee'] = df['fee'].str.replace('.','').str.replace('$','').str.replace('R','').str.strip().astype(float)
df['fee'] = df.fee.fillna(0)
df['fee'] = df.fee.astype(int)

#turning into integers columns and replacing strings
df['bathrooms'] = df.bathrooms.str.replace('--', '0')
df['bathrooms'] = df.bathrooms.astype(str).astype(int)
df['parking_spots'] = df.parking_spots.str.replace('--', '0')
df['parking_spots'] = df.parking_spots.astype(str).astype(int)

#filling nas with a empty list and turning string list into list
df.extra_contents.fillna('[]', inplace = True)
df["extra_contents"] = df["extra_contents"].apply(eval)

#correcting the decimal places in the rent column and turning it into integer
df['rent'] = df.rent * 1000
df['rent'] = df.rent.astype(int)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [3]:
df.title.str.split(' ', 1).str[0].unique()

array(['Apartamento'], dtype=object)

Here we can also see that all the properties have been classified as apartments.

# Feature engineering

In [4]:
#df['address'] = df.address.str.rsplit("-", 1).str[0]
df['address'] = df.address.str.strip()

In [5]:
locator = Nominatim(user_agent = 'myGeocoder')
geocode = RateLimiter(locator.geocode, min_delay_seconds=2)

In [6]:
#getting location data from geocode
df['location'] = df['address'].apply(locator.geocode)

In [8]:
df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,location
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,[],6000,0,"(215, Rua dos Navegantes, Boa Viagem, Recife, ..."
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"['Piscina', 'Elevador', 'Academia', 'Churrasqu...",3500,0,"(260, Rua Baltazar Passos, Boa Viagem, Recife,..."
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"['Piscina', 'Elevador', 'Academia', 'Condomíni...",5000,1232,"(5822, Avenida Boa Viagem, Boa Viagem, Recife,..."
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"['Piscina', 'Elevador', 'Academia', 'Playgroun...",3200,0,"(Tamarineira, Recife, Região Geográfica Imedia..."
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"['Elevador', 'Ar-condicionado', 'Salão de fest...",6000,0,"(278, Rua Setúbal, Boa Viagem, Recife, Região ..."


In [10]:
df.isna().sum()

title               0
address             0
area                0
bathrooms           0
bedrooms            0
parking_spots       0
extra_contents      0
rent                0
fee                 0
location          165
dtype: int64

A few posts of our data could not have their location identified by geopy, since we'll need latitude and longitude for our future model, we'll remove those posts.

In [15]:
#filtering out NAs in location column
df = df[df.location.notna()]

In [18]:
#getting latitude and longitude
df['neighborhood'] = df.location.raw["address"]["suburb"]
df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
df[['latitude', 'longitude', 'altitude']] = pd.DataFrame(df['point'].tolist(), index=df.index)

#dropping unnecessary columns
df.drop(columns = ['location', 'point', 'altitude', 'title', 'address'], inplace = True)

Note that this project is built using Brazilian real estate market data, wich means that neighborhood is an important information when it comes to addresses. Also we took advantage of the geocoding to get the neighborhood column since our address column had some imperfections that would only make it harder to extract the variable.

In [19]:
df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,latitude,longitude
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,[],6000,0,-8.130101,-34.900182
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"['Piscina', 'Elevador', 'Academia', 'Churrasqu...",3500,0,-8.143875,-34.907153
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"['Piscina', 'Elevador', 'Academia', 'Condomíni...",5000,1232,-8.139654,-34.903205
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"['Piscina', 'Elevador', 'Academia', 'Playgroun...",3200,0,-8.032416,-34.901019
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"['Elevador', 'Ar-condicionado', 'Salão de fest...",6000,0,-8.13353,-34.902099


In [54]:
#taking values inside lists on "extra_contents" and turning into dummy variables
#code found on https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173

def to_1D(series):
 return pd.Series([x for _list in series for x in _list])

def boolean_df(item_lists, unique_items):
# Create empty dict
    bool_dict = {}
    
    # Loop through all the tags
    for i, item in enumerate(unique_items):
        
        # Apply boolean mask
        bool_dict[item] = item_lists.apply(lambda x: item in x)
            
    # Return the results as a dataframe
    return pd.DataFrame(bool_dict)


#creating dummy columns for each variable inside the extra_contents column            
extra_contents_df = boolean_df(df.extra_contents, to_1D(df.extra_contents)).astype(int)

#using pandas concat to add the new column to our dataset
df = pd.concat([df, extra_contents_df], axis=1)
df.drop(columns = ['extra_contents'], inplace = True)

In [56]:
df.columns

Index(['title', 'address', 'area', 'bathrooms', 'bedrooms', 'parking_spots',
       'rent', 'fee', 'latitude', 'longitude', 'Piscina', 'Elevador',
       'Academia', 'Churrasqueira', 'Playground', 'Condomínio fechado',
       'Salão de festas', 'Ar-condicionado', 'Aceita animais', 'Jardim',
       'Mobiliado', 'Varanda', 'Cozinha americana', 'Portaria 24h', 'Garagem',
       'Cozinha', 'Área de serviço', 'Quadra poliesportiva',
       'Armário na cozinha', 'Interfone', 'Circuito de segurança',
       'Mais de um andar', 'Segurança 24h', 'Vista para o mar', 'Quintal',
       'Sauna', 'TV a cabo', 'Conexão à internet', 'Espaço verde / Parque',
       'Salão de jogos', 'Vigia', 'Espaço gourmet', 'Lavanderia',
       'Sistema de alarme', 'Gerador elétrico', 'Varanda gourmet', 'Depósito',
       'Armário embutido', 'Recepção', 'Copa', 'Vista exterior',
       'Móvel planejado', 'Perto de vias de acesso', 'Cinema', 'Escritório',
       'Quadra de tênis', 'Closet', 'Pista de cooper', 'Lareira

We've now added 58 dummy columns to our dataset with extra contents about the apartments that will help us understand the data.

In [73]:
#summing fee and rent column

df['rent'] = df['rent'] + df['fee']
df.drop(columns = ['fee'], inplace = True)

Most of the cases where the fee column is 0 is because this value is already included in the rent column. That's why we've summed up both columns.