In [48]:
import ast
import glob

import geopandas as gpd
import numpy as np
import pandas as pd
from geopy import Point
from geopy.distance import geodesic
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# Grouping scraped datasets

In [49]:
path = '../scraping/datasets_scraped'
#glob returns files in an arbitrary, filesystem-dependent order; sorting keeps the
#row order (and which copy drop_duplicates keeps) the same on every run
all_files = sorted(glob.glob(path + "/*.csv"))

#one dataframe per scraped file; numbers in the files use ',' as decimal separator
li = [
    pd.read_csv(filename, index_col=None, header=0, decimal = ',')
    for filename in all_files
]

#concatenating all datasets and removing duplicate rows (there might be duplicate posts)
df = pd.concat(li, axis=0, ignore_index=True)
df = df.drop_duplicates()

df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,,6.0,
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"['Piscina', 'Elevador', 'Academia', 'Churrasqu...",3.5,
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"['Piscina', 'Elevador', 'Academia', 'Condomíni...",5.0,R$ 1.232
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"['Piscina', 'Elevador', 'Academia', 'Playgroun...",3.2,
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"['Elevador', 'Ar-condicionado', 'Salão de fest...",6.0,


In [50]:
df.shape

(4018, 9)

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4018 entries, 0 to 26300
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           4018 non-null   object
 1   address         4018 non-null   object
 2   area            4018 non-null   int64 
 3   bathrooms       4018 non-null   object
 4   bedrooms        4018 non-null   int64 
 5   parking_spots   4018 non-null   object
 6   extra_contents  3410 non-null   object
 7   rent            4018 non-null   object
 8   fee             1236 non-null   object
dtypes: int64(2), object(7)
memory usage: 313.9+ KB


# Data cleaning

In [52]:
#checking null values in columns
df.isna().sum()

title                0
address              0
area                 0
bathrooms            0
bedrooms             0
parking_spots        0
extra_contents     608
rent                 0
fee               2782
dtype: int64

Here we'll have two different approaches.

- For the *extra_contents* column we'll fill the null values with an empty list, so that we'll be able to turn those lists into columns. 

- For the *fee* column we'll fill the NAs with 0, since most of the posts that do not contain a fee description also say that the value is already included in the rent.

In [53]:
#removing currency symbols ('R$') and thousands separators ('.'), filling NAs with 0
#and turning column into integer.
#regex=False: '.' and '$' are regex metacharacters and the default of `regex`
#differs between pandas versions, so we ask for a literal match explicitly
df['fee'] = (
    df['fee']
    .str.replace('.', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.replace('R', '', regex=False)
    .str.strip()
    .astype(float)
)
df['fee'] = df.fee.fillna(0)
df['fee'] = df.fee.astype(int)

#turning into integers columns and replacing the '--' placeholder with 0
df['bathrooms'] = df.bathrooms.str.replace('--', '0', regex=False)
df['bathrooms'] = df.bathrooms.astype(str).astype(int)
df['parking_spots'] = df.parking_spots.str.replace('--', '0', regex=False)
df['parking_spots'] = df.parking_spots.astype(str).astype(int)

#filling nas with a empty list and turning string list into list.
#ast.literal_eval only parses Python literals, whereas eval would execute any
#code that happened to be present in the scraped text
df['extra_contents'] = df['extra_contents'].fillna('[]').apply(ast.literal_eval)

#turning column into integer ('.' is the thousands separator here as well)
df['rent'] = df['rent'].str.replace('.', '', regex=False)
df['rent'] = df.rent.astype(int)

  


In [54]:
df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,[],6000,0
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"[Piscina, Elevador, Academia, Churrasqueira, P...",3500,0
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"[Piscina, Elevador, Academia, Condomínio fecha...",5000,1232
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"[Piscina, Elevador, Academia, Playground, Salã...",3200,0
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"[Elevador, Ar-condicionado, Salão de festas, A...",6000,0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4018 entries, 0 to 26300
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           4018 non-null   object
 1   address         4018 non-null   object
 2   area            4018 non-null   int64 
 3   bathrooms       4018 non-null   int32 
 4   bedrooms        4018 non-null   int64 
 5   parking_spots   4018 non-null   int32 
 6   extra_contents  4018 non-null   object
 7   rent            4018 non-null   int32 
 8   fee             4018 non-null   int32 
dtypes: int32(4), int64(2), object(3)
memory usage: 251.1+ KB


In [56]:
#checking if all nulls were removed
df.isna().sum()

title             0
address           0
area              0
bathrooms         0
bedrooms          0
parking_spots     0
extra_contents    0
rent              0
fee               0
dtype: int64

# Feature engineering

In [57]:
#the first word of the title tells whether the post is a house or an apartment
first_word = df['title'].apply(lambda title: title.split()[0].strip().lower())

#translating the property type from Brazilian Portuguese to English
df['property_type'] = first_word.replace({'apartamento' : 'apartment', 'casa' : 'house'})

In [58]:
#creating geopy objects

locator = Nominatim(user_agent = 'myGeocoder')
#rate-limited wrapper: waits at least 1 second between requests. Retrying is left
#to do_geocode below, so the limiter itself neither retries nor swallows errors
geocode = RateLimiter(locator.geocode, min_delay_seconds=1, max_retries=0, swallow_exceptions=False)

#function to avoid timeout
#orginal code found on https://gis.stackexchange.com/questions/173569/avoid-time-out-error-nominatim-geopy-openstreetmap
def do_geocode(address, attempt=1, max_attempts=5):
    """Geocode ``address`` through the rate-limited geocoder, retrying on timeouts.

    Parameters
    ----------
    address : str
        Address text passed to the geocoder.
    attempt : int
        Number of the first attempt; kept for backward compatibility.
    max_attempts : int
        A timeout is retried while ``attempt <= max_attempts``.

    Returns
    -------
    Whatever the geocoder returns for the address (requested with
    ``addressdetails=True``).

    Raises
    ------
    GeocoderTimedOut
        When the request still times out after the allowed retries.
    """
    #a loop instead of recursion, so the caller's max_attempts is honoured on
    #every retry (the recursive version silently fell back to the default)
    while True:
        try:
            #go through the RateLimiter rather than locator.geocode directly,
            #otherwise the 1-second delay configured above is never applied
            return geocode(address, addressdetails=True)
        except GeocoderTimedOut:
            if attempt > max_attempts:
                raise
            attempt += 1

In [59]:
#geocoding every address; the result of do_geocode is stored as-is per row
df['location'] = df['address'].apply(do_geocode)

In [60]:
df.head()

Unnamed: 0,title,address,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,property_type,location
0,"Apartamento com 3 Quartos para Aluguel, 228m²","Rua dos Navegantes, 215 - Boa Viagem, Recife - PE",228,4,3,1,[],6000,0,apartment,"(215, Rua dos Navegantes, Boa Viagem, Recife, ..."
1,"Apartamento com 4 Quartos para Aluguel, 112m²","Rua Baltazar Passos, 260 - Boa Viagem, Recife ...",112,4,4,2,"[Piscina, Elevador, Academia, Churrasqueira, P...",3500,0,apartment,"(260, Rua Baltazar Passos, Boa Viagem, Recife,..."
2,"Apartamento com 4 Quartos para Aluguel, 160m²","Avenida Boa Viagem, 5822 - Boa Viagem, Recife ...",160,5,4,3,"[Piscina, Elevador, Academia, Condomínio fecha...",5000,1232,apartment,"(5822, Avenida Boa Viagem, Boa Viagem, Recife,..."
3,"Apartamento com 3 Quartos para Aluguel, 75m²","Tamarineira, Recife - PE",75,3,3,2,"[Piscina, Elevador, Academia, Playground, Salã...",3200,0,apartment,"(Tamarineira, Recife, Região Geográfica Imedia..."
4,"Apartamento com 5 Quartos para Aluguel, 200m²","Rua Setúbal, 278 - Boa Viagem, Recife - PE",200,5,5,3,"[Elevador, Ar-condicionado, Salão de festas, A...",6000,0,apartment,"(278, Rua Setúbal, Boa Viagem, Recife, Região ..."


In [61]:
df.isna().sum()

title               0
address             0
area                0
bathrooms           0
bedrooms            0
parking_spots       0
extra_contents      0
rent                0
fee                 0
property_type       0
location          208
dtype: int64

A few posts could not have their location identified by geopy. Since we'll need latitude and longitude for our future model, we'll remove these posts.

In [62]:
#filtering out NAs in location column
#.copy() gives an independent dataframe, so the columns added in the following
#cells are not assigned on a slice of the original (SettingWithCopyWarning)
df = df[df.location.notna()].copy()

In [63]:
df.shape

(3810, 11)

In [64]:
#function to verify if there's a key to get from location column
def get_key(x, key):
    """Return ``x.raw['address'][key]``, or ``'unknown'`` when a key is missing.

    ``x`` is any object exposing a ``raw`` mapping (here, the values of the
    ``location`` column). A missing ``'address'`` entry is handled the same way
    as a missing ``key``.
    """
    try:
        return x.raw['address'][key]
    except KeyError:
        return 'unknown'

In [65]:
#getting neighborhood ('suburb') and actual city from the geocoded address details
df['neighborhood'] = df['location'].apply(get_key, args=('suburb',))
df['city'] = df['location'].apply(get_key, args=('city',))

#getting the coordinates of each location as a plain tuple (None when there is no location)
df['point'] = df['location'].apply(lambda location: tuple(location.point) if location else None)


#dropping the columns that were only needed to build the features above
df = df.drop(columns = ['location', 'title', 'address'])



Note that this project is built using Brazilian real estate market data, which means that **"neighborhood"** is an important information when it comes to addresses. Also we took advantage of the geocoding to get the **"neighborhood"** column since our **"address"** column had some imperfections that would only make it harder to extract the variable.

We'll remove cities that are different from Recife, which is the focus city for this project. By doing that we'll also be removing neighborhoods classified as "unknown", since those are neighborhoods from other cities wrongly classified as Recife's by the posts authors from the original website.

In [66]:
#checking "unknown" neighborhoods
df[df.neighborhood == 'unknown']

Unnamed: 0,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,property_type,neighborhood,city,point
14895,70,1,1,0,[],1565,0,house,unknown,Igarassu,"(-7.821819, -35.007712, 0.0)"


In [67]:
#checking cities in dataset
df.city.unique()

array(['Recife', 'unknown', 'Cabo de Santo Agostinho',
       'Jaboatão dos Guararapes', 'Vitória de Santo Antão', 'Olinda',
       'Igarassu'], dtype=object)

In [68]:
#filtering only Recife
#.copy() so the in-place edits made in later cells act on an independent
#dataframe instead of a slice (avoids SettingWithCopyWarning)
df = df[df.city == 'Recife'].copy()
df.city.unique()

array(['Recife'], dtype=object)

In [69]:
#every remaining row is from Recife, so the city column carries no information anymore
df = df.drop(columns = ['city'])

In [70]:
#checking if there are still "unknown" neighborhoods
df[df.neighborhood == 'unknown']

Unnamed: 0,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,property_type,neighborhood,point


In [71]:
df.shape

(3799, 10)

In [72]:
df.head()

Unnamed: 0,area,bathrooms,bedrooms,parking_spots,extra_contents,rent,fee,property_type,neighborhood,point
0,228,4,3,1,[],6000,0,apartment,Boa Viagem,"(-8.1301011, -34.9001821, 0.0)"
1,112,4,4,2,"[Piscina, Elevador, Academia, Churrasqueira, P...",3500,0,apartment,Boa Viagem,"(-8.1438751, -34.9071533, 0.0)"
2,160,5,4,3,"[Piscina, Elevador, Academia, Condomínio fecha...",5000,1232,apartment,Boa Viagem,"(-8.1396537, -34.9032052, 0.0)"
3,75,3,3,2,"[Piscina, Elevador, Academia, Playground, Salã...",3200,0,apartment,Tamarineira,"(-8.0324159, -34.9010191, 0.0)"
4,200,5,5,3,"[Elevador, Ar-condicionado, Salão de festas, A...",6000,0,apartment,Boa Viagem,"(-8.1335303, -34.9020993, 0.0)"


In [73]:
#taking values inside lists on "extra_contents" and turning into dummy variables


#original code found on https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
def to_1D(series):
    """Flatten a series of lists into one pandas Series holding every item.

    Items keep their order of appearance and duplicates are preserved.
    """
    flattened = []
    for _list in series:
        flattened.extend(_list)
    return pd.Series(flattened)

def boolean_df(item_lists, unique_items):
    """Build a boolean dataframe with one column per item.

    Parameters
    ----------
    item_lists : pandas.Series
        Series whose values are lists (or other containers) of items.
    unique_items : iterable
        Items to create columns for. Repeated items are tolerated: each item
        yields a single column, placed at its first appearance.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``item_lists``; cell ``[row, item]`` is True when ``item``
        is contained in that row's list.
    """
    # dict.fromkeys drops repeated items while keeping first-appearance order,
    # so each membership mask is computed once even if the caller passes the
    # full (non-unique) flattened series
    bool_dict = {
        item: item_lists.apply(lambda x, item=item: item in x)
        for item in dict.fromkeys(unique_items)
    }

    # Return the results as a dataframe
    return pd.DataFrame(bool_dict)


#creating dummy columns for each variable inside the extra_contents column
#.unique() keeps the order of first appearance, so the columns come out in the
#same order while each mask is computed once instead of once per occurrence
unique_contents = to_1D(df.extra_contents).unique()
extra_contents_df = boolean_df(df.extra_contents, unique_contents).astype(int)

#using pandas concat to add the new columns to our dataset
df = pd.concat([df, extra_contents_df], axis=1)
df = df.drop(columns = ['extra_contents'])

In [74]:
df.head()

Unnamed: 0,area,bathrooms,bedrooms,parking_spots,rent,fee,property_type,neighborhood,point,Piscina,...,Janela de alumínio,Piscina para adulto,Bicicletário,Estacionamento,Acesso para deficientes,Escada,Rampa,Ventilação natural,Imóvel de esquina,Gramado
0,228,4,3,1,6000,0,apartment,Boa Viagem,"(-8.1301011, -34.9001821, 0.0)",0,...,0,0,0,0,0,0,0,0,0,0
1,112,4,4,2,3500,0,apartment,Boa Viagem,"(-8.1438751, -34.9071533, 0.0)",1,...,0,0,0,0,0,0,0,0,0,0
2,160,5,4,3,5000,1232,apartment,Boa Viagem,"(-8.1396537, -34.9032052, 0.0)",1,...,0,0,0,0,0,0,0,0,0,0
3,75,3,3,2,3200,0,apartment,Tamarineira,"(-8.0324159, -34.9010191, 0.0)",1,...,0,0,0,0,0,0,0,0,0,0
4,200,5,5,3,6000,0,apartment,Boa Viagem,"(-8.1335303, -34.9020993, 0.0)",0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
df.columns

Index(['area', 'bathrooms', 'bedrooms', 'parking_spots', 'rent', 'fee',
       'property_type', 'neighborhood', 'point', 'Piscina', 'Elevador',
       'Academia', 'Churrasqueira', 'Playground', 'Condomínio fechado',
       'Salão de festas', 'Ar-condicionado', 'Aceita animais', 'Jardim',
       'Mobiliado', 'Varanda', 'Cozinha americana', 'Portaria 24h', 'Garagem',
       'Cozinha', 'Área de serviço', 'Quadra poliesportiva',
       'Armário na cozinha', 'Interfone', 'Circuito de segurança',
       'Mais de um andar', 'Segurança 24h', 'Vista para o mar', 'Quintal',
       'Sauna', 'TV a cabo', 'Conexão à internet', 'Espaço verde / Parque',
       'Salão de jogos', 'Vigia', 'Espaço gourmet', 'Lavanderia',
       'Sistema de alarme', 'Gerador elétrico', 'Varanda gourmet', 'Depósito',
       'Armário embutido', 'Recepção', 'Copa', 'Vista exterior',
       'Móvel planejado', 'Perto de vias de acesso', 'Cinema', 'Escritório',
       'Quadra de tênis', 'Closet', 'Pista de cooper', 'Lareira',


We've now added new dummy columns to our dataset using **"extra_contents"** column content in order to better understand the data.

In [76]:
#adding the fee to the rent and keeping only the combined value in "rent"
rent_plus_fee = df['rent'] + df['fee']
df = df.drop(columns = ['fee']).assign(rent = rent_plus_fee)

In most of the cases where the fee column is 0, the fee is already included in the rent column. That's why we've summed up both columns.

In [77]:
#columns that came from the original dataset (rent is left out on purpose)
original_columns = [
    'property_type',
    'area',
    'bathrooms',
    'bedrooms',
    'parking_spots',
    'neighborhood',
    'point',
]

Now we'll check the correlation of the new dummy columns with the dependent variable (rent) and try to reduce the number of columns that we'll use.

In [78]:
#absolute correlation between rent and every column that is not an original one,
#strongest first, to filter the important ones
candidate_columns = [column for column in df.columns if column not in original_columns]
rent_corr = df.loc[:, candidate_columns].corr()['rent']
new_columns_corr = rent_corr.abs().sort_values(ascending = False)
new_columns_corr

rent                1.000000
Mais de um andar    0.030095
Mobiliado           0.029619
Churrasqueira       0.026553
Cozinha             0.020350
                      ...   
Closet              0.000618
Pista de cooper     0.000603
Vista exterior      0.000599
Vista para o mar    0.000495
Móvel planejado     0.000050
Name: rent, Length: 67, dtype: float64

In [79]:
print('Number of new columns:', len(new_columns_corr) - 1)

Number of new columns: 66


In [80]:
#first 6 labels of the ranking: "rent" itself (correlation 1) plus the 5 best dummy columns
selected_new_columns = new_columns_corr.index[:6].tolist()

Now we have 5 new columns that have the strongest correlation with our dependent variable (rent) out of 66 columns; the slice of 6 above also keeps **rent** itself, which sits at the top of the ranking with a correlation of 1. By doing this we avoid giving our future model too many features, which would increase the chance of overfitting.

In [81]:
#putting all chosen columns in a single list
selected_columns = original_columns + selected_new_columns

#filtering our dataframe
#.copy() because the next cells rename and add columns on this selection;
#without it pandas treats those writes as assignments on a slice
df = df[selected_columns].copy()
df

Unnamed: 0,property_type,area,bathrooms,bedrooms,parking_spots,neighborhood,point,rent,Mais de um andar,Mobiliado,Churrasqueira,Cozinha,Piscina
0,apartment,228,4,3,1,Boa Viagem,"(-8.1301011, -34.9001821, 0.0)",6000,0,0,0,0,0
1,apartment,112,4,4,2,Boa Viagem,"(-8.1438751, -34.9071533, 0.0)",3500,0,0,1,0,1
2,apartment,160,5,4,3,Boa Viagem,"(-8.1396537, -34.9032052, 0.0)",6232,0,0,0,0,1
3,apartment,75,3,3,2,Tamarineira,"(-8.0324159, -34.9010191, 0.0)",3200,0,0,0,0,1
4,apartment,200,5,5,3,Boa Viagem,"(-8.1335303, -34.9020993, 0.0)",6000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26276,apartment,105,2,3,1,Boa Viagem,"(-8.1373649, -34.9062272, 0.0)",2800,0,0,0,0,0
26277,apartment,90,3,3,2,Torre,"(-8.0448905, -34.908132, 0.0)",2700,0,0,1,0,1
26291,apartment,103,2,3,1,Graças,"(-8.0451664, -34.9007686, 0.0)",2100,0,0,0,0,0
26293,apartment,40,1,1,1,Parnamirim,"(-8.0331482, -34.9150779, 0.0)",1980,0,0,0,0,0


Since this is a project written in English, we'll rename the columns from Brazilian Portuguese to English.

In [82]:
#current columns
df.columns

Index(['property_type', 'area', 'bathrooms', 'bedrooms', 'parking_spots',
       'neighborhood', 'point', 'rent', 'Mais de um andar', 'Mobiliado',
       'Churrasqueira', 'Cozinha', 'Piscina'],
      dtype='object')

In [83]:
#renaming columns by name instead of by position: the dummy columns are picked
#by correlation ranking, so if the ranking changes a positional rename would
#silently attach an English label to the wrong feature
df = df.rename(columns = {
    'Mais de um andar': 'more_than_1_floor',
    'Mobiliado': 'furnished',
    'Churrasqueira': 'barbecue_grill',
    'Cozinha': 'kitchen',
    'Piscina': 'pool',
})

In [84]:
df.head()

Unnamed: 0,property_type,area,bathrooms,bedrooms,parking_spots,neighborhood,point,rent,more_than_1_floor,furnished,barbecue_grill,kitchen,security
0,apartment,228,4,3,1,Boa Viagem,"(-8.1301011, -34.9001821, 0.0)",6000,0,0,0,0,0
1,apartment,112,4,4,2,Boa Viagem,"(-8.1438751, -34.9071533, 0.0)",3500,0,0,1,0,1
2,apartment,160,5,4,3,Boa Viagem,"(-8.1396537, -34.9032052, 0.0)",6232,0,0,0,0,1
3,apartment,75,3,3,2,Tamarineira,"(-8.0324159, -34.9010191, 0.0)",3200,0,0,0,0,1
4,apartment,200,5,5,3,Boa Viagem,"(-8.1335303, -34.9020993, 0.0)",6000,0,0,0,0,0


The last feature we'll create is a feature that measures the distance of the property to the beach. For that we'll use data provided by <a href='http://dados.recife.pe.gov.br/dataset/cobertura-da-terra'>Dados Recife</a> that contains the geometry for Recife's beach area.

In [85]:
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon

#function to import file and convert multipolygons to polygons. Original code found on https://gist.github.com/mhweber/cf36bb4e09df9deee5eb54dc6be74d26
def explode(indata):
    """Read a geo file and return one row per single polygon.

    Parameters
    ----------
    indata : str
        Path of the file to read with ``GeoDataFrame.from_file``.

    Returns
    -------
    geopandas.GeoDataFrame
        Rows whose geometry is a Polygon are kept as they are; rows whose
        geometry is a MultiPolygon are repeated once per member polygon, with
        the other attributes copied. Rows with any other geometry type are
        left out. The result has a fresh 0..n-1 index.
    """
    indf = gpd.GeoDataFrame.from_file(indata)
    is_polygonal = indf.geometry.geom_type.isin(['Polygon', 'MultiPolygon'])
    #GeoDataFrame.explode splits multi-part geometries into single parts; it
    #replaces the row-by-row DataFrame.append and the len()/indexing of
    #MultiPolygon objects, which recent pandas/Shapely releases no longer support.
    #reset_index flattens whatever index explode produces into 0..n-1
    return indf[is_polygonal].explode().reset_index(drop=True)

In [86]:
#importing file
recife_area = explode('cobertura_da_terra_2013.geojson')
recife_area.head()

Unnamed: 0,objectid,classe01,classe02,classe03,st_area_sh,geometry
0,36844,fv,ff,ffinicial,5333.258894,"POLYGON ((-34.88063 -8.02061, -34.88029 -8.020..."
1,36854,fv,ff,ffinicial,1351.903762,"POLYGON ((-34.89308 -8.01953, -34.89304 -8.019..."
2,36859,fv,ff,ffinicial,1078.629309,"POLYGON ((-34.88110 -8.01995, -34.88122 -8.020..."
3,36868,fv,ff,ffinicial,5731.977907,"POLYGON ((-34.87469 -8.01835, -34.87493 -8.019..."
4,36873,fv,ff,ffinicial,801.068239,"POLYGON ((-34.88423 -8.01795, -34.88430 -8.018..."


In [87]:
#keeping only the polygons classified as beach ("praia")
is_beach = recife_area['classe03'] == 'praia'
recife_area = recife_area[is_beach]

In Portuguese **"praia"** means **"beach"**, that's why we'll filter all the dataframe where the column "classe03" is equal to "praia".

In [88]:
from shapely.geometry import Point

#convert each polygon into the points of its outer boundary. Original code found on https://gis.stackexchange.com/questions/302430/polygon-to-point-in-geopandas
col = recife_area.columns.tolist()
print(col)

# Extraction of the polygon nodes and attribute values: one record per vertex of
# each polygon's exterior ring. The records are collected in a list and turned
# into a GeoDataFrame once, instead of appending row by row (quadratic, and
# DataFrame.append no longer exists in recent pandas)
records = [
    {
        'objectid': int(row['objectid']),
        'classe01': row['classe01'],
        'classe02': row['classe02'],
        'classe03': row['classe03'],
        'st_area_sh': float(row['st_area_sh']),
        'geometry': Point(vertex),
    }
    for _, row in recife_area.iterrows()
    for vertex in row['geometry'].exterior.coords
]

# new GeoDataFrame with same columns
nodes = gpd.GeoDataFrame(records, columns=col)
nodes.head()

['objectid', 'classe01', 'classe02', 'classe03', 'st_area_sh', 'geometry']


Unnamed: 0,objectid,classe01,classe02,classe03,st_area_sh,geometry
0,26098,areasemcob,areasemcob,praia,601915.43671,POINT (-34.87930 -8.08855)
1,26098,areasemcob,areasemcob,praia,601915.43671,POINT (-34.87939 -8.08921)
2,26098,areasemcob,areasemcob,praia,601915.43671,POINT (-34.87957 -8.09037)
3,26098,areasemcob,areasemcob,praia,601915.43671,POINT (-34.88001 -8.09195)
4,26098,areasemcob,areasemcob,praia,601915.43671,POINT (-34.88099 -8.09402)


In [89]:
import csv

#storing beach coordinates as (latitude, longitude) tuples: y first, then x
coord_list = list(zip(nodes['geometry'].y, nodes['geometry'].x))

#saving to use in the future api: one quoted "latitude","longitude" pair per line.
#The csv module needs a text-mode file opened with newline='' and the writer must
#wrap the file object (the previous version passed the list and an undefined name)
with open('coord_list', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(coord_list)

In [90]:
#function to get the minimum distance of the property to the beach
def get_distance(x):
    """Return the smallest geodesic distance, in km, between ``x`` and any entry of ``coord_list``."""
    return min([geodesic(x, beach_point).km for beach_point in coord_list])

In [91]:
#distance from each property to its closest beach coordinate
df['beach_distance'] = df['point'].apply(get_distance)

#splitting the point tuples into latitude and longitude columns for visualization;
#altitude and the original point column are not kept
coordinates = pd.DataFrame(
    df['point'].tolist(),
    index=df.index,
    columns=['latitude', 'longitude', 'altitude'],
)
df[['latitude', 'longitude']] = coordinates[['latitude', 'longitude']]
df = df.drop(columns = ['point'])

In [92]:
df.head()

Unnamed: 0,property_type,area,bathrooms,bedrooms,parking_spots,neighborhood,rent,more_than_1_floor,furnished,barbecue_grill,kitchen,security,beach_distance,latitude,longitude
0,apartment,228,4,3,1,Boa Viagem,6000,0,0,0,0,0,0.157674,-8.130101,-34.900182
1,apartment,112,4,4,2,Boa Viagem,3500,0,0,1,0,1,0.27838,-8.143875,-34.907153
2,apartment,160,5,4,3,Boa Viagem,6232,0,0,0,0,1,0.120035,-8.139654,-34.903205
3,apartment,75,3,3,2,Tamarineira,3200,0,0,0,0,1,3.835353,-8.032416,-34.901019
4,apartment,200,5,5,3,Boa Viagem,6000,0,0,0,0,0,0.166399,-8.13353,-34.902099


In [93]:
df.to_csv('cleaned_df.csv', index = False)