## Notebook for parsing, visualization, and exploration

In [35]:
import pandas as pd

# Read the scraped listings; a leading "Unnamed: 0" column (a saved index) is discarded.
df = pd.read_csv("../../../data/scraped_data.csv")
first_column = df.columns[0]
if first_column == "Unnamed: 0":
    df = df.drop(columns=[first_column])
df.head()

Unnamed: 0,id,addresses,titles,areas,bedrooms,bathrooms,parkingSpots,prices
0,2681785550,"Rua Professor Aristides Novis, 7 - Federação, ...","Casa com 3 Quartos à Venda, 100m²",100,3,2,1,R$ 55.000
1,2529929204,"Rua Parambu, 431 - Santa Teresa, Salvador - BA",RESIDENCIAL VERSAILLES,40-61,1-2,1-2,1-2,A partir de R$ 289.000
2,2671293909,"Rua do Amparo, 89 - Lobato, Salvador - BA","Casa com 2 Quartos à Venda, 50m²",50,2,1,--,R$ 50.000
3,2681296717,"Avenida Luís Viana Filho, 10 - Alphaville I, S...","Apartamento com 2 Quartos à Venda, 91m²",91,2,4,2,R$ 840.000
4,2680287503,"2ª Travessa Gandarela, 100 - Cosme de Farias, ...","Casa com 3 Quartos à Venda, 55m²",55,3,2,--,R$ 94.890


In [36]:
# Keep the first row seen for each listing id, then discard rows without a price.
df = (
    df
    .drop_duplicates(subset=["id"])
    .dropna(subset=["prices"])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110 entries, 0 to 179
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            110 non-null    int64 
 1   addresses     110 non-null    object
 2   titles        110 non-null    object
 3   areas         110 non-null    object
 4   bedrooms      110 non-null    object
 5   bathrooms     110 non-null    object
 6   parkingSpots  110 non-null    object
 7   prices        110 non-null    object
dtypes: int64(1), object(7)
memory usage: 7.7+ KB


In [37]:
# Drop offices, rooms and other possible non-residential titles (anything mentioning "sala"),
# and start every remaining listing with an empty type.
df = df.loc[~df['titles'].str.contains('sala', case=False)].assign(type='')

# Label listings by title keyword. The apartment rule is applied last, so it wins
# when a title matches both patterns; titles matching neither keep the empty type.
for title_pattern, label in (('casa', 'house'), ('apartamento|residencial', 'apartment')):
    df.loc[df['titles'].str.contains(title_pattern, case=False), 'type'] = label

# The title has served its purpose once the type is derived.
df = df.drop(columns=["titles"])

df.head()

Unnamed: 0,id,addresses,areas,bedrooms,bathrooms,parkingSpots,prices,type
0,2681785550,"Rua Professor Aristides Novis, 7 - Federação, ...",100,3,2,1,R$ 55.000,house
1,2529929204,"Rua Parambu, 431 - Santa Teresa, Salvador - BA",40-61,1-2,1-2,1-2,A partir de R$ 289.000,apartment
2,2671293909,"Rua do Amparo, 89 - Lobato, Salvador - BA",50,2,1,--,R$ 50.000,house
3,2681296717,"Avenida Luís Viana Filho, 10 - Alphaville I, S...",91,2,4,2,R$ 840.000,apartment
4,2680287503,"2ª Travessa Gandarela, 100 - Cosme de Farias, ...",55,3,2,--,R$ 94.890,house


In [38]:
# Addresses look like "Rua Parambu, 431 - Santa Teresa, Salvador - BA":
# take the second " - "-separated piece, then the text before its first comma.
neighborhood = (
    df["addresses"]
    .str.split(" - ", expand=True)
    .iloc[:, 1]
    .str.split(",", expand=True)
    .iloc[:, 0]
    .rename("neighborhood")
)
neighborhood

0            Federação
1         Santa Teresa
2               Lobato
3         Alphaville I
4      Cosme de Farias
            ...       
174          Amaralina
175           Paralela
176          Patamares
178              Imbuí
179      Sete de Abril
Name: neighborhood, Length: 108, dtype: object

In [39]:
# Swap the raw address column for the extracted neighborhood (aligned on the row index).
df = pd.concat([df.drop(columns=["addresses"]), neighborhood], axis="columns")
df.head()

Unnamed: 0,id,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood
0,2681785550,100,3,2,1,R$ 55.000,house,Federação
1,2529929204,40-61,1-2,1-2,1-2,A partir de R$ 289.000,apartment,Santa Teresa
2,2671293909,50,2,1,--,R$ 50.000,house,Lobato
3,2681296717,91,2,4,2,R$ 840.000,apartment,Alphaville I
4,2680287503,55,3,2,--,R$ 94.890,house,Cosme de Farias


In [40]:
# Turn the scraped price text (e.g. "R$ 55.000") into integers.
#
# Every pattern below is a literal string, so regex=False is passed explicitly:
# "$" and "." are regex metacharacters, and the default of `regex` differs between
# pandas releases. Under regex=True, "R$ " can never match and the currency prefix
# would survive, making pd.to_numeric raise.
df["prices"] = (
    df["prices"]
    .str.strip(" ")
    .str.replace("R$ ", "", regex=False)
    .str.replace("      Preço abaixo do mercado", "", regex=False)
    .str.replace("A partir de     ", "", regex=False)
    .str.replace(".", "", regex=False)  # "." is the thousands separator
)

# Raises if any value still contains non-numeric text, which surfaces unexpected formats.
df["prices"] = pd.to_numeric(df["prices"])

df.head()

Unnamed: 0,id,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood
0,2681785550,100,3,2,1,55000,house,Federação
1,2529929204,40-61,1-2,1-2,1-2,289000,apartment,Santa Teresa
2,2671293909,50,2,1,--,50000,house,Lobato
3,2681296717,91,2,4,2,840000,apartment,Alphaville I
4,2680287503,55,3,2,--,94890,house,Cosme de Farias


In [41]:
# Cells that are exactly " -- " become empty strings, which the parser below maps to missing.
df = df.replace(to_replace=" -- ", value="")

def parse_range_and_mean(value):
    """Convert a scraped numeric field to a float, averaging "low-high" ranges.

    Parameters
    ----------
    value : str, number or None
        Text such as ``"100"`` or ``"40-61"``. Empty text and dash-only
        placeholders (``""``, ``"--"``, ``" -- "``) mean "not informed".
        Non-string input is accepted so the cell can be re-run on columns
        that were already converted.

    Returns
    -------
    float or None
        The number itself, the midpoint of a range, or ``None`` when the
        value is missing.

    Raises
    ------
    ValueError
        If the text is neither a number nor a two-part ``low-high`` range.
    """
    # Missing input: None, or NaN (the only value that is not equal to itself).
    if value is None or value != value:
        return None
    # Already numeric, e.g. when the conversion cell is executed a second time.
    if not isinstance(value, str):
        return float(value)

    text = value.strip()
    # Empty text or a placeholder made only of dashes carries no number.
    if text.strip('-') == '':
        return None

    if '-' in text:
        # Exactly two bounds are expected; anything else raises ValueError on unpacking.
        start, end = map(float, text.split('-'))
        return (start + end) / 2
    return float(text)

# Columns whose scraped text is either a single number or a "low-high" range.
columns_to_transform = ['areas', 'bedrooms', 'bathrooms', 'parkingSpots']

# Parse every listed column element-wise and replace it in the frame.
df = df.assign(**{
    column: df[column].apply(parse_range_and_mean)
    for column in columns_to_transform
})

df.head()

Unnamed: 0,id,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood
0,2681785550,100.0,3.0,2.0,1.0,55000,house,Federação
1,2529929204,50.5,1.5,1.5,1.5,289000,apartment,Santa Teresa
2,2671293909,50.0,2.0,1.0,,50000,house,Lobato
3,2681296717,91.0,2.0,4.0,2.0,840000,apartment,Alphaville I
4,2680287503,55.0,3.0,2.0,,94890,house,Cosme de Farias


In [42]:
# Re-check dtypes and non-null counts now that the text columns have been converted.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108 entries, 0 to 179
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            108 non-null    int64  
 1   areas         108 non-null    float64
 2   bedrooms      107 non-null    float64
 3   bathrooms     107 non-null    float64
 4   parkingSpots  93 non-null     float64
 5   prices        108 non-null    int64  
 6   type          108 non-null    object 
 7   neighborhood  108 non-null    object 
dtypes: float64(4), int64(2), object(2)
memory usage: 7.6+ KB
