In [1]:
import ast 

def read_data(file_path):
    data = []
    try:
        with open(file_path) as f:
            for line in f:
                data.append(ast.literal_eval(line.replace('\\u00e3', 'ã').replace('\\u00e9', 'é').replace('\\u00e2', 'â') \
                            .replace('\\u00e1', 'á').replace('\\u00ed', 'í').replace('\\u00f3', 'ó') \
                            .replace('\\u00b2', '2').replace('\\u00f4', 'ô').replace('\\u00ea', 'ê') \
                            .replace('\\u00e7', 'ç').replace('\\u00f5', 'õ').replace('\\u00fa', 'ú') \
                            .replace('\\u00aa', '°').replace('\\u00e0', 'à').replace('\\u00c1', 'Á') \
                            .replace('\\u00c9', 'É').replace('\\u00d3', 'Ó').replace('\\u00ca', 'Ê') \
                            .replace('\\u00ba', '°').replace('\\u00cd', 'Í').replace('\\u00b0', '°') \
                            .replace('\\u00fce','ü').replace('\\u00c2', 'Ã').replace('\\u00c7', 'Ç') \
                            .replace('\\"', '').replace('\\u00d4', 'Ô').replace('\\u00b4', '´') \
                            .replace('\\u00c3', 'Ã').replace('\n', '').replace('null', '"R$ 0"')))
    except IOError:
        print("File not accessible")
        return []
       
    return data

In [2]:
# Calcular similaridade:
import Levenshtein
import pandas as pd

goiania_neighborhood = pd.read_csv("/home/marcos/Documents/Projetos/analise-preco-imoveis/imoveis_crawling/files/bairros_goiania.csv")
neighborhood_list = [i for i in goiania_neighborhood['Bairro']]

def best_neighborhood(string):
    best_distance = 100
    best_neighborhood = ''
    
    string = string.replace('Setor ', '')
    
    for neighborhood in neighborhood_list:
        distance = Levenshtein.distance(string, neighborhood)
        if distance == 1:
            return neighborhood
        elif distance < best_distance:
            best_distance = distance
            best_neighborhood = neighborhood
            
    return best_neighborhood if best_distance < 5 else None

### Processamento:

In [3]:
from os import listdir
from os.path import isfile, join


def get_data(path):
    vetor_processed = {}
    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]

    for file in onlyfiles:
        day = file[11:21]
        vetor = read_data(path + file)

        for line in vetor:
            #Calcular a data de retirada do anuncio:
            if line['id'] in vetor_processed:
                vetor_processed[line['id']][5] = day
            else:
                vetor_processed[line['id']] = [line['titulo'], \
                                               line['regiao'].split(', '), \
                                               line['detalhes'], \
                                               line['preco'].replace(' ', '').split('R$')[1], \
                                               line['data_de_publicacao'] if line['data_de_publicacao'] != 'Hoje' and line['data_de_publicacao'] != 'Ontem' else str(day),\
                                               day]
    return vetor_processed

vetor_processed = get_data("/home/marcos/Documents/Projetos/analise-preco-imoveis/imoveis_crawling/files/imoveis/aluguel/")
vetor_processed_venda = get_data("/home/marcos/Documents/Projetos/analise-preco-imoveis/imoveis_crawling/files/imoveis/venda/")

In [89]:
print(vetor_processed_venda['634457544'])

['Vendo uma casa no bairro boa vista', ['Goiânia', 'Boa Vista'], ' 2 quartos | 350 m2 | 2 vagas', '110.000', '19-12-20.t', '19-12-22.t']


### Mapa que mostra valor médio dos imóveis por região

In [5]:
import pandas as pd
import plotly.express as px
import numpy as np


def average_value_map(vetor_processed, min_price, max_price):
    goiania_neighborhood = pd.read_csv("/home/marcos/Documents/Projetos/analise-preco-imoveis/imoveis_crawling/files/bairros_goiania.csv")
    neighborhood_list = { neighborhood : [0, 0, latitude, longitude] for neighborhood, latitude, longitude 
                         in zip(goiania_neighborhood['Bairro'], goiania_neighborhood['Latitude'], goiania_neighborhood['Longitude'])}
    neighborhood_map = []

    for line in vetor_processed:
        neighborhood = vetor_processed[line][1][1] if len(vetor_processed[line][1]) > 1 and vetor_processed[line][1][0] == "Goiânia" else None
        if neighborhood is not None:
            n = best_neighborhood(neighborhood)

            # Match com bairro que consta o valor do imovel e esta dentro do range:
            if n != None and max_price > int(vetor_processed[line][3].replace('.', '')) > min_price:
                neighborhood_list[n][0] +=1
                neighborhood_list[n][1] += int(vetor_processed[line][3].replace('.', ''))


    for index in neighborhood_list:
        if neighborhood_list[index][0] != 0:
            neighborhood_map.append([index, 
                                     neighborhood_list[index][2],
                                     neighborhood_list[index][3],
                                     neighborhood_list[index][1]/neighborhood_list[index][0],
                                     neighborhood_list[index][0]])    

    myData = np.array( neighborhood_map)

    df=pd.DataFrame(data=myData,index=[i for i in range(len(myData))],columns=['Bairro','Latitude','Longitude','PrecoMedio','Size'])

    df['Latitude'] = df['Latitude'].astype('float')
    df['Longitude'] = df['Longitude'].astype('float')
    df['PrecoMedio'] = df['PrecoMedio'].astype('float')
    df['Size'] = df['Size'].astype('int')

    fig = px.scatter_mapbox(df, lat="Latitude", lon="Longitude", hover_name="Bairro", color="PrecoMedio",
                            size="Size", color_discrete_sequence= px.colors.sequential.Plasma[-2::-1], zoom=11, height=600, size_max=40)

    fig.update_layout(mapbox_style="open-street-map")
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    fig.show()
    
# average_value_map(vetor_processed_venda, 50000, 5000000)
average_value_map(vetor_processed, 100, 8000)

### Quantidade de imóveis por faixa de valor

#### Formato do vetor:
#### {'ID' : [ titulo, [cidade, bairro], descicao, preco, data_publicacao, tempo_no_ar] }

In [6]:
import plotly.graph_objects as go

def plot_value_range(vetor_processed, max_value, shift):
    precos_x = [i for i in range(0,max_value + shift, shift)]
    precos_y = [0 for i in range(len(precos_x))]

    for line in vetor_processed:
        preco = float(vetor_processed[line][3].replace('.', ''))
        precos_y[len(precos_x) - 1 if preco > max_value else int(preco // shift)]+= 1

    fig = go.Figure([go.Bar(x=precos_x, y=precos_y)])
    fig.show()
    
plot_value_range(vetor_processed, 2500, 100)
# plot_value_range(vetor_processed_venda, 3000000, 50000)

### Tempo médio que o anúncio ficou na plataforma:

In [7]:
import plotly.graph_objects as go
from datetime import datetime


def average_time_in_platform(vetor_processed, max_time):
    tempo_x = [i for i in range(0, max_time + 1)]
    tempo_y = [0 for i in range(len(tempo_x))]

    for line in vetor_processed:
        if len(vetor_processed[line][4]) > 8:
            d1 = datetime.strptime(vetor_processed[line][5], "%Y-%m-%d")
            d2 = datetime.strptime(vetor_processed[line][4], "%Y-%m-%d")
            time_diff = abs((d2 - d1).days)
            if time_diff <= max_time:
                tempo_y[time_diff]+=1 
            else:
                tempo_y[max_time]+=1

    fig = go.Figure([go.Bar(x=tempo_x, y=tempo_y)])
    fig.show()
    
average_time_in_platform(vetor_processed, 20)
# average_time_in_platform(vetor_processed_venda, 20)

In [61]:
import re
import plotly.graph_objects as go

def type_of_property(vetor_processed):
    tipo = {"apartamento" : "Apartamento",
            "apt" : "Apartamento",
           "casa" : "Casa",
           "kitnet" : "Kitnet",
           "studio" : "Kitnet",
           "quit" : "Kitnet",
           "kit" : "Kitnet",
           "barracão" : "Barracão",
           "barracao" : "Barracão"}

    count = {"Apartamento": 0,
            "Casa" : 0,
            "Kitnet": 0,
            "Barracão" : 0}

    p = re.compile('(?:apartamento|casa|kitnet|apt|studio|quit|kit|barracão|barracao)', flags=re.IGNORECASE)

    for line in vetor_processed:
        a = p.search(vetor_processed[line][0])
        if a:
            count[tipo[a.group(0).lower()]]+=1

    fig = go.Figure(data=[go.Pie(labels=[i for i in count], values=[count[i] for i in count], textinfo='label+percent',
                                 insidetextorientation='radial'
                                )])
    fig.show()
    
type_of_property(vetor_processed)