In [358]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tqdm.notebook as tqdm

### Scraping Boliga website for data

In [361]:
import requests
from bs4 import BeautifulSoup
import re
import json

def houses_boliga(number_houses):
    """
    Collect listing ids from Boliga's paginated search results.

    Parameters
    ----------
    number_houses : int or float
        Rough number of listings wanted. ``int(number_houses / 50)`` result
        pages are requested (the page count presumably assumes 50 listings
        per page), so values below 50 trigger no request at all.

    Returns
    -------
    list of str
        The numeric id (as a string of digits) of every listing link found,
        in page order. Links whose ``href`` carries no ``/<4+ digits>/``
        segment are skipped.
    """
    # No trailing "?" here: the query string is built by requests from
    # `params`, which avoids the malformed "resultat??page=N" URL.
    url = "https://www.boliga.dk/resultat"
    # Raw string so that "\d" is a regex escape, not a Python string escape.
    id_pattern = re.compile(r"/(\d{4,})/")
    house_id = []

    for page in range(int(number_houses / 50)):
        response = requests.get(url, params={"page": page})
        soup = BeautifulSoup(response.text, "html.parser")

        for link in soup.find_all("a", {"class": "house-list-item"}):
            # Tolerate anchors without an href or without an id segment
            # instead of failing the whole scrape on one odd link.
            match = id_pattern.search(link.get("href", ""))
            if match:
                house_id.append(match.group(1))

    return house_id

def get_info(id_list):
    """
    Fetch selected fields for each Boliga listing id and tabulate them.

    Parameters
    ----------
    id_list : iterable
        Listing ids; each one is interpolated into the
        ``https://api.boliga.dk/api/v2/estate/<id>`` endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per id, in input order, with exactly the columns listed in
        ``new_keys``. An empty ``id_list`` yields an empty frame that still
        has those columns.

    Raises
    ------
    KeyError
        If a JSON response lacks one of the expected keys.
    """
    new_keys = ["registeredArea","downPayment","estateUrl","currentArchiveId","forSaleNowId",
                "foreclosureId","selfsaleEstateId","cleanStreet","estateId","latitude","longitude",
                "propertyType","priceChangePercentTotal","energyClass","price","rooms","size","lotSize",
                "floor","buildYear","city","isActive","municipality","zipCode","street",
                "squaremeterPrice","daysForSale","createdDate","basementSize","views"]

    # Collect plain dicts and build the frame once; this avoids creating a
    # one-row DataFrame per listing and also works when there are no ids.
    records = []
    for house_id in id_list:
        payload = requests.get(f'https://api.boliga.dk/api/v2/estate/{house_id}').json()
        records.append({key: payload[key] for key in new_keys})

    # `columns=` pins the column order and keeps the schema for empty input.
    return pd.DataFrame(records, columns=new_keys)

def _broker_fragments(soup, site_key):
    """Return the description fragments found in ``soup`` for one broker site.

    ``site_key`` is the ``link[8:15]`` slice used to recognise the broker.
    Returns a (possibly empty) list of strings for a known site, or ``None``
    when the site is not recognised.
    """
    # site_key -> (tag name, attribute filter, minimum len(tag) (exclusive),
    #              whether "\r" is removed as well as "\n")
    rules = {
        "ww.skbo": ("div", {"class": "listing-text"}, 1, True),               # skbolig
        "www.nyb": ("div", {"class": "foldable-spot__container"}, 1, False),  # Nybolig
        "ww.elto": ("br", {}, 1, False),                                      # Eltoft Nielsen
        "www.cla": ("div", {"id": "case_content"}, 1, False),                 # Claus Borg
        "www.edc": ("div", {"class": "description"}, 1, False),               # EDC Bolig
        "adamsch": ("div", {"class": "listing-text"}, 1, False),              # Adam Schnack
        "www.est": ("div", {"class": "property-description"}, 1, False),      # Estate
        "www.bri": ("div", {"class": "prop-user-content"}, 1, False),         # Brikk Ejendomme
        "www.rea": ("div", {"class": "text-full"}, 1, False),                 # Realmæglerne
        "danboli": ("div", {"class": "db-description-block"}, 1, False),      # Danbolig
        "ww.lili": ("div", {"class": "inner"}, 10, False),                    # Lillenhof
        "bjornby": ("div", {"class": "content d-md-block d-none wrap-content"}, 10, False),
    }

    # Sites that are recognised but deliberately yield no text.
    if site_key in ("lbaeks.", "ww.paul"):
        return []

    if site_key == "home.dk":  # Home: the text sits in the first <p> of each block
        blocks = soup.find_all("div", {"class": "text"})
        return [block.p.text.replace("\n", "").strip()
                for block in blocks
                if len(block) > 1 and block.p is not None]

    if site_key == "www.lok":  # Lokalbolig: glue all long paragraphs together
        paragraphs = soup.find_all("p")
        long_texts = [par.text.replace("\n", "").strip()
                      for par in paragraphs if len(par.text) > 100]
        return [''.join(long_texts)]

    if site_key not in rules:
        return None

    tag_name, attrs, min_len, drop_cr = rules[site_key]
    fragments = []
    for tag in soup.find_all(tag_name, attrs):
        # len(tag) counts the tag's direct children.
        if len(tag) > min_len:
            text = tag.text.replace("\n", "")
            if drop_cr:
                text = text.replace("\r", "")
            fragments.append(text.strip())
    return fragments


def get_reviews(df, start=100, stop=110):
    """
    Scrape the broker's description text for a slice of listings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``estateUrl`` column holding the broker page URLs.
    start, stop : int, optional
        Positional slice ``[start:stop]`` of ``df["estateUrl"]`` to process.
        The defaults (100, 110) reproduce the previously hard-coded sample.

    Returns
    -------
    list
        Exactly one entry per processed link, in order: the description as a
        string (several matching page elements are joined with a space), or
        ``np.nan`` when the broker is unknown, is known to expose no text,
        or nothing matched on the page.

    Notes
    -----
    The broker is recognised from ``link[8:15]``, so the keys depend on the
    exact scheme/host spelling of the stored URLs. Unknown brokers are
    reported with a printed message.
    """
    bodys = []

    for link in tqdm.tqdm(df["estateUrl"].values[start:stop]):
        response = requests.get(link)
        soup = BeautifulSoup(response.text, "html.parser")

        fragments = _broker_fragments(soup, link[8:15])
        if fragments is None:
            print(link,"not in loop")
            fragments = []

        # Always append exactly one value so the result stays aligned with
        # the rows it was scraped from.
        bodys.append(" ".join(fragments) if fragments else np.nan)

    return bodys

def read_boliga(csv):
    """
    Load the Boliga dataset from a CSV file.

    Parameters
    ----------
    csv : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The data exactly as read; no further preprocessing is applied.

    Raises
    ------
    KeyError
        If the file has no ``price`` column.
    """
    df = pd.read_csv(csv)

    # Explicit schema check; replaces an unused array copy of the price
    # column whose only observable effect was this KeyError.
    if "price" not in df.columns:
        raise KeyError("price")

    return df


In [362]:
# Load the previously saved listing data from the CSV file in the working directory.
df = read_boliga("house_data.csv")
# Scrape the broker description texts for the sampled rows; as the last
# expression of the cell, the returned list is displayed as the cell output.
get_reviews(df)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




['Det berømte Nyhavn er en af Københavns allerstørste turistattraktioner, hvor historiske, farvestrålende og spændende huse kan opleves, hyggelige restauranter besøges og de fascinerende både betragtes. På trods af den livlige atmosfære, gemmer der sig bag glasporten i Nyhavn 31 en skøn og rolig baggård, hvor lige præcis denne drømmelejlighed er placeret. \xa0En lille trappe sørger for, at I får jeres helt egen private indgang til den højtbeliggende stuelejlighed. I træder ind i det store leverum, hvor køkken og stue ligger i en pragtfuld, lys og åben kombination. Hvide vægge, gulve og loft samt blottede bjælker pryder indretningen, og tilfører et moderne, rå udtryk, som inviterer til sociale sammenkomster, harmoniske aftener foran brændeovnen samt lækker madlavning i det flotte køkken.\xa0Det store, indretningsvenlige soveværelse kan prale med et stort indbygget skab, som I også finder i entréen, så I ikke behøver at bekymre jer om opbevaringsplads. Det sidste stop på turen er det ele