In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.distance import geodesic
import tqdm.notebook as tqdm
from collections import Counter
import lemmy

### Scraping Boliga website for data

In [13]:
import requests
from bs4 import BeautifulSoup
import re
import json

def houses_boliga(number_houses):
    """
    Returns a list of all ids for houses on boliga
    """
    house_id = list()
    url = "https://www.boliga.dk/resultat"
    
    for i in range(int(number_houses/50)):
        new_url = url + f"?page={i}"
        response = requests.get(new_url)
        html = response.text
        soup = BeautifulSoup(html,"html.parser")
        ids = soup.find_all("a",{"class":"house-list-item"})
        link_houses = list()

        for link in ids:
            link_houses.append(re.findall("(/\d{4,}/)",link["href"])[0].replace("/",""))
        
        house_id.extend(link_houses)
        
    return house_id

def get_info(id_list):
    
    """
    Take an list with ids of houses on boliga and gets specific data about these ids
    
    """
    all_df = list()
    new_keys = ["registeredArea","downPayment","estateUrl","currentArchiveId","forSaleNowId",
                "foreclosureId","selfsaleEstateId","cleanStreet","estateId","latitude","longitude",
               "propertyType","priceChangePercentTotal","energyClass","price","rooms","size","lotSize",
               "floor","buildYear","city","isActive","municipality","zipCode","street",
                "squaremeterPrice","daysForSale","createdDate","basementSize","views"]
    
    for house_id in id_list:
        response = requests.get(f'https://api.boliga.dk/api/v2/estate/{house_id}')
        response = response.json()
        df_dict = {key: response[key] for key in new_keys}
        df = pd.DataFrame(df_dict,index=[0])
        all_df.append(df)

    df = pd.concat(all_df,axis=0,ignore_index=True)
    
    return df

def get_reviews(df):
    bodys = list()
    #Finder alle ejendomsmæglere, som har mere end 100 huse til salg
    for value in df["estateUrl"].values:
        estates.append(value[8:15])
    numbers = dict(Counter(estates))
    over_100 = dict() 
    for key, value in numbers.items():
        if value > 100:
            over_100[key] = value
    #Kører igennem alle links og finder tilhørende beskrivelse
    for link in tqdm.tqdm(df["estateUrl"].values):
        try:
            response = requests.get(link)
            html = response.text
            soup = BeautifulSoup(html,"html.parser")
            
            if link[8:15] =="home.dk": #Home
                ids = soup.find_all("div",{"class":"text"},"p")
                bodys.extend([x.p.text.replace("\n","").strip().lower() for x in ids[0:1] if len(x)>1])
            elif link[8:15] =="ww.skbo": #skbolig
                ids = soup.find_all("div",{"class":"listing-text"})
                bodys.extend([sk.text.replace("\n","").replace("\r","").strip().lower() for sk in ids[0:1] if len(sk)>1])
            elif link[8:15] == "www.nyb": #Nybolig
                ids = soup.find_all("div",{"class":"foldable-spot__container"})
                bodys.extend([ny.text.replace("\n","").strip().lower() for ny in ids[0:1] if len(ny)>1])
            elif link [8:15] == "ww.elto": #Eltoft Nielsen
                ids = soup.find_all("br")
                bodys.extend([elto.text.replace("\n","").strip().lower() for elto in ids[0:1] if len(elto)>1])
            elif link[8:15] == "www.cla": #Claus Borg
                ids = soup.find_all("div",{"id":"case_content"})
                bodys.extend([cla.text.replace("\n","").strip().lower() for cla in ids[0:1] if len(cla)>1])
            elif link[8:15] == "www.lok": #Lokalbolig
                ids = soup.find_all("p")
                loka = [lok.text.replace("\n","").strip().lower() for lok in ids if len(lok.text)>100]
                bodys.extend([''.join(loka)])
            elif link[8:15] == "www.edc": #EDC Bolig
                ids = soup.find_all("div",{"class":"description"})
                bodys.extend([edc.text.replace("\n","").strip().lower() for edc in ids[0:1] if len(edc)>1])
            elif link[8:15] == "adamsch": #Adam Schnack
                ids = soup.find_all("div",{"class":"listing-text"})
                bodys.extend([adam.text.replace("\n","").strip().lower() for adam in ids[0:1] if len(adam)>1])
            elif link[8:20] == "www.estate.d": #Estate
                ids = soup.find_all("div",{"class":"property-description"})
                bodys.extend([est.text.replace("\n","").strip().lower() for est in ids[0:1] if len(est)>1])
            elif link[8:15] == "www.bri": #Brikk Ejendomme
                ids = soup.find_all("div",{"class":"prop-user-content"})
                bodys.extend([bri.text.replace("\n","").strip().lower() for bri in ids[0:1] if len(bri)>1])
            elif link[8:15] == "www.rea": #Realmæglerne
                ids = soup.find_all("div",{"class":"text-full"})
                bodys.extend([rea.text.replace("\n","").strip().lower() for rea in ids[0:1] if len(rea)>1])
            elif link[8:15] == "danboli": #Danbolig
                ids = soup.find_all("div",{"class":"db-description-block"})
                bodys.extend([dan.text.replace("\n","").strip().lower() for dan in ids[0:1] if len(dan)>1])
            elif link[8:15] == "ww.lili": #Lillenhof
                ids = soup.find_all("div",{"class":"inner"})
                bodys.extend([dan.text.replace("\n","").strip().lower() for dan in ids[0:1] if len(dan)>10])
            elif link[8:15] == "bjornby":
                ids = soup.find_all("div",{"class":"content d-md-block d-none wrap-content"})
                bodys.extend([bjor.text.replace("\n","").strip() for bjor in ids[0:1] if len(bjor)>10])
            elif link[8:15] == 'www.hov': #Hovmand
                ids = soup.find_all("div",{"class":"column"})
                bodys.extend([hov.text.replace("\n","").strip() for hov in ids[0:1] if len(hov)>1])
            elif link[8:15] == 'ww.jesp': #Jesper Nielsen
                ids = soup.find_all("div",{"class":"case-description"})
                bodys.extend([jesp.text.replace("\n","").strip() for jesp in ids[0:1] if len(jesp)>1])
            elif link[8:15] == "www.sel": #Selvsalg
                ids = soup.find_all("div",{"class":"tab-pane active fade in"})
                bodys.extend([selv.text.replace("\n","").strip() for selv in ids[0:1] if len(selv)>1])
            elif link[8:15] == "www.bol": #Bolig
                ids = soup.find_all("div",{"class":"description col-md-16"})
                bodys.extend([bol.text.replace("\n","").strip() for bol in ids[0:1] if len(bol)>1])
            elif link[8:15] == 'www.joh': #Johns
                ids = soup.find_all("div",{"class":"column"})
                bodys.extend([john.text.replace("\n","").strip() for john in ids[0:1] if len(john)>1])
            elif link[8:15] == "racking": #Robinhus
                ids = soup.find_all("div",{"class":"text-container"})
                bodys.extend([robin.text.replace("\n","").strip() for robin in ids[0:1] if len(robin)>1])
            elif link[8:15] == "www.min": #minbolighandel
                ids = soup.find_all("div",{"class":"description col-md-16"})
                bodys.extend([minb.text.replace("\n","").strip() for minb in ids[0:1] if len(minb)>1])
            elif link[8:15] == "ww.unni": #Unnibolig
                ids = soup.find_all("div",{"class":"column"})
                bodys.extend([un.text.replace("\n","").strip() for un in ids[0:1] if len(un)>1])
            elif link[8:15] == "www.sdb": #Sdb bolig
                ids = soup.find_all("div",{"class":"column"})
                bodys.extend([un.text.replace("\n","").strip() for un in ids[0:1] if len(un)>1])
            elif link[8:15] == "ww.land":#Landobolig
                ids = soup.find_all("div",{"class":"col-md-8"})
                bodys.extend([land.text.replace("\n","").strip() for land in ids[0:1] if len(land)>1])
            elif link[8:15] == "www.ber": #Bermistof
                ids = soup.find_all("div",{"class":"column"})
                bodys.extend([ber.text.replace("\n","").strip() for ber in ids[0:1] if len(ber)>1])
            elif link [8:20] == 'www.carlsber': #Carlsberg Byen
                ids = soup.find_all("div",{"itemprop":"description"})
                bodys.extend([car.text.replace("\n","").strip() for car in ids[0:1] if len(car)>1])
            elif link[8:15] == "www.car": #Carsten Nordbo
                ids = soup.find_all("div",{"class":"description col-md-16"})
                bodys.extend([car.text.replace("\n","").strip() for car in ids[0:1] if len(car)>1])
            elif link[8:15] == 'ww.agri': 
                ids = soup.find_all("div",{"class":"col-md-8 col-sm-7 hidden-xs text-box desktop"})
                bodys.extend([agr.text.replace("\n","").strip() for agr in ids[0:1] if len(agr)>1])
            elif link[8:15] == "www.pla":#Place2Live
                ids = soup.find_all("div",{"class":"col-lg-16"})
                bodys.extend([pla.text.replace("\n","").strip() for pla in ids[0:1] if len(pla)>1])
            elif link[8:15] == "www.vil": #Villadsenbolig
                ids = soup.find_all("div",{"class":"description col-md-16"})
                bodys.extend([vil.text.replace("\n","").strip() for vil in ids[0:1] if len(vil)>1])
            elif link[8:15] == 'maegler': #Mæglerhuset
                ids = soup.find_all("div",{"class":"case-text"})
                bodys.extend([mae.text.replace("\n","").strip() for mae in ids[0:1] if len(mae)>1])
            elif link[8:15] == 'ww.thom': #ThomasJørgensen
                ids = soup.find_all("div",{"class":"description col-md-16"})
                bodys.extend([thom.text.replace("\n","").strip() for thom in ids[0:1] if len(thom)>1])
            elif link[8:15] == 'www.htb': #HTbolig
                ids = soup.find_all("div",{"class":"left-side global-style"})
                bodys.extend([htb.text.replace("\n","").strip() for htb in ids[0:1] if len(htb)>1])
            elif link[8:15] == 'ww.boli': #Boligone
                ids = soup.find_all("div",{"class":"first-col"})
                bodys.extend([bol.text.replace("\n","").strip() for bol in ids[0:1] if len(bol)>1])
            elif link[8:15] == "www.mæg":#Mæglerringen
                ids = soup.find_all("div",{"class":"first-col"})
                bodys.extend([ma.text.replace("\n","").strip() for ma in ids[0:1] if len(ma)>1])
            elif link[8:15] == "ww.vest":
                ids = soup.find_all("div",{"class":"first-col"})
                bodys.extend([vest.text.replace("\n","").strip() for vest in ids[0:1] if len(vest)>1])
            elif link[8:15] == "www.tho": #Thorregård
                ids = soup.find_all("div",{"class":"annonce rammebaggrund"})
                bodys.extend([th.text.replace("\n","").strip() for th in ids[0:1] if len(th)>1])
            elif link[8:15] == "byggegr": #Byggegrund
                ids = soup.find_all("div",{"class":"section section-12"})
                bodys.extend([byg.text.replace("\n","").strip() for byg in ids[0:1] if len(byg)>1])
            elif link[8:15] == "grundsa": #Grundsalg
                bodys.append(np.nan)
            elif link[8:15] == "rundsal": #Grundsalg
                bodys.append(np.nan)
            elif link[8:15] =="ww.paul": #paulun
                bodys.append(np.nan)
            else:
                bodys.append(np.nan)
                if link[8:15] in over_100:
                    print("Missing", link[8:15])
        except:
            print(link,"virkede ikke")
            continue
    
    return bodys

def find_realtors(df):
    """
    This function finds all realtors, who has more that 100 houses for sale.
    Used to find the structure for all realtors of relevance
    """
    realtors_link = list()
    #Finder alle ejendomsmæglere, som har mere end 100 huse til salg
    estates = list()
    for value in df["estateUrl"].values:
        estates.append(value[8:15])
    numbers = dict(Counter(estates))

    over_100 = dict() 
    for key, value in numbers.items():
        if value > 100:
            over_100[key] = value
    already_accounted = list()
    for link in tqdm.tqdm(df["estateUrl"].values):        
        if link[8:15] in over_100.keys():
            if link[8:15] not in already_accounted:
                print(link,"not in loop")
                print(link[8:15])
                realtors_link.append(link[8:15])
                already_accounted.append(link[8:15])

    return realtors_link
    
def preprocess_csv(csv):
    """
    This function loads the dataset from boliga annd preproccesses it.
    """
    df = pd.read_csv(csv)
    y = np.array(df["price"])
    
    return df

def preprocess_text(string):
    lemmatizer = lemmy.load("da")
    string = lemmatizer.lemmatize("",string)[0].split()

    return string

def words_count(list_of_strings):
    sentences = list_of_strings
    counts = dict(Counter(sentences))
    return counts

def add_lonlat(df,df_station):
    min_dist = list()

    for location in tqdm(df["location"]):
        distance = list()
        for lon,lat in zip(df_station["lon"],df_station["lat"]):
            distance.append(geodesic((lat,lon), location).km)
    
        min_dist.append(min(distance))
        
    df["dist_station"] = min_dist
    return df

In [16]:
df = preprocess_csv("house_data.csv")
len(np.array(df["estateUrl"].values))

52093

In [7]:
df["dist_station"] = min_dist
df.to_csv("house_data_staion.csv")

In [17]:
find_realtors(df)

HBox(children=(FloatProgress(value=0.0, max=52093.0), HTML(value='')))

https://home.dk/boligkatalog/koebenhavn/1051/ejerlejligheder/nyhavn_31b_st_1480000269.aspx not in loop
home.dk
https://www.nybolig.dk/ejerlejlighed/1051/nyhavn/260405/msz5248 not in loop
www.nyb
https://www.lokalbolig.dk?sag=36-X0000157&mgl=18036 not in loop
www.lok
http://www.eltoftnielsen.dk/default.aspx?sagsnr=IEN6850&mgl=2429&DID=135&udbudsform=salg not in loop
ww.elto
https://www.edc.dk/alle-boliger/k%C3%B8benhavn-k/1063/laksegade-26-3-th/?sagsnr=11303124 not in loop
www.edc
https://www.estate.dk/ejerlejlighed/1068/nikolajgade/270210/ec131 not in loop
www.est
https://www.brikk.dk/ejendom/groennegade-43-1107-koebenhavn-k/ not in loop
www.bri
https://www.realmaeglerne.dk/bolig/148-0384-gothersgade-152-4-th not in loop
www.rea
https://danbolig.dk/bolig/koebenhavn/1129/ejerlejlighed/087ehb03745-087 not in loop
danboli
http://www.lilienhoff.dk/redirect.htm?sag=60000128&mgl=1976&DID=135&udbudsform=salg not in loop
ww.lili
http://www.paulun.dk/bolig-redirect?sagsnr=CI-MH087&mgl=2647&DID=

['home.dk',
 'www.nyb',
 'www.lok',
 'ww.elto',
 'www.edc',
 'www.est',
 'www.bri',
 'www.rea',
 'danboli',
 'ww.lili',
 'ww.paul',
 'www.sel',
 'www.car',
 'www.bol',
 'racking',
 'ww.boli',
 'ww.itva',
 'www.min',
 'ww.vest',
 'www.tho',
 'ww.thom',
 'www.pla',
 'ww.land',
 'www.joh',
 'www.mæg',
 'www.mik',
 'ww.agri',
 'grundsa',
 'www.mae',
 'rundsal',
 'byggegr',
 'www.htb',
 'maegler',
 'www.vil',
 'ww.calu']

In [117]:
"https://www.estate.dk/villa/5683/kildevej/270085/115819"[8:20]

'www.estate.d'

In [118]:
test ="https://www.estaldo.com/listing/c528a1bcf9b947a29fec7415d1087451"
test[8:20]

'www.estaldo.'

In [156]:
bodys = get_reviews(df)
bodys

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




['i det smukke grejsdalen finder i denne skønne villa, på højtbeliggende grund med god udsigt til skoven som omkranser hele området. grejsdalen byder bl.a. på skole, sportsforening, og så er der ikke langt til vejle centrum. motorvejsnettet ligger heller ikke langt væk, man skal blot køre op igennem grejs, og så er man ved motorvejen.\xa0villaen byder på hele 197 kvadratmeter bolig, fordelt på to plan. i stueplan træder i ind i et stort og lækkert bryggers med god plads til vaskemaskine og tørretumbler, samt god garderobeplads.\xa0herfra træder man ind i husets egentlige entre, hvorfra der er adgang til husets dejligt store badeværelse. fra entreen går man videre ind i fordelingsgangen hvor der er gæstetoilet, og trappe til første sal.\xa0forbi trappen træder man ind i køkkenet, som er i fantastisk god åben forbindelse til alrum, og også til den store lyse stue som ligger i forlængelse af alrummet.\xa0på første sal kommer i op til endnu en stue, og her er der også udgang til altan. her

In [11]:
over_100

NameError: name 'over_100' is not defined

In [157]:
all_text = list()
for string in bodys[0:3]:
    all_text.extend(preprocess_text(string))
    
test_counts = words_count(all_text)

In [158]:
test_counts

{'i': 31,
 'det': 5,
 'smukke': 1,
 'grejsdalen': 2,
 'finder': 3,
 'denne': 4,
 'skønne': 2,
 'villa,': 2,
 'på': 12,
 'højtbeliggende': 1,
 'grund': 1,
 'med': 15,
 'god': 7,
 'udsigt': 1,
 'til': 18,
 'skoven': 2,
 'som': 6,
 'omkranser': 1,
 'hele': 4,
 'området.': 2,
 'byder': 2,
 'bl.a.': 1,
 'skole,': 1,
 'sportsforening,': 1,
 'og': 31,
 'så': 5,
 'er': 25,
 'der': 9,
 'ikke': 5,
 'langt': 3,
 'vejle': 5,
 'centrum.': 1,
 'motorvejsnettet': 1,
 'ligger': 4,
 'heller': 1,
 'væk,': 1,
 'man': 15,
 'skal': 2,
 'blot': 1,
 'køre': 1,
 'op': 3,
 'igennem': 2,
 'grejs,': 1,
 'ved': 2,
 'motorvejen.': 1,
 'villaen': 1,
 '197': 1,
 'kvadratmeter': 1,
 'bolig,': 1,
 'fordelt': 2,
 'to': 3,
 'plan.': 3,
 'stueplan': 4,
 'træder': 3,
 'ind': 4,
 'et': 11,
 'stort': 1,
 'lækkert': 1,
 'bryggers': 1,
 'plads': 2,
 'vaskemaskine': 1,
 'tørretumbler,': 1,
 'samt': 8,
 'garderobeplads.': 1,
 'herfra': 1,
 'husets': 4,
 'egentlige': 1,
 'entre,': 1,
 'hvorfra': 1,
 'adgang': 3,
 'dejligt': 1,
 

In [52]:
df["estateUrl"].values[201]

'https://home.dk/boligkatalog/koebenhavn/1428/ejerlejligheder/andreas_bjoerns_gade_8_1_th_1160000295.aspx'

In [20]:

link = "http://www.paulun.dk/bolig-redirect?sagsnr=CI-MH087&mgl=2647&DID=135&udbudsform=salg"

response = requests.get(link)
html = response.text
soup = BeautifulSoup(html,"html.parser")
ids = soup.find_all("section",{"class":"description boligInfo innergrid left"})
print(len(ids))
for p in ids[0:1]:
    print(p.text)


0


In [217]:
link[8:15]

'ww.boli'

In [21]:
html

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> \n<html xmlns="http://www.w3.org/1999/xhtml"> \n<head> \n<title>IIS 10.0 Detailed Error - 406.0 - ModSecurity Action</title> \n<style type="text/css"> \n<!-- \nbody{margin:0;font-size:.7em;font-family:Verdana,Arial,Helvetica,sans-serif;} \ncode{margin:0;color:#006600;font-size:1.1em;font-weight:bold;} \n.config_source code{font-size:.8em;color:#000000;} \npre{margin:0;font-size:1.4em;word-wrap:break-word;} \nul,ol{margin:10px 0 10px 5px;} \nul.first,ol.first{margin-top:5px;} \nfieldset{padding:0 15px 10px 15px;word-break:break-all;} \n.summary-container fieldset{padding-bottom:5px;margin-top:4px;} \nlegend.no-expand-all{padding:2px 15px 4px 10px;margin:0 0 0 -12px;} \nlegend{color:#333333;;margin:4px 0 8px -12px;_margin-top:0px; \nfont-weight:bold;font-size:1em;} \na:link,a:visited{color:#007EFF;font-weight:bold;} \na:hover{text-decoration:none;} \nh1{font-size:2.4em;margin:0;