In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tqdm.notebook as tqdm

### Scraping Boliga website for data

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import json

def houses_boliga(number_houses):
    """
    Returns a list of all ids for houses on boliga
    """
    house_id = list()
    url = "https://www.boliga.dk/resultat"
    
    for i in range(int(number_houses/50)):
        new_url = url + f"?page={i}"
        response = requests.get(new_url)
        html = response.text
        soup = BeautifulSoup(html,"html.parser")
        ids = soup.find_all("a",{"class":"house-list-item"})
        link_houses = list()

        for link in ids:
            link_houses.append(re.findall("(/\d{4,}/)",link["href"])[0].replace("/",""))
        
        house_id.extend(link_houses)
        
    return house_id

def get_info(id_list):
    
    """
    Take an list with ids of houses on boliga and gets specific data about these ids
    
    """
    all_df = list()
    new_keys = ["registeredArea","downPayment","estateUrl","currentArchiveId","forSaleNowId",
                "foreclosureId","selfsaleEstateId","cleanStreet","estateId","latitude","longitude",
               "propertyType","priceChangePercentTotal","energyClass","price","rooms","size","lotSize",
               "floor","buildYear","city","isActive","municipality","zipCode","street",
                "squaremeterPrice","daysForSale","createdDate","basementSize","views"]
    
    for house_id in id_list:
        response = requests.get(f'https://api.boliga.dk/api/v2/estate/{house_id}')
        response = response.json()
        df_dict = {key: response[key] for key in new_keys}
        df = pd.DataFrame(df_dict,index=[0])
        all_df.append(df)

    df = pd.concat(all_df,axis=0,ignore_index=True)
    
    return df

def get_reviews(df):
    bodys = list()
    i=0
    for link in tqdm.tqdm(df["estateUrl"].values[50:70]):
        i += 1
        response = requests.get(link)
        html = response.text
        soup = BeautifulSoup(html,"html.parser")
        
        if link[8:15] =="home.dk": #Home
            ids = soup.find_all("div",{"class":"text"},"p")
            bodys.extend([x.p.text.replace("\n","").strip().lower() for x in ids if len(x)>1])
        elif link[8:15] =="ww.skbo": #skbolig
            ids = soup.find_all("div",{"class":"listing-text"})
            bodys.extend([sk.text.replace("\n","").replace("\r","").strip().lower() for sk in ids if len(sk)>1])
        elif link[8:15] == "www.nyb": #Nybolig
            ids = soup.find_all("div",{"class":"foldable-spot__container"})
            bodys.extend([ny.text.replace("\n","").strip().lower() for ny in ids if len(ny)>1])
        elif link [8:15] == "ww.elto": #Eltoft Nielsen
            ids = soup.find_all("br")
            bodys.extend([elto.text.replace("\n","").strip().lower() for elto in ids if len(elto)>1])
        elif link[8:15] == "www.cla": #Claus Borg
            ids = soup.find_all("div",{"id":"case_content"})
            bodys.extend([cla.text.replace("\n","").strip().lower() for cla in ids if len(cla)>1])
        elif link[8:15] == "www.lok": #Lokalbolig
            ids = soup.find_all("p")
            loka = [lok.text.replace("\n","").strip().lower() for lok in ids if len(lok.text)>100]
            bodys.extend([''.join(loka)])
        elif link[8:15] == "www.edc": #EDC Bolig
            ids = soup.find_all("div",{"class":"description"})
            bodys.extend([edc.text.replace("\n","").strip().lower() for edc in ids if len(edc)>1])
        elif link[8:15] == "adamsch": #Adam Schnack
            ids = soup.find_all("div",{"class":"listing-text"})
            bodys.extend([adam.text.replace("\n","").strip().lower() for adam in ids if len(adam)>1])
        elif link[8:15] == "www.est": #Estate
            ids = soup.find_all("div",{"class":"property-description"})
            bodys.extend([est.text.replace("\n","").strip().lower() for est in ids if len(est)>1])
        elif link[8:15] == "www.bri": #Brikk Ejendomme
            ids = soup.find_all("div",{"class":"prop-user-content"})
            bodys.extend([bri.text.replace("\n","").strip().lower() for bri in ids if len(bri)>1])
        elif link[8:15] == "www.rea": #Realmæglerne
            ids = soup.find_all("div",{"class":"text-full"})
            bodys.extend([rea.text.replace("\n","").strip().lower() for rea in ids if len(rea)>1])
        elif link[8:15] == "danboli": #Danbolig
            ids = soup.find_all("div",{"class":"db-description-block"})
            bodys.extend([dan.text.replace("\n","").strip().lower() for dan in ids if len(dan)>1])
        elif link[8:15] == "ww.lili": #Lillenhof
            ids = soup.find_all("div",{"class":"inner"})
            bodys.extend([dan.text.replace("\n","").strip().lower() for dan in ids if len(dan)>10])
        elif link[8:15] == "lbaeks.": #elbæks
            bodys.append(np.nan)
        elif link[8:15] == "ww.paul":
            bodys.append(np.nan)
        elif link[8:15] == "bjornby":
            ids = soup.find_all("div",{"class":"content d-md-block d-none wrap-content"})
            bodys.extend([bjor.text.replace("\n","").strip() for bjor in ids if len(bjor)>10])
        else:
            print(link,"not in loop")
            
            
        if len(bodys) != i:
            bodys.append(np.nan)
        
    
    return bodys

def preprocess_csv(csv):
    """
    This function loads the dataset from boliga annd preproccesses it.
    """
    df = pd.read_csv(csv)
    y = np.array(df["price"])
    
    
    
    return df


def preprocess_text():
    
    
    return None


In [3]:
df = preprocess_csv("house_data.csv")
len(np.array(df["estateUrl"].values))

52093

In [4]:
location = list()
for lon,lat in zip(df["longitude"],df["latitude"]):
    location.append((lon,lat))
    
df["location"] = location

In [5]:
df_station = pd.read_csv("stationsdata")
df_station.head()

Unnamed: 0.1,Unnamed: 0,lon,lat
0,0,55.658215,8.73094
1,1,55.751927,8.717152
2,2,54.937063,9.612413
3,3,55.429704,10.488886
4,4,55.602028,8.805512


In [6]:
from geopy.distance import geodesic
from tqdm.notebook import tqdm
min_dist = list()

for location in tqdm(df["location"]):
    distance = list()
    for lon,lat in zip(df_station["lon"],df_station["lat"]):
        distance.append(geodesic((lat,lon), location).km)
    
    min_dist.append(min(distance))
    
min_dist

HBox(children=(FloatProgress(value=0.0, max=52093.0), HTML(value='')))




[1.7360179575421781,
 1.7599107466485626,
 1.6985941601752452,
 1.576182942534151,
 1.6952297456120298,
 1.697058381663335,
 1.572494800989399,
 1.595909022264598,
 1.6614126836525682,
 1.523151624363795,
 1.5582461287835425,
 1.5825878020282191,
 1.4602834433598806,
 1.4808322446922364,
 1.4602834433598806,
 2.197738801584652,
 2.093570552651923,
 2.1600863573892317,
 2.1600863573892317,
 2.1506699430361045,
 2.1589535910364113,
 2.153088474676,
 2.195304741731849,
 2.319037798241087,
 2.3458043652669662,
 2.3143120623545452,
 2.3523227854318307,
 2.5280565473892165,
 2.4651726975241197,
 2.5717946755367005,
 2.6858365156487336,
 2.4236319640383512,
 2.47452348266927,
 2.47452348266927,
 2.47452348266927,
 2.4561308046660084,
 2.2819290385818327,
 2.2819290385818327,
 2.489592789123424,
 2.3453584032827917,
 2.490609018188982,
 2.3848268571176425,
 2.5256564209339114,
 2.744255721439346,
 2.5530839971253205,
 2.5637011508386207,
 2.5646551978623915,
 2.5726924304659797,
 2.57635794250

In [7]:
df["dist_station"] = min_dist
df.to_csv("house_data_staion.csv")

In [11]:
from nltk.corpus import wordnet as wn
wn.synsets("hund", lang="dan")

[Synset('dog.n.01')]

In [12]:
synonyms = []
antonyms = []
for syn in wordnet.synsets("hund",lang="dan"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

In [13]:
synonyms

['dog', 'domestic_dog', 'Canis_familiaris']

In [8]:
def preprocess_text(string):

   

    #Removes all, if any, html tags

    cleanr = re.compile("<.*?>")

    string = re.sub(cleanr,"",string)

   

    #Removes everything but words

    cleanword = re.compile('[^a-zA-Z]')

    string = re.sub(cleanword,"",string)

   

    #Lemmatizer strings

    lemmatizer = lemmy.load("da")

    string = lemmatizer.lemmatize("",string)

   

    return string

 

from collections import Counter

 

def words_count(list_of_strings):

    counts = dict()

    sentences = list_of_strings


    for word in sentences:

        if word in counts.keys:

            counts[word] += 1

        else:

            counts[word] = 1

           

    return counts