## TODOS

- Impute values where #rooms is missing
- Add more features
- Add data exploration

inspo -> https://towardsdatascience.com/web-scraping-apartment-listings-in-stockholm-3fcebacf8be6

In [4]:
# imports
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm as tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pickle

In [48]:
# Retrieving the hrefs of the links in the page
url = 'https://www.booli.se/slutpriser/stockholm/1?page=1'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, 'html.parser')
select = soup.find('div', class_="_2m6km uC2y2 _3oDFL")
links = select.find_all('a')
hrefs = [link.get('href') for link in links]

In [49]:
# function to calculate number of pages that can be scraped
def get_nr_of_pages(url):
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, 'html.parser')
    select = soup.find('div', class_="EuKIv _36W0F")
    nr_of_objects = select.find('span').text.split(' ')[3]
    nr_of_pages = int(nr_of_objects) // 35 + 1
    return nr_of_pages

get_nr_of_pages(url)

5987

In [50]:
# Retrieving the hrefs of the links in the page
nr_of_objects = 4000
hrefs = []

for i in range(1,nr_of_objects//35+2):
    url = 'https://www.booli.se/slutpriser/stockholm/1?page=' + str(i)
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, 'html.parser')
    select = soup.find('div', class_="_2m6km uC2y2 _3oDFL")
    links = select.find_all('a')
    links_in_this_page = [link.get('href') for link in links]
    hrefs.extend(links_in_this_page)
    hrefs = hrefs[0:nr_of_objects]

print('Collected', len(hrefs), 'links from', i, 'pages')

Collected 4000 links from 115 pages


In [51]:
def get_apartment_info(url_suffix):
    url = 'https://www.booli.se' + url_suffix
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, 'html.parser')
    select = soup.find('div', class_="_2epd7 _12LuH")
    infobox_1 = select.find('div', class_="_2epd7")
    infobox_2 = soup.find('div', class_="_2IyrD _36W0F _16dH_")

    # extracting name, price, size and area from left infobox.
    name = infobox_2.find('h1').text
    price = infobox_2.find('h2').text
    size_and_area = infobox_2.findAll('h4')
    size = size_and_area[0].text
    area = size_and_area[1].text

    # print(name)
    # print(price)
    # print(size)
    # print(area)

    # extracting asking price
    try:
        asking_price_div = infobox_1.select('div:contains("Utropspris")')[0]
        asking_price = asking_price_div.find('div', class_="_18w8g").text
        # print(asking_price, '\n')
    except:
        asking_price = 'N/A'
        # print('No asking price')
        # print('\n')

    return [name, price, size, area, asking_price, url]

In [5]:
# Creating a dataframe with the scraped data, ignoring rows without relevant data

df = pd.DataFrame(columns=['name', 'price', 'size', 'area', 'asking_price', 'url'])

for i in tqdm(range(nr_of_objects)):
    try:
        df.loc[i] = get_apartment_info(hrefs[i])
    except:
        print('Error with', hrefs[i], 'at', i)

NameError: name 'nr_of_objects' is not defined

In [140]:
# saving the dataframe to a csv file
# df.to_csv('housing_info.csv', index=False)

In [6]:
# reading the csv file
df = pd.read_csv('housing_info.csv')
print('Length of dataframe:', len(df))
df_copy = df.head().copy()

Length of dataframe: 3638


## Clean the data

In [None]:
def clean_data(row):
    
    # convert price to int and delete ' kr'
    row['price'] = int(row['price'].replace(' kr', '').replace(' ', ''))

    # split up size into size and #rooms
    size_and_rooms = row['size'].replace('½', '.5').split(',')
    if len(size_and_rooms) > 1:
        row['size'] = float(size_and_rooms[0].split(' ')[0])
        row['rooms'] = float(size_and_rooms[1].split('rum')[0])
    else:
        row['size'] = int(size_and_rooms[0].split(' ')[0])
        row['rooms'] = 'N/A'

    # split up area into area and house type
    area_and_house_type = row['area'].split(',')
    row['area'] = area_and_house_type[1]
    row['type'] = area_and_house_type[0]

    return row

df = df.apply(clean_data, axis=1)

In [20]:
df['price_per_m2'] = df['price'] / df['size']
df['price_per_m2'] = df['price_per_m2'].round(2)

In [22]:
# count average price per m2 for each area
print('Nr of unique areas:', len(df['area'].unique()))
df_average_price_in_area = df.groupby('area')['price_per_m2'].mean().reset_index().round(0)
df_average_price_in_area.rename(columns={'price_per_m2': 'price_per_m2_avg'}, inplace=True)
df_average_price_in_area.sort_values(by='price_per_m2_avg', ascending=False, inplace=True)
df_average_price_in_area.head(10)


Nr of unique areas: 256


Unnamed: 0,area,price_per_m2_avg
205,Södermalm-Högalid,163333.0
202,Södermalm Mosebacke,128409.0
219,Vasastan Odenplan,125631.0
140,Norr Mälarstrand,124589.0
252,Östermalm,124132.0
196,Södermalm - Reimersholme,123864.0
62,Hagastaden Vasastan,123571.0
50,Gamla Stan,122633.0
198,Södermalm / Mosebacke,122078.0
221,Vasastan-Sibirien,121639.0


In [23]:
import json

# convert df_average_price_in_area to a dictionary
average_price_in_area = df_average_price_in_area.set_index('area').T.to_dict('list')

#export to json
with open('average_price_in_area.json', 'w') as f:
    json.dump(average_price_in_area, f)

# apply lambda function to df to get average price per m2 for each area
df['area_price_per_m2'] = df.apply(lambda row: average_price_in_area[row['area']][0], axis=1)

In [24]:
df.head()

Unnamed: 0,name,price,size,area,asking_price,rooms,type,price_per_m2,area_price_per_m2
0,Jämtlands­gatan 152,2600000,78.0,Hässelby-Vällingby,,3.0,Lägenhet,33333.33,57544.0
1,Jämtlands­gatan 97,2800000,66.0,Vällingby Parkstad,2 795 000 kr,2.0,Lägenhet,42424.24,44364.0
2,Dala­gatan 86G,3320000,34.0,Vasastan,3 395 000 kr,1.0,Lägenhet,97647.06,113740.0
3,Valhalla­vägen 155,6500000,67.0,Östermalm Nedre Gärdet,6 200 000 kr,2.5,Lägenhet,97014.93,101733.0
4,Rådmans­gatan 1B,10850000,81.0,Östermalm,11 250 000 kr,3.0,Lägenhet,133950.62,124132.0


Since we don't want to use categorical data, we can replace the area with the average selling price / m2.

In [11]:
# find unique values in area
print(len(df['area'].unique()))

# find number of nan values in rooms
print(len(df[df['rooms']=='N/A']))

# drop nan values in rooms (for now), change this later
df = df[df['rooms']!='N/A']

256
0


## Preparing for training model

In [12]:
df_x = df[['size', 'rooms', 'area_price_per_m2']]
df_y = df['price']

x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=42)

In [13]:
#XGB Regressor 
rf_regressor = RandomForestRegressor(n_estimators=20) #max_depth=15
rf_regressor.fit(x_train, y_train)
y_pred = rf_regressor.predict(x_test)

In [14]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))

Mean Absolute Error: 701312.3266321853


In [15]:
from sklearn.model_selection import cross_val_score

# use cross validation to find the best number of estimators
scores = cross_val_score(rf_regressor, x_train, y_train, cv=5)
print('Cross validation scores:', scores)
print('Mean cross validation score:', scores.mean())


Cross validation scores: [0.87302593 0.89321212 0.87264059 0.88137574 0.87777733]
Mean cross validation score: 0.8796063414242253


In [16]:
x_test.head(1)

Unnamed: 0,size,rooms,area_price_per_m2
415,58.0,2.0,98282.0


In [17]:
rf_regressor.predict(x_test.head(1))

array([6591125.])

In [38]:
html_page = requests.get('https://www.booli.se/annons/4744318')
soup = BeautifulSoup(html_page.text, 'html.parser')
select = soup.find('div', class_="_2IyrD _36W0F _16dH_")

name = select.find('h1').text
price = select.find('h2').text
size_and_area = select.findAll('h4')
size = size_and_area[0].text
area = size_and_area[1].text

# split up size into size and #rooms
size_and_rooms = size.replace('½', '.5').split(',')
if len(size_and_rooms) > 1:
    size = float(size_and_rooms[0].split(' ')[0])
    rooms = float(size_and_rooms[1].split('rum')[0])
else:
    size = int(size_and_rooms[0].split(' ')[0])
    rooms = 'N/A'

# split up area into area and house type
area_and_house_type = area.split(',')
area = area_and_house_type[1]

# load local json file
with open('average_price_in_area.json') as f:
    average_price_in_area = json.load(f)


df = pd.DataFrame(columns=['size', 'rooms','area'])
df.loc[0] = [size, rooms, area]
df
# apply lambda function to df to get average price per m2 for each area
df['area_price_per_m2'] = df.apply(lambda row: average_price_in_area[row['area']][0], axis=1)

IndexError: list index out of range

In [36]:
df['area_price_per_m2'][0]

105185.0

In [20]:
pickle.dump(rf_regressor, open('rf_regressor.pkl', 'wb'))


In [19]:
type(x_test.head(1))

pandas.core.frame.DataFrame

In [37]:
num = 10000000
print(f"{num:,}")

10,000,000
