# Predicting House Prices in Zagreb, Croatia

#### Importing the required libraries

In [1]:
import os
import re
import urllib.request

import folium
import geocoder
import html2text
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

### Part 1: Preparing the data

#### html2text helper function

In [2]:
def to_text(html, rehtml=False):
    """Convert HTML markup to plain text with html2text.

    Links, images, anchors and emphasis markup are all suppressed, and
    leading/trailing spaces, tabs and line breaks are stripped.

    Parameters
    ----------
    html
        Markup handed to ``HTML2Text.handle``.
    rehtml : bool, optional
        When true, newlines in the converted text are turned into ``<br/>``
        and backslashes are removed.

    Returns
    -------
    str
        The converted (and optionally re-HTML-ified) text.
    """
    converter = html2text.HTML2Text()
    # Applied in the same order the options were originally assigned.
    converter_options = {
        'wrap_links': False,
        'skip_internal_links': True,
        'inline_links': True,
        'ignore_anchors': True,
        'ignore_images': True,
        'ignore_emphasis': True,
        'ignore_links': True,
    }
    for option_name, option_value in converter_options.items():
        setattr(converter, option_name, option_value)

    plain = converter.handle(html).strip(' \t\n\r')
    if not rehtml:
        return plain
    return plain.replace('\n', '<br/>').replace('\\', '')

#### Loading Neighborhoods.xlsx with neighborhood latitudes and longitudes

In [39]:
# The spreadsheet is read without a header row, so column labels are assigned
# explicitly right after loading.
NEIGHBORHOOD_COLUMNS = ['Neighborhood', 'Latitude', 'Longitude', 'Search']
df = pd.read_excel('Neighborhoods.xlsx', header=None)
df.columns = NEIGHBORHOOD_COLUMNS
df

Unnamed: 0,Neighborhood,Latitude,Longitude,Search
0,Podsljemenska zona,45.853512,15.950810,podsljemenska%20zona
1,Donji Grad,45.809355,15.977507,donji%20grad
2,Markusevec,45.875910,16.014813,markusevec
3,Maksimir,45.819612,16.015636,maksimir
4,Pantovcak,45.829362,15.961522,pantovcak
5,Crnomerec,45.822023,15.934625,crnomerec
6,Dubrava,45.830212,16.057348,dubrava
7,Centar,45.813092,15.977190,centar
8,Sestine,45.851802,15.950142,sestine
9,Bukovac,45.837831,16.010611,bukovac


#### Parsing www.gohome.hr for house prices

In [17]:
# Search slugs from the 'Search' column, one per row, in row order.
list_of_zones = [df.loc[row, 'Search'] for row in range(df.shape[0])]

# Parallel accumulators filled by the scraping loop: neighborhood name,
# price ("cijena") and floor area, all still as raw strings.
neigh, cijena, m_2 = [], [], []

In [18]:
# Scrape up to 20 result pages per search slug and append one
# (neighborhood, price, area) triple per listing to neigh / cijena / m_2.
# Listings whose text does not match the expected layout are recorded as the
# string 'NaN' in all three lists so the lists stay parallel.
#
# NOTE(review): the mojibake in the literal pattern below (and in the later
# diacritics clean-up) suggests the site is not really Latin-1, presumably
# ISO-8859-2. The decode is left unchanged because the downstream cells match
# on the Latin-1 rendering.
for zone in list_of_zones:
    print(zone)
    for page in range(1, 21):
        url = 'https://www.gohome.hr/nekretnine.aspx?q=ku%e6a%20zagreb%20'+zone+'&str='+str(page)

        # Context managers close the HTTP response and both file handles
        # deterministically instead of leaving them to the garbage collector.
        with urllib.request.urlopen(url) as req:
            article = req.read().decode('latin-1')

        # The page is written to disk and read back in text mode, as before;
        # local.html always holds the most recently fetched page.
        with open('local.html', 'w', encoding='latin-1') as fo:
            fo.write(article)
        with open('local.html', encoding='latin-1') as fi:
            article = fi.read()

        text = to_text(article)

        # Offset just past each 'Kuæa, ZAGREB, ' marker, i.e. where the
        # neighborhood name of a listing begins.
        listing_starts = [m.end() for m in re.finditer('Kuæa, ZAGREB, ', text)]
        if not listing_starts:
            # No listings on this page: stop paging for this zone.
            break

        for start in listing_starts:
            fields = ('NaN', 'NaN', 'NaN')
            # A usable listing mentions 'm2' at least twice within the next
            # 100 characters; the fields of interest precede the first one.
            m2_hits = [m.start() for m in re.finditer('m2', text[start:start+100])]
            if len(m2_hits) >= 2:
                end = start + m2_hits[0] - 1
                commas = [m.start() for m in re.finditer(',', text[start:end])]
                # Exactly three commas delimit name, price and area.
                if len(commas) == 3:
                    fields = (
                        text[start:start+commas[0]],
                        text[start+commas[0]+2:start+commas[1]-2],
                        text[start+commas[1]+2:start+commas[2]],
                    )
            neigh.append(fields[0])
            cijena.append(fields[1])
            m_2.append(fields[2])

podsljemenska%20zona
donji%20grad
markusevec
maksimir
pantovcak
crnomerec
dubrava
centar
sestine
bukovac
trnje
medvescak
jarun
jelenovac
mlinovi
podsused
mikulici
donja%20kustosija
volovcica
tresnjevka
laniste
gracani
vrapce
gospocak
stenjevec
precko
zagreb%20jug
malesnica
kustosija
rudes
remetinec
stupnik
staglisce
gajnice
gornje%20vrapce
zitnjak
sveta%20klara
ravnice
remete
borongaj
salata
borcec
sveti%20duh
kruge
dugave
vrhovec
retkovec
ferenscica
granesina
tuskanac
gornji%20bukovac
vrbik
vukomerec
borovje
spansko
folnegovicevo
sopot
botinec
kajzerica
bizek
savski%20gaj
britanski%20trg
trokut
ksaver
brezovica
trnsko
siget
srednjaci
vrbani
savica


#### Correcting diacritical signs

In [33]:
# Scraped spelling (Croatian diacritics as they appear after the Latin-1
# decode) -> ASCII spelling used in the neighborhoods spreadsheet.
ASCII_NEIGHBORHOOD_NAMES = {
    'Marku¹evec': 'Markusevec',
    'Mikuliæi': 'Mikulici',
    '©estine': 'Sestine',
    'Graèani': 'Gracani',
    'Tu¹kanac': 'Tuskanac',
    'Èrnomerec': 'Crnomerec',
    'Stagli¹æe': 'Staglisce',
    'Medve¹èak': 'Medvescak',
    'Jaku¹evac': 'Jakusevac',
    'Rude¹': 'Rudes',
    '®itnjak': 'Zitnjak',
    'Borèec': 'Borcec',
    'Feren¹èica': 'Ferenscica',
    'Vrapèe': 'Vrapce',
    'Pantovèak': 'Pantovcak',
    'Lani¹te': 'Laniste',
    'Kusto¹ija': 'Kustosija',
    'Gospoèak': 'Gospocak',
    'Preèko': 'Precko',
    'Donja Kusto¹ija': 'Donja Kustosija',
    '©alata': 'Salata',
    'Grane¹ina': 'Granesina',
    '©pansko': 'Spansko',
    'Volovèica': 'Volovcica',
    'Sigeèica': 'Sigecica',
    'Male¹nica': 'Malesnica',
    'Gornje Vrapèe': 'Gornje Vrapce',
    'Folnegoviæevo': 'Folnegovicevo',
    'Tre¹njevka': 'Tresnjevka',
}

cijena_final = []
neigh_final = []
m_2_final = []

for idx in range(len(cijena)):
    # Listings that failed to parse were stored as the string 'NaN'; skip them.
    if neigh[idx] == 'NaN':
        continue

    # The corrected spelling is also written back into `neigh`.
    neigh[idx] = ASCII_NEIGHBORHOOD_NAMES.get(neigh[idx], neigh[idx])

    cijena_final.append(cijena[idx])
    neigh_final.append(neigh[idx])
    m_2_final.append(m_2[idx])

In [51]:
# Prices contain '.' characters (presumably thousands separators) that are
# stripped before the integer conversion; areas are converted as they are.
cijena_final_int = list(map(lambda price: int(price.replace('.', '')), cijena_final))
m2_final_int = list(map(int, m_2_final))

#### Connecting houses with latitudes and longitudes in a dataframe

In [91]:
# Build the listings table in a single constructor call. Growing a DataFrame
# with pd.concat inside a loop copies the whole frame on every iteration.
# Passing `columns=` also keeps the cell working when no listing was scraped.
data = pd.DataFrame(
    list(zip(neigh_final, cijena_final_int, m2_final_int)),
    columns=['Neighborhood', 'Price', 'm2'],
)

# Identical (neighborhood, price, area) rows are treated as the same listing;
# the index is renumbered 0..n-1 afterwards.
data = data.drop_duplicates(keep='first').reset_index(drop=True)

# Float placeholders; the coordinates are filled in by the next cell.
data['Latitude'] = 0.0
data['Longitude'] = 0.0

In [94]:
# Copy each neighborhood's coordinates onto its listings with a vectorised
# lookup instead of a nested Python loop over (neighborhood, listing) pairs.
# keep='last' mirrors the loop, where a later spreadsheet row with the same
# name overwrote an earlier one. Listings whose neighborhood is not in the
# spreadsheet keep their existing Latitude/Longitude values.
coords = df.drop_duplicates('Neighborhood', keep='last').set_index('Neighborhood')
matched = data['Neighborhood'].isin(coords.index)
if matched.any():
    matched_names = data.loc[matched, 'Neighborhood']
    data.loc[matched, 'Latitude'] = matched_names.map(coords['Latitude']).values
    data.loc[matched, 'Longitude'] = matched_names.map(coords['Longitude']).values

In [113]:
# Write the current `data` frame to data.csv; with the default arguments
# to_csv also writes the index as the first (unnamed) column.
data.to_csv('data.csv')

In [117]:
# Display the listings frame as the cell output.
data

Unnamed: 0,Neighborhood,Price,m2,Latitude,Longitude
0,Podsljemenska zona,259000,330,45.853512,15.950810
1,Podsljemenska zona,280000,200,45.853512,15.950810
2,Markusevec,490000,250,45.875910,16.014813
3,Markusevec,190000,212,45.875910,16.014813
4,Mikulici,150000,300,45.845897,15.928902
5,Podsljemenska zona,600000,236,45.853512,15.950810
6,Sestine,315000,125,45.851802,15.950142
7,Podsljemenska zona,240000,119,45.853512,15.950810
8,Ksaver,269000,189,45.831233,15.976994
9,Sestine,999000,330,45.851802,15.950142


#### Visualizing neighborhoods

In [118]:
address = 'Zagreb'

# NOTE(review): the user agent string looks like a leftover from another
# project; it is kept unchanged here, but confirm it identifies this
# application as the Nominatim usage policy expects.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `None.latitude`.
if location is None:
    raise RuntimeError('Nominatim returned no result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Zagreb are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Zagreb are 45.813177, 15.977048.


In [137]:
# Base map centred on the geocoded coordinates.
map_zagreb = folium.Map(location=[latitude, longitude], zoom_start=10)

# Styling shared by every neighborhood marker.
marker_options = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row of `df`, with the neighborhood name as its popup.
neighborhood_rows = zip(df['Latitude'], df['Longitude'], df['Neighborhood'])
for marker_lat, marker_lng, marker_name in neighborhood_rows:
    marker = folium.CircleMarker(
        [marker_lat, marker_lng],
        popup=folium.Popup('{}'.format(marker_name), parse_html=True),
        **marker_options)
    marker.add_to(map_zagreb)

map_zagreb

In [138]:
# Foursquare credentials come from the environment so they never have to be
# typed into the notebook source. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; both fall back to ''
# when unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20191023' # Foursquare API version

# Report only whether each credential is present. Printing the values would
# store them in the saved notebook output.
print('Your credentials:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'MISSING'))
print('CLIENT_SECRET: ' + ('set' if CLIENT_SECRET else 'MISSING'))

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


#### Getting venues for their respective neighborhoods

In [139]:
# function that extracts the category of the venue

LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 1000 # define radius

def getNearbyVenues(names, latitudes, longitudes, radius=radius, timeout=30):
    """Fetch venues around each location from the Foursquare explore endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel iterables; one request is made per (name, lat, lng) triple.
    radius : optional
        Value sent as the ``radius`` query parameter. Defaults to the
        module-level ``radius`` at definition time.
    timeout : optional
        Seconds passed to ``requests.get`` as ``timeout`` so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Surface bad credentials or quota errors here, not as a KeyError
        # while digging through the JSON payload.
        response.raise_for_status()

        # A location with no recommendations may come back without groups.
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # A venue without categories is kept with a missing category
            # instead of raising IndexError.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing `columns=` keeps the frame well-formed when venue_rows is empty.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [140]:
# One Foursquare lookup per row of the neighborhoods frame.
zagreb_venues = getNearbyVenues(
    df['Neighborhood'],
    df['Latitude'],
    df['Longitude'],
)

Podsljemenska zona
Donji Grad
Markusevec
Maksimir
Pantovcak
Crnomerec
Dubrava
Centar
Sestine
Bukovac
Trnje
Medvescak
Jarun
Jelenovac
Mlinovi
Podsused
Mikulici
Donja Kustosija
Volovcica
Tresnjevka
Laniste
Gracani
Vrapce
Gospocak
Stenjevec
Precko
Zagreb Jug
Malesnica
Kustosija
Rudes
Remetinec
Stupnik
Staglisce
Gajnice
Gornje Vrapce
Zitnjak
Sveta Klara
Ravnice
Remete
Borongaj
Salata
Borcec
Sveti Duh
Kruge
Dugave
Vrhovec
Retkovec
Ferenscica
Granesina
Tuskanac
Gornji Bukovac
Vrbik
Vukomerec
Borovje
Spansko
Folnegovicevo
Sopot
Botinec
Kajzerica
Bizek
Savski Gaj
Britanski Trg
Trokut
Ksaver
Brezovica
Trnsko
Siget
Srednjaci
Vrbani
Savica
Jakusevac
Sigecica


#### Assigning the same venues to each of the houses in the same neighborhood

In [143]:
# one hot encoding
zagreb_onehot = pd.get_dummies(zagreb_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
zagreb_onehot['Neighborhood'] = zagreb_venues['Neighborhood'] 

# move neighborhood column to the first column
cols = zagreb_onehot.columns.tolist()
cols.insert(0, cols.pop(cols.index('Neighborhood')))
zagreb_onehot = zagreb_onehot.reindex(columns= cols)

In [158]:
# Venue counts per category for every neighborhood, with 'Neighborhood'
# restored as an ordinary first column.
zagreb_onehot_grouped = zagreb_onehot.groupby('Neighborhood').sum().reset_index()

In [194]:
# Every column of the grouped frame except the first ('Neighborhood') is a
# venue category; add each one to `data` as a float column of zeros.
columns = zagreb_onehot_grouped.columns[1:]
for venue_category in columns:
    data[venue_category] = np.zeros(len(data))

In [200]:
# Give every listing the venue counts of its neighborhood with one
# vectorised lookup instead of a Python loop over every
# (listing, neighborhood) pair. keep='last' mirrors the loop, where a later
# row with the same name would have overwritten an earlier one. Listings
# whose neighborhood has no venue row keep their current values.
venue_counts = (
    zagreb_onehot_grouped
    .drop_duplicates('Neighborhood', keep='last')
    .set_index('Neighborhood')
)
matched = data['Neighborhood'].isin(venue_counts.index)
if matched.any():
    matched_names = data.loc[matched, 'Neighborhood']
    # .values drops the neighborhood index so rows are assigned positionally.
    data.loc[matched, columns] = venue_counts.loc[matched_names, columns].values.astype(float)

In [202]:
# Write the current `data` frame to data_complete.csv. The index is written
# as the first, unnamed column; the cell that reloads this file drops that
# column again as 'Unnamed: 0'.
data.to_csv('data_complete.csv')

In [210]:
# Display the listings frame, now including the venue-category columns.
data

Unnamed: 0,Neighborhood,Price,m2,Latitude,Longitude,Accessories Store,Airport Service,American Restaurant,Art Gallery,Art Museum,...,Train Station,Tram Station,Tree,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vineyard,Whisky Bar,Wine Bar,Women's Store,Zoo
0,Podsljemenska zona,259000,330,45.853512,15.950810,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Podsljemenska zona,280000,200,45.853512,15.950810,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Markusevec,490000,250,45.875910,16.014813,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Markusevec,190000,212,45.875910,16.014813,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Mikulici,150000,300,45.845897,15.928902,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Podsljemenska zona,600000,236,45.853512,15.950810,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Sestine,315000,125,45.851802,15.950142,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Podsljemenska zona,240000,119,45.853512,15.950810,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Ksaver,269000,189,45.831233,15.976994,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Sestine,999000,330,45.851802,15.950142,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Part 2: Linear Regression

In [73]:
# Reload the saved frame and discard the index column that to_csv wrote.
data = pd.read_csv('data_complete.csv').drop('Unnamed: 0', axis=1)
neighborhood = data['Neighborhood'].to_frame()

# Everything after the first five columns is a candidate feature (per the
# table shown earlier, the first five are Neighborhood, Price, m2, Latitude
# and Longitude).
columns = list(data.columns[5:])

#### Taking as features only the venues that correlate the most with house prices

In [74]:
# Keep the N_FEATURES candidate columns whose absolute Pearson correlation
# with Price is largest, strongest first.
#
# Each correlation is computed once and the columns are then ranked. The
# previous greedy loop recomputed every correlation on each of its 40 passes,
# and reused a stale `target` (or raised) when fewer than 40 columns had a
# usable correlation.
N_FEATURES = 40

price_corr = {}
for name in columns:
    kor = abs(data['Price'].corr(data[name]))
    # `kor > 0` is False for NaN (e.g. a constant column) and for an exact
    # zero, so such columns are never selected, as before.
    if kor > 0:
        price_corr[name] = kor

# sorted() is stable, so tied columns keep their original order.
selected = sorted(price_corr, key=price_corr.get, reverse=True)[:N_FEATURES]

df = data[selected]
data = pd.concat([df, data['m2'], data['Price']], axis=1)

# All columns except the trailing 'm2' and 'Price'.
columns = data.columns
columns = columns[:-2]

In [75]:
# Fit a linear regression on random ~80/20 splits until the test-set R^2
# reaches SCORE_TARGET or MAX_ATTEMPTS splits have been tried.
#
# NOTE(review): keeping the first split whose *test* score clears a threshold
# selects on the test set, so the score reported afterwards is optimistic and
# not an unbiased estimate. The scaler is also fitted on all rows before
# splitting. Both are kept to preserve the notebook's approach, but a single
# fixed split or cross-validation would be the sound choice.
SCORE_TARGET = 0.65
MAX_ATTEMPTS = 1000   # upper bound so the cell always terminates
SPLIT_SEED = 42       # makes the sequence of random splits reproducible

# The design matrix does not depend on the split, so it is built once.
alles = pd.concat([data[columns], data['m2']], axis=1)
alles_poly = PolynomialFeatures(degree=1).fit_transform(alles)
alles_preprocessed = preprocessing.StandardScaler().fit(alles_poly).transform(alles_poly.astype(float))
alles_final = pd.concat([pd.DataFrame(alles_preprocessed), data['Price']], axis=1)
del alles, alles_poly, alles_preprocessed

# Every column except the trailing 'Price' is a model input.
features = alles_final.columns[:-1]

split_rng = np.random.RandomState(SPLIT_SEED)
skor = 0
attempts = 0

while skor < SCORE_TARGET and attempts < MAX_ATTEMPTS:
    attempts += 1

    # Each row goes to the training set with probability 0.8.
    msk = split_rng.rand(len(alles_final)) < 0.8
    if msk.all() or not msk.any():
        # Degenerate split with an empty train or test set; draw again.
        continue
    train = alles_final[msk]
    test = alles_final[~msk]

    X_train = train[features]
    X_test = test[features]
    y_train = train['Price']
    y_test = test['Price']

    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    # y_hat, y and skor are read by the final-score cell.
    y_hat = regr.predict(X_test)
    x = np.asanyarray(X_test)
    y = np.asanyarray(y_test)

    # R^2 on the held-out rows.
    skor = regr.score(x, y)
    if skor > 0:
        print(skor)

if skor < SCORE_TARGET:
    print('Stopped after %d splits without reaching a score of %.2f' % (attempts, SCORE_TARGET))

0.5397524386907793
0.6021878003247347
0.5107309054751661
0.5333232393560303
0.6000501872819425
0.6004946056267912
0.548123662652309
0.5622772078661171
0.5351075462382182
0.5254340839229701
0.5792939763188905
0.553844212883768
0.5476191890510113
0.49458459418347006
0.6030148983389866
0.4958174802762859
0.5478435695779977
0.4990442126145697
0.49376062274367105
0.5839464031309871
0.5760198327062891
0.5735514910466135
0.5378502255742155
0.4863954736259286
0.5742323732203262
0.5388416851458524
0.5088432923903741
0.5271740929751062
0.5300885405270745
0.5729911269709196
0.5646792189260608
0.48424905708951016
0.5577826056700168
0.5128492589455563
0.5631159349478417
0.5194069029710744
0.5390186535875723
0.5282737637974287
0.5249604387729776
0.5790544401527473
0.5766439042823728
0.5650318985979401
0.5253502088271863
0.5301003080165154
0.5186806420763541
0.5295639522841136
0.5153433499947218
0.48761875506514596
0.5461110534998792
0.5152852954800803
0.5196166900382408
0.5133079192070829
0.53985689

0.5904150949533364
0.5122283861696458
0.5070592364422778
0.5506274563904928
0.5393806245771897
0.5093201640141161
0.5197294623897744
0.5292727034039234
0.5986029631533413
0.5258067114112701
0.5430729360113828
0.556050733962917
0.5448283982060396
0.5582938127694025
0.4955051714205787
0.590142064998281
0.5651587137857046
0.5114347458475905
0.5587603520953253
0.5252015347474482
0.5160414761958156
0.5459534298583348
0.493662888941076
0.5122673926512734
0.5762094257741658
0.5350439025174358
0.5540255695027145
0.46619889035361706
0.4974239814274598
0.4860781136054509
0.558283003544699
0.5077322682487307
0.5366480161339025
0.6102339675845081
0.5577911517360885
0.5160046721517393
0.5563212417517426
0.561711264477933
0.57979045248154
0.5177019410621626
0.4841896535269409
0.6275620914691578
0.5770897789702971
0.5481802456581321
0.522232403753841
0.5550163491019477
0.5379810844475881
0.4802226923727225
0.4871715794098643
0.5022289509697166
0.5453640939060393
0.48045344857692274
0.5441024600402984

#### Final score

In [77]:
# np.mean of the squared residuals is the mean squared error on the test
# rows, not a residual *sum* of squares, so the label says so.
print("Mean squared error: %.2f" % np.mean((y_hat - y) ** 2))

# `skor` is the value returned by regr.score on the test rows (R^2).
print('Variance score: %.2f' % skor)

Residual sum of squares: 47278303818.68
Variance score: 0.65
