<a href="https://colab.research.google.com/github/AndreassOlsson/Basic-AI-ML-exploration/blob/main/Webscraping_and_Datahandling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collecting

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import lxml
import re
from scipy import stats

In [None]:
def _toInt(text):
  """Return `text` converted to int, or None when it is not a plain integer."""
  try:
    return int(text)
  except ValueError:
    return None


def _parseSize(token):
  """Turn the first token of the area line (e.g. '85\xa0m²') into whole square metres.

  Returns None when no number can be read from the token.
  """
  cleaned = token.replace('\xa0m²', '').replace('\xa0', '').replace(',', '.').replace('rum', '')
  # Presumably a '+' separates the main area from an extra area ('85+10');
  # keep only the first figure instead of gluing the digits together (8510).
  cleaned = cleaned.split('+')[0]
  try:
    return int(float(cleaned))
  except ValueError:
    return None


def _parseSaleDate(text):
  """Convert text such as 'Såld 5 januari 2022' into 'year-month-day' order."""
  months = {
      'januari': '01', 'februari': '02', 'mars': '03', 'april': '04',
      'maj': '05', 'juni': '06', 'juli': '07', 'augusti': '08',
      'september': '09', 'oktober': '10', 'november': '11', 'december': '12',
  }
  # Strip before turning spaces into dashes so stray leading/trailing
  # whitespace cannot end up inside the date.
  text = re.sub(r'\s+ ', '', text).replace('Såld ', '').strip().replace(' ', '-')
  for monthName, monthNumber in months.items():
    text = text.replace(monthName, monthNumber)
  parts = text.split('-')
  # Only reorder when the text really split into day, month and year.
  if len(parts) == 3:
    parts[0], parts[2] = parts[2], parts[0]
  return '-'.join(parts)


def _parseListing(house):
  """Extract the fields of one `sold-property-listing` element into a dict.

  Fields whose markup is missing or unparseable are returned as None.
  """
  # Start every optional field at None so a listing that lacks some markup
  # never inherits the values parsed for the previous listing.
  size = room = saledate = city = district = typeofproperty = None

  name = house.select('.sold-property-listing__heading.qa-selling-price-title')
  name = name[0].text.strip() if len(name) > 0 else None

  sizeInfo = house.select('.sold-property-listing__subheading.sold-property-listing__area')
  if len(sizeInfo) > 0:
    tokens = re.sub('\n', ' ', sizeInfo[0].text).replace('  ', '').split(' ')
    tokens = [token for token in tokens if token != '' and token != '\xa0']
    if tokens:
      size = _parseSize(tokens[0])
      room = tokens[-1].replace('\xa0rum', '') if '\xa0rum' in tokens[-1] else None

  fee = house.select('div.sold-property-listing__size > div.sold-property-listing__fee')
  fee = _toInt(re.sub(r'\s+', '', fee[0].text).replace('kr/mån', '')) if len(fee) > 0 else None

  endprice = house.select('div.sold-property-listing__price > div.sold-property-listing__subheading')
  endprice = _toInt(re.sub(r'\s+', '', endprice[0].text).replace('Slutpris', '').replace('kr', '')) if len(endprice) > 0 else None

  soldDate = house.select('div.sold-property-listing__price > div.sold-property-listing__sold-date')
  if len(soldDate) > 0:
    saledate = _parseSaleDate(soldDate[0].text)

  siteInfo = house.select('div.sold-property-listing__info > div.sold-property-listing__location > div')
  if len(siteInfo) > 0:
    location = siteInfo[0]
    locText = location.text
    iconSpan = location.span
    if iconSpan is not None:
      # The property type is read from the <title> element inside the first span.
      if iconSpan.title is not None:
        typeofproperty = iconSpan.title.text
      locText = locText.replace(iconSpan.text, '')

    locInfo = re.sub(r'\s+', ' ', locText).strip().split(', ')
    # With a single location part there is no separate district: use the name.
    district = locInfo[0] if len(locInfo) > 1 else name
    city = locInfo[-1]

  return {
      "name": name,
      "Size": size,
      "Room": room,
      "Fee": fee,
      "Endprice": endprice,
      "KvMPrice": None,  # filled in later by the cleaning step
      "Saledate": saledate,
      "City": city,
      "District": district,
      "TypeOfProperty": typeofproperty,
  }


def scrape(townIds, maxListingsForTown=50, outputPath='housedata2022.csv'):
  """Scrape sold-property listings from hemnet.se and write them to a CSV file.

  Args:
    townIds: Iterable of Hemnet location ids, one search per id.
    maxListingsForTown: Listings wanted per town. One result page is
      requested per 50 listings, and always at least one page.
    outputPath: Path of the CSV file to write.

  Returns:
    None. The listings are written to `outputPath`, one row per listing.
  """
  columns = ["name", "Size", "Room", "Fee", "Endprice", "KvMPrice",
             "Saledate", "City", "District", "TypeOfProperty"]
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
  numPages = max(1, int(maxListingsForTown / 50))

  rows = []
  for townId in townIds:
    # NOTE(review): pages are requested as page=0..numPages-1. Confirm whether
    # the site's pagination is 1-based; if so page 0 may repeat page 1.
    for page in range(numPages):
      url = f"https://www.hemnet.se/salda/bostader?location_ids%5B%5D={townId}&page={page}"
      # A timeout keeps one stalled request from hanging the whole scrape.
      r = requests.get(url, headers=headers, timeout=30)
      soup = BeautifulSoup(r.content, 'lxml')

      for house in soup.find_all("div", {"class": 'sold-property-listing'}):
        rows.append(_parseListing(house))

  # Explicit columns keep the CSV header even when nothing was scraped;
  # dtype=object writes the values exactly as parsed (no int -> float upcast).
  df = pd.DataFrame(rows, columns=columns, dtype=object)
  df.to_csv(outputPath, index=False)

In [None]:
# Hemnet location ids to scrape; scrape() puts each one in the search URL's
# `location_ids[]` query parameter.
towns = ['17821','17925','18037','17975','17898','17972']
# 250 listings per town means scrape() requests 250 / 50 = 5 result pages per id.
scrape(towns, maxListingsForTown=250)

# Data handling

In [None]:
def getMode_of_duplicateListingFeature(df, listing, feature):
  """Return the most common value of `feature` among rows named `listing`.

  Args:
    df: DataFrame with a 'name' column and a `feature` column.
    listing: Value of 'name' selecting the rows to look at.
    feature: Column whose mode is wanted.

  Returns:
    The most frequent non-null value; ties resolve to the first entry of
    the sorted mode list (the smallest value). None when no row matches or
    every matching value is null.
  """
  values = df.loc[df['name'] == listing, feature]
  # Series.mode() works for numeric and string columns alike and does not
  # depend on the return shape of scipy.stats.mode, which differs between
  # SciPy versions (indexing its result with [0][0] fails on a scalar).
  modes = values.mode()
  return modes.iloc[0] if len(modes) > 0 else None

In [None]:
def fillHoles(df, feature):
  """Fill missing `feature` values from other rows that share the same name.

  A value counts as missing when `pd.to_numeric` cannot parse it. For each
  name that has at least one missing value, the most common known value of
  rows with that name is written into the missing rows. Rows for which no
  such value exists (including rows without a name) are dropped.

  Args:
    df: DataFrame with a 'name' column and a `feature` column.
    feature: Name of the column to repair.

  Returns:
    A new DataFrame; the frame passed in is left unmodified.
  """
  isMissing = pd.to_numeric(df[feature], errors='coerce').isnull()
  missingRows = df[isMissing]
  knownRows = df[~isMissing]

  # Work on a copy so the caller's frame is never partially modified.
  result = df.copy()

  # A row without a name cannot be matched to another listing; drop it.
  toDrop = list(missingRows.index[missingRows['name'].isnull()])

  for name in sorted(missingRows['name'].dropna().unique()):
    rowIds = missingRows.index[missingRows['name'] == name]
    mode = getMode_of_duplicateListingFeature(knownRows, name, feature)

    if mode is None:
      toDrop.extend(rowIds)
    else:
      result.loc[rowIds, feature] = mode

  return result.drop(toDrop)

In [None]:
def switchPlaces(hData):
  """Swap 'Size' and 'Room' on every row where Size is smaller than Room.

  Args:
    hData: DataFrame with comparable 'Size' and 'Room' columns.

  Returns:
    The same DataFrame object, modified in place.
  """
  # Assigning to the row yielded by iterrows() changes a copy and never
  # reaches the frame, so the swap is done on whole columns instead.
  swapped = hData['Size'] < hData['Room']
  size = hData['Size'].copy()
  room = hData['Room'].copy()
  hData['Size'] = size.where(~swapped, room)
  hData['Room'] = room.where(~swapped, size)
  return hData

In [None]:
def calc_kvmPrice(hData):
  """Fill 'KvMPrice' with Endprice / Size, truncated to an integer.

  The column is only computed when every 'Endprice' and every 'Size' value
  is numeric; otherwise the frame is returned untouched.

  Args:
    hData: DataFrame with 'Endprice' and 'Size' columns.

  Returns:
    The same DataFrame object.
  """
  badPrices = pd.to_numeric(hData['Endprice'], errors='coerce').isnull().sum()
  badSizes = pd.to_numeric(hData['Size'], errors='coerce').isnull().sum()

  # Any unparseable price or size disables the calculation for all rows.
  if badPrices != 0 or badSizes != 0:
    return hData

  for rowId, listing in hData.iterrows():
    hData.loc[rowId, 'KvMPrice'] = int(listing['Endprice'] / listing['Size'])
  return hData

In [None]:
def formatColumns(hData, columns):
  """Title-case the given columns and keep only the text before the first '-' or '/'.

  Args:
    hData: DataFrame to modify in place.
    columns: Names of the columns to format.

  Returns:
    The same DataFrame object.
  """
  def formatValue(value):
    # Null values are passed through unchanged.
    if pd.isnull(value):
      return value
    # Non-string values (e.g. a 0 used as filler for missing data) are
    # converted first; calling .title() on them directly would raise.
    text = str(value).title()
    return re.split('-', text.replace('/', '-'))[0]

  for column in columns:
    hData[column] = hData[column].apply(formatValue)
  return hData

In [None]:
def clean_districts(hData):
  """Replace unusable 'District' values with the listing's 'name'.

  A district is replaced when its first run of digits is a five-digit
  number (treated as a postcode), or when it is the filler value 0.

  Args:
    hData: DataFrame with 'District' and 'name' columns; modified in place.

  Returns:
    The same DataFrame object.
  """
  def needsReplacement(district):
    text = str(district)
    # '0' is the filler written for missing values.
    if text == '0':
      return True
    firstNumber = re.search(r'\d+', text)
    # int() drops leading zeros before the digits are counted.
    return firstNumber is not None and len(str(int(firstNumber.group()))) == 5

  # Write through .loc: assigning to the row yielded by iterrows() only
  # changes a copy and the frame would stay as it was.
  replace = hData['District'].apply(needsReplacement).astype(bool)
  hData.loc[replace, 'District'] = hData.loc[replace, 'name']
  return hData

In [None]:
def clean_typeofproperty(hData):
  """Map every 'TypeOfProperty' value onto a fixed set of categories.

  The first rule whose fragment occurs in the value decides the category;
  values matching no rule become 'Other'.

  Args:
    hData: DataFrame with a 'TypeOfProperty' column; modified in place.

  Returns:
    The same DataFrame object.
  """
  # Order matters: 'Par'/'Rad'/'Kedje' must be tested before the generic
  # 'hus', otherwise 'Radhus' and 'Kedjehus' would be filed as 'Fritidshus'.
  rules = (
      ('rätt', 'Bostadsrättslägenhet'),
      ('Lägenhet', 'Bostadsrättslägenhet'),
      ('Par', 'Radhus'),
      ('Rad', 'Radhus'),
      ('Kedje', 'Radhus'),
      ('illa', 'Villa'),
      ('ård', 'Gård'),
      ('hus', 'Fritidshus'),
  )

  def categorize(value):
    text = str(value)
    for fragment, category in rules:
      # The fragment must occur in the value, not the other way round.
      if fragment in text:
        return category
    return 'Other'

  hData['TypeOfProperty'] = hData['TypeOfProperty'].apply(categorize)
  return hData

In [None]:
def cleanHousingData(hData):
  """Run the full cleaning pipeline on a listings frame and return the result.

  Steps, in order:
    1. fillHoles on 'Size' then 'Room': a missing value is taken from other
       rows with the same name, otherwise the row is removed.
    2. Cast 'Size' and 'Room' to int.
    3. switchPlaces: handle rows whose Size is smaller than their Room.
    4. Replace remaining NaN values with 0.
    5. calc_kvmPrice: price per square metre from Endprice and Size.
    6. formatColumns on 'District' and 'TypeOfProperty'.
    7. clean_districts: replace postcode-like or empty districts.
    8. clean_typeofproperty: reduce property types to a fixed category set.
  """
  for column in ('Size', 'Room'):
    hData = fillHoles(hData, column)

  hData = hData.astype({'Size': int, 'Room': int}, errors='raise')
  hData = switchPlaces(hData)

  # NaN only; other placeholder strings are not touched by this call.
  hData.fillna(value=0, inplace=True)

  remainingSteps = (
      calc_kvmPrice,
      lambda frame: formatColumns(frame, ['District', 'TypeOfProperty']),
      clean_districts,
      clean_typeofproperty,
  )
  for step in remainingSteps:
    hData = step(hData)

  return hData

In [None]:
# Load the CSV written by scrape(), clean it and save the cleaned copy
# next to it.
hData22 = pd.read_csv('housedata2022.csv')
cleanHousingData(hData22).to_csv('housedata2022_clean.csv', index=False)

In [None]:
# Apply the same cleaning to a second data set.
# NOTE(review): the path looks like a Google Drive mount inside Colab, so
# this cell presumably only works there with Drive mounted; confirm.
hData = pd.read_csv('/content/drive/MyDrive/Assignments/2/housedata.csv')
cleanHousingData(hData).to_csv('housedata_clean.csv', index=False)