# Packages

In [35]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm.notebook import tqdm
from fake_useragent import UserAgent
import re
import urllib.parse
import urllib.request
import random
import shutil
import csv
from csv import writer
from pathlib import Path
from urllib.request import urlopen
import json
from geopy.geocoders import GoogleV3, Nominatim
from datetime import date

In [36]:
ua = UserAgent()

## Functions 

In [37]:
def flatten(A):
    rt = []
    for i in A:
        if isinstance(i,list): rt.extend(flatten(i))
        else: rt.append(i)
    return rt

In [38]:
def append_list_as_row_list(file_name, list_of_elem):
    with open(file_name, 'a+', newline='', encoding='utf-8-sig') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(list_of_elem)

# Link Generation

In [39]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}


pages = set()
def getLinks(pageUrl):
    global pages
    html = requests.get(pageUrl, headers = headers)
    bsObj = BeautifulSoup(html.text, 'html.parser')
    for link in bsObj.findAll("a", href=re.compile("https://myrealty.am/en/apartments-for-sale/763700\?page=\d+$")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                pages.add(newPage)
                getLinks(newPage)
getLinks("https://myrealty.am/en/apartments-for-sale/763700")

# Get Individual Links

In [40]:
links = []

for link in tqdm(pages):
    header = {'User-Agent':str(ua.random)}
    content = requests.get(link, headers = header)
    soup = BeautifulSoup(content.content, 'html.parser')
    
    aptrent_container = soup.find('div', {'class':'row no-gutters items-list'})
    btf_aptrent_container = BeautifulSoup(str(aptrent_container), 'html.parser')
    
    links_raw = btf_aptrent_container.find_all('a', {'class':'btn btn-pink-transparent btn-cs text-uppercase item-more-btn ml-auto'})
    links_clean = [i['href'] for i in links_raw]
    links.append(links_clean)

links = flatten(links)

  0%|          | 0/117 [00:00<?, ?it/s]

In [41]:
len(links)

4195

# Scrape Data

## Create Empty CSV 

In [45]:
columns = ['ID', 'Location', 'Region', 'District', 'Street', 'Price', 'Price/sqm', 'Views', 'Area', 'Room', 'Floor', 'Storeys',
           'Bathrooms', 'BuildType', 'CeilingHeight', 'Condition', 'Date Added', 
           'Date Edited', 'Date Scraped', 'Latitude', 'Longitude', 'Address',
           'Facilities', 'Additional Info', 'url']

data = pd.DataFrame(columns=columns)
data.to_csv('apt.sale.0805.csv', index=False)

## Scrape & Store 

In [46]:
for apt in tqdm(links):
    header = {'User-Agent':str(ua.random)}
    content = requests.get(apt, headers = header)
    
    soup = BeautifulSoup(content.content, 'html.parser')
    
    try:
        test_id = soup.find('div', {'class':'item-view-id'})
        if test_id:
            ID = test_id.text.strip().split(' ')[1]
        else:
            ID = None

        test_location = soup.find('div', {'class':'col-auto item-view-address pl-0 mb-2 mt-1'})
        if test_location:
            Location = test_location.text.strip()
            Region = test_location.text.strip().split(',')[0]
            District = test_location.text.strip().split(',')[1]
            Street = test_location.text.strip().split(',')[2]
        else:
            Location = None
            Region = None
            District = None
            Street = None

        test_price = soup.find('div', {'class':'item-view-price'})
        if test_price:
            Price = test_price.text.strip().split('/')[0].replace(",", "")
            Price_sq = test_price.text.strip().split('/')[1].replace("SQ. M. ", "").replace(",", "")
        else:
            Price = None
            Price_sq = None

        test_views = soup.find('span', {'class':'item-view-count'})
        if test_views:
            Views = test_views.text.strip()
        else:
            Views = None

        details = [i.text for i in soup.find('div', {'class': 'item-view-price-params'}).findAll('span')]
        if details:
            Area = details[0].split()[0]
            Rooms = details[1].replace("+", "")
            Floor = details[2].split('/')[0]
            Storeys = details[2].split('/')[1]
        else:
            Area = None
            Rooms = None
            Floor = None
            Storeys = None

        params = [i.text for i in soup.findAll('div', {'class': 'col-5'})]
        if params:
            Bathrooms = params[1].split()[0].replace("+", "")
            BuildType = params[2].split()[0]
            CeilingHeight = params[3].split()[0]
            Condition = params[4].strip()
        else:
            Bathrooms = None
            BuildType = None
            CeilingHeight = None
            Condition = None
            
        date_id = [i.text for i in soup.findAll('div', {'class': 'row no-gutters flex-column item-view-date'})]
        if date_id:
            DateAdded = str(date_id).split()[2]
            DateEdited = str(date_id).split()[5]
        else:
            DateAdded = None
            DateEdited = None
        DateScraped = date.today().strftime("%d.%m.%Y")
        
        
        test_coord = soup.find("div", {'id': 'yandex_map_item_view'}).attrs
        if test_coord:
            Latitude = test_coord['data-lat']
            Longitude = test_coord['data-lng']
            geolocator1 = Nominatim(user_agent="coordinateconverter")
            location1 = geolocator1.reverse("{}, {}".format(Latitude, Longitude))
            Address = location1.address
            
        else: 
            Latitude = None
            Longitude = None
            Address = None
            
        test_facilities = [i.text for i in soup.findAll('div', {'class': 'row item-view-facilities mb-4'})[0].findAll('label')]
        if test_facilities:
            Facilities = test_facilities
        else:
            Facilities = None

        test_info = [i.text for i in soup.findAll('div', {'class': 'row item-view-facilities mb-4'})[1].findAll('label')]
        if test_info:
            Additional = test_info
        else:
            Additional = None
            
        url = apt
            
    except:
        continue
    content = [ID, Location, Region, District, Street, Price, Price_sq, Views, Area, Rooms, Floor, Storeys,
               Bathrooms, BuildType, CeilingHeight, Condition, DateAdded, DateEdited, DateScraped,
               Latitude, Longitude, Address, Facilities, Additional, url]
    append_list_as_row_list('apt.sale.0805.csv', content)

  0%|          | 0/4195 [00:00<?, ?it/s]

In [47]:
df = pd.read_csv('apt.sale.0805.csv')
len(df)

2830