# Packages

In [33]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm.notebook import tqdm
from fake_useragent import UserAgent
import re
import urllib.parse
import urllib.request
import random
import shutil
import csv
from csv import writer
from pathlib import Path
from urllib.request import urlopen
import json
from geopy.geocoders import GoogleV3, Nominatim
from datetime import date

In [34]:
ua = UserAgent()

## Functions 

In [35]:
def flatten(A):
    rt = []
    for i in A:
        if isinstance(i,list): rt.extend(flatten(i))
        else: rt.append(i)
    return rt

In [36]:
def append_list_as_row_list(file_name, list_of_elem):
    with open(file_name, 'a+', newline='', encoding='utf-8-sig') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(list_of_elem)

# Link Generation

In [37]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}


pages = set()
def getLinks(pageUrl):
    global pages
    html = requests.get(pageUrl, headers = headers)
    bsObj = BeautifulSoup(html.text, 'html.parser')
    for link in bsObj.findAll("a", href=re.compile("https://myrealty.am/en/houses-for-sale/769857\?page=\d+$")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                pages.add(newPage)
                getLinks(newPage)
getLinks("https://myrealty.am/en/houses-for-sale/769857")

# Get Individual Links

In [38]:
links = []

for link in tqdm(pages):
    header = {'User-Agent':str(ua.random)}
    content = requests.get(link, headers = header)
    soup = BeautifulSoup(content.content, 'html.parser')
    
    aptrent_container = soup.find('div', {'class':'row no-gutters items-list'})
    btf_aptrent_container = BeautifulSoup(str(aptrent_container), 'html.parser')
    
    links_raw = btf_aptrent_container.find_all('a', {'class':'btn btn-pink-transparent btn-cs text-uppercase item-more-btn ml-auto'})
    links_clean = [i['href'] for i in links_raw]
    links.append(links_clean)

links = flatten(links)

  0%|          | 0/63 [00:00<?, ?it/s]

In [39]:
len(links)

2257

# Scrape Data

## Create Empty CSV 

In [40]:
columns = ['ID', 'Location', 'Region', 'District', 'Street', 'Price', 'Price/sqm', 'Views', 'Area',
           'Room', 'Storeys','Bathrooms', 'BuildType', 'CeilingHeight', 'Condition',
           'Date Added', 'Date Edited', 'Date Scraped', 'Latitude', 'Longitude', 'Address',
           'Facilities', 'Additional Info', 'url']
data = pd.DataFrame(columns=columns)
data.to_csv('house.sale.0805.csv', index=False)

## Scrape & Store 

In [41]:
for apt in tqdm(links):
    header = {'User-Agent':str(ua.random)}
    content = requests.get(apt, headers = header)
    
    soup = BeautifulSoup(content.content, 'html.parser')
    
    try:
        test_id = soup.find('div', {'class':'item-view-id'})
        if test_id:
            ID = test_id.text.strip().split(' ')[1]
        else:
            ID = None

        test_location = soup.find('div', {'class':'col-auto item-view-address pl-0 mb-2 mt-1'})
        if test_location:
            Location = test_location.text.strip()
            Region = test_location.text.strip().split(',')[0]
            District = test_location.text.strip().split(',')[1]
            Street = test_location.text.strip().split(',')[2]
        else:
            Location = None
            Region = None
            District = None
            Street = None

        test_price = soup.find('div', {'class':'item-view-price'})
        if test_price:
            Price = test_price.text.strip().split('/')[0].replace(",", "")
            Price_sq = test_price.text.strip().split('/')[1].replace("SQ. M. ", "").replace(",", "")
        else:
            Price = None
            Price_sq = None

        test_views = soup.find('span', {'class':'item-view-count'})
        if test_views:
            Views = test_views.text.strip()
        else:
            Views = None

        details = [i.text for i in soup.find('div', {'class': 'item-view-price-params'}).findAll('span')]
        if details:
            Area = details[0].split()[0]
            Rooms = details[1].replace("+", "")
            Storeys = details[2].split('/')[0]
        else:
            Area = None
            Rooms = None
            Storeys = None

        params = [i.text for i in soup.findAll('div', {'class': 'col-5'})]
        if params:
            Bathrooms = params[3].split()[0].replace("+", "")
            BuildType = params[4].split()[0]
            CeilingHeight = params[5].split()[0]
            Condition = params[6].strip()
        else:
            Bathrooms = None
            BuildType = None
            CeilingHeight = None
            Condition = None
            
        date_id = [i.text for i in soup.findAll('div', {'class': 'row no-gutters flex-column item-view-date'})]
        if date_id:
            DateAdded = str(date_id).split()[2]
            DateEdited = str(date_id).split()[5]
        else:
            DateAdded = None
            DateEdited = None
        DateScraped = date.today().strftime("%d.%m.%Y")
        
        
        test_coord = soup.find("div", {'id': 'yandex_map_item_view'}).attrs
        if test_coord:
            Latitude = test_coord['data-lat']
            Longitude = test_coord['data-lng']
            geolocator1 = Nominatim(user_agent="coordinateconverter")
            location1 = geolocator1.reverse("{}, {}".format(Latitude, Longitude))
            Address = location1.address
            
        else: 
            Latitude = None
            Longitude = None
            Address = None
            
        test_facilities = [i.text for i in soup.findAll('div', {'class': 'row item-view-facilities mb-4'})[0].findAll('label')]
        if test_facilities:
            Facilities = test_facilities
        else:
            Facilities = None

        test_info = [i.text for i in soup.findAll('div', {'class': 'row item-view-facilities mb-4'})[1].findAll('label')]
        if test_info:
            Additional = test_info
        else:
            Additional = None
            
        url = apt
            
    except:
        continue
    content = [ID, Location, Region, District, Street, Price, Price_sq, Views, Area, Rooms,
                   Storeys, Bathrooms, BuildType, CeilingHeight, Condition, DateAdded, DateEdited,
                   DateScraped, Latitude, Longitude, Address, Facilities, Additional,  url]
    append_list_as_row_list('house.sale.0805.csv', content)

  0%|          | 0/2257 [00:00<?, ?it/s]

In [43]:
df = pd.read_csv('house.sale.0805.csv')
df

Unnamed: 0,ID,Location,Region,District,Street,Price,Price/sqm,Views,Area,Room,...,Condition,Date Added,Date Edited,Date Scraped,Latitude,Longitude,Address,Facilities,Additional Info,url
0,115611,"Kotayk, Qasakh, Qasakh",Kotayk,Qasakh,Qasakh,92000,153,774.0,600,4,...,Newly repaired,25.10.2021,25.10.2021,08.05.2022,40.235526,44.451442,"9, Վազգեն Սարգսյանի փողոց, Քասախ, Եղվարդի տարա...","['Heating', 'Hot water', 'Electricity', 'Gas',...","['Euro windows', 'Tile', 'Sunny', 'View', 'Park']",https://myrealty.am/en/house-for-sale/Qasakh/Q...
1,115609,"Yerevan, Nor Norq, Bagrevand district",Yerevan,Nor Norq,Bagrevand district,330000,471,1370.0,700,5,...,Newly repaired,25.10.2021,25.10.2021,08.05.2022,40.192484,44.586332,"Բագրևանդ, Նոր Նորք, Ջրվեժ, 0089, Հայաստան","['Heating', 'Hot water', 'Electricity', 'Gas',...","['Furniture', 'Equipment', 'Euro windows', 'Ir...",https://myrealty.am/en/house-for-sale/Bagrevan...
2,115596,"Yerevan, Avan, Avan 5 St",Yerevan,Avan,Avan 5 St,130000,260,1125.0,500,4,...,Newly repaired,25.10.2021,25.10.2021,08.05.2022,40.147115,44.516820,"Կենտրոնական զինվորական հիվանդանոց, Նոր Արեշի 5...","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Ir...",https://myrealty.am/en/house-for-sale/Avan+5+S...
3,115593,"Yerevan, Norq Marash, Armenak Armenakyan St",Yerevan,Norq Marash,Armenak Armenakyan St,1700000,1133,1173.0,1500,7,...,Newly repaired,24.10.2021,24.10.2021,08.05.2022,40.189037,44.530844,"73, Արմենակ Արմենակյան փողոց, Նորք Մարաշ, Երևա...","['Heating', 'Internet', 'Electricity', 'Air-co...","['Furniture', 'Equipment', 'Fireplace', 'Euro ...",https://myrealty.am/en/house-for-sale/Armenak+...
4,115587,"Yerevan, Avan, Paruyr Sevak 11",Yerevan,Avan,Paruyr Sevak 11,350000,583,928.0,600,4,...,Newly repaired,24.10.2021,24.10.2021,08.05.2022,40.228469,44.579354,"29, Պարույր Սևակ թաղ 1-ին փողոց, Առինջ, Աբովյա...","['Heating', 'Internet', 'Hot water', 'Electric...","['Open balcony', 'Attic', 'Sunny', 'View', 'Ga...",https://myrealty.am/en/house-for-sale/Paruyr+S...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,82896,"Yerevan, Arabkir, Orbeli Yeghbayrner St",Yerevan,Arabkir,Orbeli Yeghbayrner St,500000,2000,3541.0,250,7,...,Newly repaired,16.08.2019,03.08.2021,08.05.2022,40.194289,44.495078,"15, Օրբելի եղբայրների փողոց, Արաբկիր, Երևան, 0...","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Op...",https://myrealty.am/en/house-for-sale/Orbeli+Y...
1496,82868,"Yerevan, Arabkir, V.Papazyan St",Yerevan,Arabkir,V.Papazyan St,200000,333,3799.0,600,5,...,Good,16.08.2019,18.08.2021,08.05.2022,40.204745,44.501841,"21, Վահրամ Փափազյան փողոց, Արաբկիր, Երևան, 001...","['Heating', 'Gas', 'Hot water', 'Internet', 'E...","['Furniture', 'Euro windows', 'Tile', 'Roadsid...",https://myrealty.am/en/house-for-sale/V.Papazy...
1497,82828,"Yerevan, Norq Marash, Norq 7 St",Yerevan,Norq Marash,Norq 7 St,240000,1200,5070.0,200,4,...,Zero condition,15.08.2019,08.07.2021,08.05.2022,40.177734,44.544996,"27, Նորքի 17-րդ փողոց, Նորք Մարաշ, Երևան, 0047...","['Heating', 'Gas', 'Electricity', 'water 24/7']","['Open balcony', 'Balcony', 'Loggia', 'Euro wi...",https://myrealty.am/en/new-build-house-for-sal...
1498,82802,"Kotayk, Proshyan, Gayi street",Kotayk,Proshyan,Gayi street,87000,67,2944.0,1300,5,...,Newly repaired,14.08.2019,14.08.2019,08.05.2022,40.250779,44.419895,"5/2, Բարեկամության փողոց, Պռոշյան, Եղվարդի տար...","['Water', 'water 24/7', 'Electricity', 'Centra...","['Euro windows', 'Basement']",https://myrealty.am/en/house-for-sale/Gayi+str...
