# Packages

In [44]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm.notebook import tqdm
from fake_useragent import UserAgent
import re
import urllib.parse
import urllib.request
import random
import shutil
import csv
from csv import writer
from pathlib import Path
from urllib.request import urlopen
import json
from geopy.geocoders import GoogleV3, Nominatim
from datetime import date

In [45]:
# Shared fake_useragent generator; later cells read `ua.random` once per request
# to build the User-Agent header.
ua = UserAgent()

## Functions 

In [46]:
def flatten(A):
    """Return a new flat list holding the non-list items of ``A`` in order.

    Only ``list`` instances are descended into (at any depth); every other
    value, including tuples and strings, is copied over unchanged.
    """
    flat = []
    for item in A:
        if not isinstance(item, list):
            flat.append(item)
            continue
        # Nested list: splice its flattened contents in place.
        flat += flatten(item)
    return flat

In [47]:
def append_list_as_row_list(file_name, list_of_elem):
    """Append ``list_of_elem`` to ``file_name`` as a single CSV row.

    The file is opened in ``a+`` mode, so it is created when it does not exist
    and existing rows are kept. ``newline=''`` leaves line endings to the csv
    writer.
    """
    with open(file_name, mode='a+', newline='', encoding='utf-8-sig') as csv_file:
        writer(csv_file).writerow(list_of_elem)

# Link Generation

In [48]:
# Fixed browser User-Agent sent with every pagination request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Matches only the "?page=<n>" variants of the start URL.
# Raw string: in a plain literal "\?" and "\d" are invalid escape sequences.
# The dots are escaped so they match a literal "." rather than any character.
PAGE_LINK_PATTERN = re.compile(r"https://myrealty\.am/en/apartments-for-rent/756855\?page=\d+$")

# Seconds to wait for a pagination page; without a timeout requests can block forever.
PAGE_REQUEST_TIMEOUT = 30

# Every pagination URL discovered so far; filled in by getLinks().
pages = set()


def getLinks(pageUrl):
    """Crawl pagination links reachable from ``pageUrl`` into the global ``pages`` set.

    Each fetched page is searched for anchors whose ``href`` matches
    ``PAGE_LINK_PATTERN``. Every URL not seen before is added to ``pages`` and
    fetched in turn, until no new URLs turn up.

    The crawl uses an explicit work list instead of recursion, so the number of
    pages is not limited by Python's recursion depth.

    Raises:
        requests.RequestException: if a page cannot be fetched (including a
            timeout after ``PAGE_REQUEST_TIMEOUT`` seconds).
    """
    pending = [pageUrl]
    while pending:
        current = pending.pop()
        html = requests.get(current, headers=headers, timeout=PAGE_REQUEST_TIMEOUT)
        bsObj = BeautifulSoup(html.text, 'html.parser')
        # The href filter already guarantees each match has an href attribute.
        for link in bsObj.find_all("a", href=PAGE_LINK_PATTERN):
            newPage = link.attrs['href']
            if newPage not in pages:
                pages.add(newPage)
                pending.append(newPage)


getLinks("https://myrealty.am/en/apartments-for-rent/756855")

In [49]:
# Number of distinct pagination URLs collected by getLinks().
len(pages)

175

# Get Individual Links

In [50]:
# Seconds to wait for a results page; without a timeout requests can block forever.
RESULTS_REQUEST_TIMEOUT = 30

# Flat list of listing URLs gathered from every results page.
links = []

for link in tqdm(pages):
    # A fresh random User-Agent for each results page.
    header = {'User-Agent': str(ua.random)}
    content = requests.get(link, headers=header, timeout=RESULTS_REQUEST_TIMEOUT)
    soup = BeautifulSoup(content.content, 'html.parser')

    aptrent_container = soup.find('div', {'class': 'row no-gutters items-list'})
    if aptrent_container is None:
        # No listings container on this page: nothing to collect.
        continue

    # Search the container tag directly; re-parsing its string form is unnecessary.
    # href=True skips any matching button that has no target URL.
    links_raw = aptrent_container.find_all(
        'a',
        {'class': 'btn btn-pink-transparent btn-cs text-uppercase item-more-btn ml-auto'},
        href=True,
    )
    # extend() keeps `links` flat, so no flattening pass is needed afterwards.
    links.extend(i['href'] for i in links_raw)

  0%|          | 0/175 [00:00<?, ?it/s]

In [51]:
# Total number of listing URLs gathered across all results pages.
len(links)

6282

# Scrape Data

## Create Empty CSV 

In [52]:
# Header row of the output CSV. The order matches the order in which the
# scrape loop assembles each row's values.
columns = (
    ['ID', 'Location', 'Region', 'District', 'Street']
    + ['Price', 'Views', 'Area', 'Room', 'Floor', 'Storeys']
    + ['Bathrooms', 'BuildType', 'CeilingHeight', 'Condition']
    + ['Date Added', 'Date Edited', 'Date Scraped']
    + ['Latitude', 'Longitude', 'Address']
    + ['Facilities', 'Additional Info', 'url']
)

# Write a header-only file; the scrape loop appends one row per listing to it.
data = pd.DataFrame(columns=columns)
data.to_csv('apt.rent.0805.csv', index=False)

## Scrape & Store 

In [53]:
# Seconds to wait for a listing page; without a timeout requests can block forever.
LISTING_REQUEST_TIMEOUT = 30

# One geocoder client serves the whole run; building it per listing was loop-invariant.
geolocator1 = Nominatim(user_agent="coordinateconverter")

# (url, reason) for every listing that could not be fetched or parsed and was left out.
skipped_listings = []
# (url, reason) for every listing saved without a reverse-geocoded address.
geocode_failures = []

for apt in tqdm(links):
    header = {'User-Agent': str(ua.random)}

    try:
        # Fetching is inside the try so one network error skips this listing
        # instead of aborting the whole run.
        response = requests.get(apt, headers=header, timeout=LISTING_REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'html.parser')

        test_id = soup.find('div', {'class': 'item-view-id'})
        if test_id:
            ID = test_id.text.strip().split(' ')[1]
        else:
            ID = None

        test_location = soup.find('div', {'class': 'col-auto item-view-address pl-0 mb-2 mt-1'})
        if test_location:
            Location = test_location.text.strip()
            # split(',') keeps the blank that follows each comma, so trim every part.
            location_parts = [part.strip() for part in Location.split(',')]
            Region = location_parts[0]
            District = location_parts[1]
            Street = location_parts[2]
        else:
            Location = None
            Region = None
            District = None
            Street = None

        test_price = soup.find('div', {'class': 'item-view-price'})
        if test_price:
            Price = test_price.text.strip().replace(",", "")
        else:
            Price = None

        test_views = soup.find('span', {'class': 'item-view-count'})
        if test_views:
            Views = test_views.text.strip()
        else:
            Views = None

        # Span texts are read by position: [0] area, [1] rooms, [2] "floor/storeys".
        details = [i.text for i in soup.find('div', {'class': 'item-view-price-params'}).find_all('span')]
        if details:
            Area = details[0].split()[0]
            Rooms = details[1].replace("+", "")
            floor_parts = details[2].split('/')
            Floor = floor_parts[0]
            Storeys = floor_parts[1]
        else:
            Area = None
            Rooms = None
            Floor = None
            Storeys = None

        # Parameter cells are read by position as well.
        params = [i.text for i in soup.find_all('div', {'class': 'col-5'})]
        if params:
            Bathrooms = params[0].split()[0].replace("+", "")
            BuildType = params[1].split()[0]
            CeilingHeight = params[2].split()[0]
            Condition = params[3].strip()
        else:
            Bathrooms = None
            BuildType = None
            CeilingHeight = None
            Condition = None

        date_id = [i.text for i in soup.find_all('div', {'class': 'row no-gutters flex-column item-view-date'})]
        if date_id:
            # Tokens of the list's string form; positions 2 and 5 are taken as the
            # added / edited dates. This depends on the page layout; confirm if the site changes.
            date_tokens = str(date_id).split()
            DateAdded = date_tokens[2]
            DateEdited = date_tokens[5]
        else:
            DateAdded = None
            DateEdited = None
        DateScraped = date.today().strftime("%d.%m.%Y")

        test_coord = soup.find("div", {'id': 'yandex_map_item_view'}).attrs
        if test_coord:
            Latitude = test_coord['data-lat']
            Longitude = test_coord['data-lng']
            # The address is derived data: a geocoder failure or an empty result
            # must not discard an otherwise complete listing.
            try:
                location1 = geolocator1.reverse("{}, {}".format(Latitude, Longitude))
            except Exception as geo_exc:
                geocode_failures.append((apt, repr(geo_exc)))
                location1 = None
            Address = location1.address if location1 else None
        else:
            Latitude = None
            Longitude = None
            Address = None

        # Both label groups share one CSS class: the first block is stored as
        # facilities, the second as additional info.
        facility_blocks = soup.find_all('div', {'class': 'row item-view-facilities mb-4'})

        test_facilities = [i.text for i in facility_blocks[0].find_all('label')]
        if test_facilities:
            Facilities = test_facilities
        else:
            Facilities = None

        test_info = [i.text for i in facility_blocks[1].find_all('label')]
        if test_info:
            Additional = test_info
        else:
            Additional = None

        url = apt

    except Exception as exc:
        # Still skips listings that cannot be fetched or parsed, but records why.
        # Unlike a bare `except`, KeyboardInterrupt can stop the run.
        skipped_listings.append((apt, repr(exc)))
        continue

    content = [ID, Location, Region, District, Street, Price, Views, Area, Rooms, Floor, Storeys, Bathrooms,
               BuildType, CeilingHeight, Condition, DateAdded, DateEdited, DateScraped, Latitude, Longitude,
               Address, Facilities, Additional, url]
    append_list_as_row_list('apt.rent.0805.csv', content)

print("Skipped {} of {} listings; {} saved without a reverse-geocoded address.".format(
    len(skipped_listings), len(links), len(geocode_failures)))

  0%|          | 0/6282 [00:00<?, ?it/s]

In [41]:
# Load the scraped rows back from disk and preview the first few.
# NOTE(review): the saved output of this cell shows an extra "Unnamed: 0" column and
# ten rows, and its execution count (41) precedes the cells above, so it looks like
# it came from an earlier run. Re-run after scraping to refresh it.
df = pd.read_csv('apt.rent.0805.csv')
df.head()

Unnamed: 0,ID,Location,Region,District,Street,Price,Views,Area,Room,Floor,...,Condition,Date Added,Date Edited,Date Scraped,Latitude,Longitude,Address,Facilities,Additional Info,url
0,84218,"Yerevan, Center, Saryan St",Yerevan,Center,Saryan St,600,2927,75,2,9,...,Good,10.09.2019,14.12.2019,08.05.2022,40.18239,44.505119,"27, Սարյանի փողոց, Կենտրոն, Երևան, 0002, Հայաստան","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Ti...",https://myrealty.am/en/2-bedroom/apartment-for...
1,84217,"Yerevan, Center, Sayat-Nova Ave",Yerevan,Center,Sayat-Nova Ave,950,3033,103,3,7,...,Newly repaired,10.09.2019,06.05.2020,08.05.2022,40.182126,44.512085,"30, Եզնիկ Կողբացու փողոց, Կենտրոն, Երևան, 0002...","['Heating', 'Internet', 'Hot water', 'Sewerage...","['Furniture', 'Equipment', 'Euro windows', 'Ir...",https://myrealty.am/en/3-bedroom/apartment-for...
2,84181,"Yerevan, Center, Amiryan St",Yerevan,Center,Amiryan St,890,2692,75,3,9,...,Newly repaired,10.09.2019,14.12.2019,08.05.2022,40.180622,44.508178,"HAF-HAF, 13/45, Ամիրյան փողոց, Կենտրոն, Երևան,...","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Ti...",https://myrealty.am/en/3-bedroom/apartment-for...
3,84177,"Yerevan, Center, Sayat-Nova Ave",Yerevan,Center,Sayat-Nova Ave,950,2649,103,3,7,...,Newly repaired,10.09.2019,14.12.2019,08.05.2022,40.183568,44.520496,"21, Սայաթ-Նովայի պողոտա, Կենտրոն, Երևան, 0001,...","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Ti...",https://myrealty.am/en/3-bedroom/apartment-for...
4,84167,"Yerevan, Arabkir, Baghramyan av(Arabkir)",Yerevan,Arabkir,Baghramyan av(Arabkir),1300,3007,120,3,4,...,Newly repaired,10.09.2019,09.07.2020,08.05.2022,40.193848,44.497105,"59, Մարշալ Բաղրամյան պողոտա, Արաբկիր, Երևան, 0...","['Heating', 'Internet', 'Hot water', 'Sewerage...","['Furniture', 'Equipment', 'Euro windows', 'Ir...",https://myrealty.am/en/3-bedroom/apartment-for...
5,102339,"Yerevan, Qanaqer-Zeytun, Rubinyants St",Yerevan,Qanaqer-Zeytun,Rubinyants St,421,1749,74,3,3,...,Good,10.09.2019,07.09.2020,08.05.2022,40.203145,44.548,"22, Ռուբինյանց փողոց, Քանաքեռ-Զեյթուն, Երևան, ...","['Heating', 'Internet', 'Sewerage, Canalizatio...","['Furniture', 'Equipment', 'Euro windows', 'Ir...",https://myrealty.am/en/3-bedroom/apartment-for...
6,84098,"Yerevan, Center, A.Manukyan St",Yerevan,Center,A.Manukyan St,3000,3272,130,4,3,...,Newly repaired,09.09.2019,01.05.2020,08.05.2022,40.175329,44.522743,"HSBC, 10, Ալեք Մանուկյանի փողոց, Կենտրոն, Երևա...","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Ir...",https://myrealty.am/en/4-bedroom/apartment-for...
7,84094,"Yerevan, Center, Charents St",Yerevan,Center,Charents St,1000,2248,93,3,2,...,Newly repaired,09.09.2019,16.12.2019,08.05.2022,40.179481,44.527349,"28, Չարենցի փողոց, Կենտրոն, Երևան, 0025, Հայաստան","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Ir...",https://myrealty.am/en/3-bedroom/apartment-for...
8,84088,"Yerevan, Center, Argishti St",Yerevan,Center,Argishti St,400,4306,55,1,11,...,Good,09.09.2019,19.03.2020,08.05.2022,40.173982,44.504226,"7, Արգիշտի փողոց, Կենտրոն, Երևան, 0015, Հայաստան","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Op...",https://myrealty.am/en/1-bedroom/apartment-for...
9,84085,"Yerevan, Arabkir, Orbeli Yeghbayrner St",Yerevan,Arabkir,Orbeli Yeghbayrner St,400,2376,60,3,3,...,Good,09.09.2019,13.01.2020,08.05.2022,40.192974,44.49234,"33, Օրբելի եղբայրների փողոց, Արաբկիր, Երևան, 0...","['Heating', 'Internet', 'Hot water', 'Electric...","['Furniture', 'Equipment', 'Euro windows', 'Ti...",https://myrealty.am/en/3-bedroom/apartment-for...
