#### 1. Loading the Libraries and WebDriver

In [5]:
import os
import time
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver

# Start the Chrome session used by every scraping cell below, driven by the
# chromedriver binary at this (machine-specific) path.
# NOTE(review): passing the driver path positionally is only accepted by older
# Selenium releases - confirm against the installed version.
chromedriver_path = '/Users/ashishzangra/opt/anaconda3/lib/python3.9/site-packages/chromedriver_binary/chromedriver'
browser = webdriver.Chrome(chromedriver_path)

#### 2. Opening the Website

In [None]:
# Load the landing page once and pause so it can finish rendering before
# any category page is requested.
home_url = 'https://www.airbnb.co.in/'
browser.get(home_url)
time.sleep(5)

#### 3. Defining the Categories

In [None]:
# Common part of every category URL. It deliberately ends in 'Tag%': each value
# in `cateogories` starts with '3A', so `prefix + value` yields
# '...category_tag=Tag%3A<number>' ('%3A' is the URL-encoded ':').
prefix = 'https://www.airbnb.co.in/?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&search_mode=flex_destinations_search&flexible_trip_lengths%5B%5D=one_week&location_search=MIN_MAP_BOUNDS&search_type=category_change&category_tag=Tag%'

# Category name -> URL suffix ('3A' + numeric tag) appended to `prefix`.
# 'Golfing' and 'Historical homes' previously lacked the leading '3', which
# produced a malformed '%A8...' escape instead of '%3A85..'.
cateogories = { 'OMG!'             : '3A8225', 'National parks'     : '3A8102', 'Tiny homes'        : '3A8186',
                'Islands'          : '3A675' , 'Camping'            : '3A634' , 'Cabins'            : '3A5348',
                'Design'           : '3A8528', 'Arctic'             : '3A8534', 'Amazing pools'     : '3A677' ,
                'Lakefront'        : '3A8522', 'Surfing'            : '3A8526', 'A-frames'          : '3A8148',
                'Treehouses'       : '3A8188', 'Tropical'           : '3A5635', 'Bed & breakfasts'  : '3A8538',
                'Caves'            : '3A670' , 'Shared homes'       : '3A8542', 'Earth homes'       : '3A8174',
                'Farms'            : '3A8175', 'Countryside'        : '3A4104', 'Luxe'              : '3A8253',
                'Golfing'          : '3A8525', 'Amazing views'      : '3A8536', 'Castles'           : '3A8047',
                'Iconic cities'    : '3A8535', 'Historical homes'   : '3A8524', 'Mansions'          : '3A8115',
                'Beaches'          : '3A7769', 'Cycladic homes'     : '3A8227', 'Domes'             : '3A8173',
                'Campervans'       : '3A8166', "Chef's kitchens"    : '3A5731', 'Windmills'         : '3A8043',
                'Vineyards'        : '3A8101', 'Casas particulares' : '3A8232', 'Off-the-grid'      : '3A8226',
                'Skiing'           : '3A7765', 'Minsus'             : '3A8230', 'Yurts'             : '3A8192',
                'Desert'           : '3A8099', 'Ryokans'            : '3A8228', 'Towers'            : '3A8187',
                "Shepherd's huts"  : '3A8650', 'Barns'              : '3A8159', 'Houseboats'        : '3A8176',
                'Boats'            : '3A1073', 'Beachfront'         : '3A789' , 'Containers'        : '3A8157',
                'Grand pianos'     : '3A8521', 'Creative spaces'    : '3A8543', 'Trulli'            : '3A8229',
                'Riads'            : '3A8255', 'Dammusos'           : '3A8256', 'Ski-in-Ski-out'    : '3A5366',
                'Lake'             : '3A8144'}

# Number of categories that will be scraped.
print(len(cateogories))

#### 4. Scraping the Data

In [None]:
# Scrape every category page and save its listings to Datasets/<category>.csv.
# The files go to Datasets/ because that is the folder every later cell reads
# from and writes to.
card_class = 'c4mnd7m dir dir-ltr'             # one <div> per listed stay
info_class = 'g1tup9az cb4nyux dir dir-ltr'    # <div> inside a card that holds its text fields

os.makedirs('Datasets', exist_ok=True)

for key in tqdm(list(cateogories)):

    # Send Request on Each Link
    browser.get(prefix + cateogories[key])
    time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    # Reach the bottom of the Page: scroll down in 100px steps and, every
    # 10000px, re-parse the page; stop once the number of cards is the same
    # as at the previous check.
    offset = 0
    previous_count = 0
    while True:
        browser.execute_script(f"window.scrollTo(0,{offset})")
        offset += 100
        time.sleep(.2)
        if offset % 10000 == 0:
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            current_count = len(soup.find_all('div', class_=card_class))
            if current_count == previous_count:
                break
            previous_count = current_count

    # Scraping the Details of the stay. Any field whose element is missing
    # is stored as NaN so the completeness check can spot it later.
    data = []
    for card in soup.find_all('div', class_=card_class):
        info = card.find('div', class_=info_class)
        divs = info.find_all('div') if info is not None else []
        spans = info.find_all('span') if info is not None else []

        try:
            img_link = card.find('img').get('src')
        except AttributeError:                 # card has no <img>
            img_link = np.nan
        try:
            # Drops the first six characters of the element id
            # (presumably a fixed prefix - TODO confirm).
            id_ = divs[0].get('id').strip()[6:]
        except (AttributeError, IndexError):   # no <div>, or it has no id
            id_ = np.nan
        try:
            name = divs[0].text.strip()
        except IndexError:
            name = np.nan
        try:
            # NOTE(review): the price is assumed to be the 8th <span>; this
            # depends on the page layout.
            price = spans[7].text.strip()
        except IndexError:
            price = np.nan
        try:
            rating = spans[-1].text.strip()
        except IndexError:
            rating = np.nan
        data.append([id_, name, price, rating, img_link])

    # Saving the Dataset
    df = pd.DataFrame(data, columns=['id', 'name', 'price', 'rating', 'img_link'])
    df.to_csv('Datasets/' + key + '.csv', index=False)

#### 5. Finding the Incomplete Datasets

In [None]:
# Remove from `cateogories` every category whose CSV in Datasets/ is already
# complete, so that only the categories needing another scrape remain.
for file in os.listdir('Datasets'):

    if not file.endswith('.csv'):                         # Only CSV files are datasets
        continue

    df_ = pd.read_csv('Datasets/' + file)                 # Loading each CSV file one by one
    name = os.path.splitext(file)[0]                      # File name without extension = category

    # Incomplete when the file has no rows or when its first column
    # contains at least one missing value.
    if len(df_) == 0 or df_.iloc[:, 0].isnull().any():    # Needs to be scraped again
        print(name, '<!!!>')
    else:                                                 # Scraped
        # pop() with a default tolerates re-running this cell and CSVs that
        # do not correspond to any remaining category.
        cateogories.pop(name, None)

print(len(cateogories))

#### 6. Scraping Incomplete Data

In [None]:
# Re-scrape every category still present in `cateogories` and overwrite its
# CSV in Datasets/.
card_class = 'c4mnd7m dir dir-ltr'             # one <div> per listed stay
info_class = 'g1tup9az cb4nyux dir dir-ltr'    # <div> inside a card that holds its text fields

for key in tqdm(list(cateogories)):

    # Send Request on Each Link
    browser.get(prefix + cateogories[key])
    time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    # Reach the bottom of the Page: scroll down in 100px steps and, every
    # 10000px, re-parse the page; stop once the number of cards is the same
    # as at the previous check.
    offset = 0
    previous_count = 0
    while True:
        browser.execute_script(f"window.scrollTo(0,{offset})")
        offset += 100
        time.sleep(.2)
        if offset % 10000 == 0:
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            current_count = len(soup.find_all('div', class_=card_class))
            if current_count == previous_count:
                break
            previous_count = current_count

    # Scraping the Details of the stay. Any field whose element is missing
    # is stored as NaN so the completeness check can spot it later.
    data = []
    for card in soup.find_all('div', class_=card_class):
        info = card.find('div', class_=info_class)
        divs = info.find_all('div') if info is not None else []
        spans = info.find_all('span') if info is not None else []

        try:
            img_link = card.find('img').get('src')
        except AttributeError:                 # card has no <img>
            img_link = np.nan
        try:
            # Drops the first six characters of the element id
            # (presumably a fixed prefix - TODO confirm).
            id_ = divs[0].get('id').strip()[6:]
        except (AttributeError, IndexError):   # no <div>, or it has no id
            id_ = np.nan
        try:
            name = divs[0].text.strip()
        except IndexError:
            name = np.nan
        try:
            # NOTE(review): the price is assumed to be the 8th <span>; this
            # depends on the page layout.
            price = spans[7].text.strip()
        except IndexError:
            price = np.nan
        try:
            rating = spans[-1].text.strip()
        except IndexError:
            rating = np.nan
        data.append([id_, name, price, rating, img_link])

    # Saving the Dataset
    df = pd.DataFrame(data, columns=['id', 'name', 'price', 'rating', 'img_link'])
    df.to_csv('Datasets/' + key + '.csv', index=False)

#### 7. Automatic Scraper until the whole data is scraped

In [None]:
# Alternate between (1) dropping the categories whose CSV is complete and
# (2) re-scraping whatever is left, until `cateogories` is empty.
# NOTE(review): there is no retry limit - a category that never produces a
# complete CSV keeps this loop running indefinitely.
card_class = 'c4mnd7m dir dir-ltr'             # one <div> per listed stay
info_class = 'g1tup9az cb4nyux dir dir-ltr'    # <div> inside a card that holds its text fields

while cateogories:

    # 1. Finding the Files that need to be scraped Again

    for file in os.listdir('Datasets'):
        if not file.endswith('.csv'):                     # Only CSV files are datasets
            continue
        df_ = pd.read_csv('Datasets/' + file)             # Loading each CSV file one by one
        name = os.path.splitext(file)[0]                  # File name without extension = category
        # Complete = has rows and no missing value in the first column.
        if len(df_) != 0 and not df_.iloc[:, 0].isnull().any():
            # Already-removed or unknown names are simply ignored.
            cateogories.pop(name, None)


    # 2. Scraping the Incomplete Data Again

    for key in tqdm(list(cateogories)):

        # Send Request on Each Link
        browser.get(prefix + cateogories[key])
        time.sleep(5)
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        # Reach the bottom of the Page: scroll down in 100px steps and, every
        # 10000px, re-parse the page; stop once the number of cards is the
        # same as at the previous check.
        offset = 0
        previous_count = 0
        while True:
            browser.execute_script(f"window.scrollTo(0,{offset})")
            offset += 100
            time.sleep(.2)
            if offset % 10000 == 0:
                soup = BeautifulSoup(browser.page_source, 'html.parser')
                current_count = len(soup.find_all('div', class_=card_class))
                if current_count == previous_count:
                    break
                previous_count = current_count

        # Scraping the Details of the stay. Any field whose element is
        # missing is stored as NaN so step 1 can spot it on the next pass.
        data = []
        for card in soup.find_all('div', class_=card_class):
            info = card.find('div', class_=info_class)
            divs = info.find_all('div') if info is not None else []
            spans = info.find_all('span') if info is not None else []

            try:
                img_link = card.find('img').get('src')
            except AttributeError:                 # card has no <img>
                img_link = np.nan
            try:
                # Drops the first six characters of the element id
                # (presumably a fixed prefix - TODO confirm).
                id_ = divs[0].get('id').strip()[6:]
            except (AttributeError, IndexError):   # no <div>, or it has no id
                id_ = np.nan
            try:
                name = divs[0].text.strip()
            except IndexError:
                name = np.nan
            try:
                # NOTE(review): the price is assumed to be the 8th <span>;
                # this depends on the page layout.
                price = spans[7].text.strip()
            except IndexError:
                price = np.nan
            try:
                rating = spans[-1].text.strip()
            except IndexError:
                rating = np.nan
            data.append([id_, name, price, rating, img_link])

        # Saving the Dataset
        df = pd.DataFrame(data, columns=['id', 'name', 'price', 'rating', 'img_link'])
        df.to_csv('Datasets/' + key + '.csv', index=False)

### 8. Data Combining

In [10]:
# Stack every per-category CSV from Datasets/ into a single DataFrame, adding
# a 'category' column taken from the file name (without its extension).
frames = []

# sorted() makes the row order of the combined table reproducible;
# os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir('Datasets/')):

    if not file.endswith('.csv'):      # a bare "'csv' in file" would also match e.g. 'csv_notes.txt'
        continue

    df_ = pd.read_csv('Datasets/' + file)
    df_['category'] = os.path.splitext(file)[0]
    frames.append(df_)

# Concatenate once instead of growing the frame inside the loop. pd.concat
# raises on an empty list, so fall back to an empty frame when no CSV exists.
df_master = pd.concat(frames) if frames else pd.DataFrame()

### 9. Saving the Dataset

In [11]:
# Write the combined table to disk; index=False keeps the DataFrame's row
# index out of the file.
output_csv = 'data.csv'
df_master.to_csv(output_csv, index=False)