In [1]:
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing
from bs4 import BeautifulSoup

### 1. Reading the Dataset

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,id,name,price,rating,img_link,category
0,49849504,"Kartepe, Turkey","₹8,078 per night",4.71,https://a0.muscache.com/im/pictures/cf7f3f57-8...,A-frames
1,50891766,"Kaş, Turkey","₹4,665 per night",New,https://a0.muscache.com/im/pictures/449c8751-0...,A-frames
2,50699164,"Imereti, Georgia","₹5,991 per night",4.85,https://a0.muscache.com/im/pictures/miso/Hosti...,A-frames
3,49871422,"Sapanca, Turkey","₹11,339 per night",5.0,https://a0.muscache.com/im/pictures/72e6396e-e...,A-frames
4,51245886,"Sapanca, Turkey","₹6,673 per night",New,https://a0.muscache.com/im/pictures/73973308-e...,A-frames


### 2. Defining Download Function

In [3]:
def download(st,en, checkpoint):
  data = []
  prefix = 'https://www.airbnb.co.in/rooms/'

  for id_ in tqdm(df['id'][st:en]): 
      link = prefix + str(id_)
      res = requests.get(link)
      soup = BeautifulSoup(res.content, 'html.parser')

      try:
          name    = soup.find('h1').text.strip()
      except:
          name    = np.nan
      try:
          rating  = soup.find_all('span', class_ = '_1jvg42j')[0].find_all('span')[-2].text.strip().split(' ')[0]
      except:
          rating  = np.nan
      try:
          reviews = soup.find_all('span', class_ = '_1jvg42j')[0].find_all('span')[-1].find('button').text.strip().split(' ')[0]
      except:
          reviews = np.nan
      try:
          address = soup.find_all('span', class_ = '_1jvg42j')[-1].text.strip()
      except:
          address = np.nan
      try:
          img_links = ' '.join([sp.get('src') for sp in soup.find('div', class_ = '_88xxct').find_all('img')])
      except:
          img_links = np.nan
      try:
          host_name = soup.find('h2').text.strip()
      except:
          host_name = np.nan
      try:
          features = []
          for sp in soup.find('ol', class_ = 'lgx66tx dir dir-ltr').find_all('li', class_ = 'l7n4lsf dir dir-ltr'):
              for i in sp.text.strip().split('·'):
                  if (i != ''):
                      features.append(i.strip())
          features = ','.join(features)
      except:
          features = np.nan
      try:
          host_id = soup.find('div', class_ = 'c6y5den dir dir-ltr').find('a').get('href').split('/')[-1]    
      except:
          host_id = np.nan
      try:    
          house_rules  = ','.join([sp.text for sp in soup.find_all('div', class_ = 'cihcm8w dir dir-ltr')[0].find_all('span')])
          safety_rules = ','.join([sp.text for sp in soup.find_all('div', class_ = 'cihcm8w dir dir-ltr')[1].find_all('span')])
      except:
          house_rules  = np.nan
          safety_rules = np.nan
      try:
          amenities    = ','.join([sp.text for sp in soup.find('div', class_ = '_1byskwn').find_all('div', class_ = 'iikjzje i10xc1ab dir dir-ltr')])
      except:
          amenities    = np.nan

      data.append([id_ ,name, rating, reviews, host_name, host_id ,address, features, amenities, safety_rules, house_rules, img_links])
      
  data = pd.DataFrame(data, columns = ['id', 'name','rating','reviews','host_name','host_id','address',
                                      'features','amenities','safety_rules','hourse_rules','img_links'])
  
  data['price']  = df['price'][st:en]
  data['rating'] = df['rating'][st:en]

  data.to_csv('checkpoint/' +  str(checkpoint) + '.csv')

### 3. Scrape with MultiProcessing

In [4]:
def processing (no_of_processes, df):

  process = []
  temp    = []
  pairs   = []

  if (len(df)%no_of_processes == 0):         # If its fully Divisible
    for i in range(no_of_processes + 1):
      temp.append(int((len(df)/no_of_processes) * i))

    for i in range(1,len(temp)):
      start = temp[i-1]
      end = temp[i]
      pairs.append([start,end])


    for i in range(len(pairs)):
      start = pairs[i][0]
      end   = pairs[i][1]

      checkpoint_name = 'ck_' + str(i)
      process.append(multiprocessing.Process(target = download, args = (start , end  , checkpoint_name))) 

  for pr in process:
    pr.start()

  for pr in process:
    pr.join()

In [5]:
processing( 8 , df )

100%|██████████| 1807/1807 [08:11<00:00,  3.68it/s]
 97%|█████████▋| 1752/1807 [08:11<00:10,  5.19it/s]Process Process-1:
 96%|█████████▌| 1733/1807 [08:11<00:15,  4.88it/s]Traceback (most recent call last):
 97%|█████████▋| 1753/1807 [08:11<00:09,  5.59it/s]  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
 94%|█████████▍| 1699/1807 [08:11<00:17,  6.14it/s]  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-3-dcc8c1070dc9>", line 66, in download
    data.to_csv('checkpoint/' +  str(checkpoint) + '.csv')
  File "/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py", line 3482, in to_csv
    storage_options=storage_options,
  File "/usr/local/lib/python3.7/dist-packages/pandas/io/formats/format.py", line 1105, in to_csv
    csv_formatter.save()
  File "/usr/local/lib/python3.7/dist-packages/pandas/io/formats/csvs.py", line 243, in save
    storag