In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
def get_soup(url):
  '''Return html content of a url'''
  # defining the headers
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36', "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
  r = requests.get(url, allow_redirects=False, headers=headers)
  html_content = r.content
  soup = BeautifulSoup(html_content, 'html.parser')

  return soup

In [3]:
def get_reviews(soup):
  '''Return a dataframe containing extracted elements'''
  reviews = []

  # extracting all the elements needed and passing them to lists
  review_text = []
  for x in soup.find_all('p', {'class': 'partial_entry'}):
    review_text.append(x.text.strip())

  review_rating = []
  for y in soup.find_all('span', {'class': 'ui_bubble_rating'}):
    review_rating.append(y['class'][1].replace('bubble_', ''))

  review_title = []
  for z in soup.find_all('span', {'class': 'noQuotes'}):
    review_title.append(z.text.strip())

  visit_date = []
  for a in soup.find_all('div', {'class':'prw_rup prw_reviews_stay_date_hsx'}):
    visit_date.append(a.text[15:].strip())

  author_name = []
  for b in soup.find_all('div', {'class': 'info_text pointer_cursor'}):
    author_name.append(b.text.strip())

  overall_rating = []
  for c in range(len(review_rating)):
    overall_rating.append(soup.find('div', {'class': 'QEQvp'}).find('span', {'class': 'ZDEqb'}).text.strip())

  # for cases where there are no review texts or titles
  if ((len(overall_rating) != len(visit_date)) | (len(review_text) != len(review_rating)) | (len(review_rating) != len(visit_date)) | (len(author_name) != len(visit_date))):
    min_len = min(len(review_text), len(review_rating), len(review_title), len(visit_date), len(author_name), len(overall_rating))
    print(min_len)
    review_text = review_text[:min_len]
    review_rating = review_rating[:min_len]
    review_title = review_title[:min_len]
    visit_date = visit_date[:min_len]
    author_name = author_name[:min_len]
    overall_rating = overall_rating[:min_len]

  # inserting all necessary elements into a dataframe
  df = pd.DataFrame({'author_name' : author_name,
                     'review_title' : review_title,
                     'review_text' : review_text,
                     'author_rating' : review_rating,
                     'visit_date' : visit_date,
                     'overall_rating' : overall_rating})

  return df

In [6]:
def scrape_reviews(url, total_reviews):
  '''Extracts the needed number of review elements specified'''

  final_df = pd.DataFrame()
  soup = get_soup(url)
  final_df = get_reviews(soup)

  while len(final_df) < total_reviews:
    try:
      next_url = 'https://www.tripadvisor.com/' + (soup.find('a', class_='nav next ui_button primary').get('href'))
      next_soup = get_soup(next_url)
      df = get_reviews(next_soup)
      final_df = pd.concat([final_df, df])
      soup = next_soup
    except AttributeError:
      break

  return final_df

In [7]:
# Define the URL of the TripAdvisor page
url = "https://www.tripadvisor.com/Restaurant_Review-g304026-d12063614-Reviews-Shiro_Restaurant_Bar-Lagos_Lagos_State.html"

# scrape reviews
df = scrape_reviews(url, 555)

15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
7


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 547 entries, 0 to 6
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author_name     547 non-null    object
 1   review_title    547 non-null    object
 2   review_text     547 non-null    object
 3   author_rating   547 non-null    object
 4   visit_date      547 non-null    object
 5   overall_rating  547 non-null    object
dtypes: object(6)
memory usage: 29.9+ KB
