In [1]:
# loading packages

# utils
import pandas as pd
import pickle
import time

# scraping
from urllib.request import urlopen, Request
from urllib.error import HTTPError, URLError
import requests
from bs4 import BeautifulSoup

`airlinequality.com` had a blocker for the default `urllib` agent, so this workaround was found in order to correctly scrape the reviews.

Source:
https://stackoverflow.com/questions/16627227/http-error-403-in-python-3-web-scraping

Enable some default error handling in case the site cannot be accessed, and tell us why.

In [2]:
def sneaky_request(url):
    """
    sneaky_request is a function designed to get around some pages blocking web scraping.
    It uses a different User-Agent than the default `python urllib/3.X.X`
    
    Args:
        url (str) : url of the website desired to be scraped
    
    Return:
        open_url (HTTPResponse) : the HTTP response of the input URL
    """
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        open_url = urlopen(req)
    except HTTPError as error:
        print("Error code: ", error.code)
        print("The reason for the exception:", error.reason)
    
    return open_url

In [3]:
gw_reviews_url = sneaky_request("https://www.airlinequality.com/airline-reviews/germanwings/")

In [4]:
type(gw_reviews_url)

http.client.HTTPResponse

Let's double check to ensure that this has gone correctly.

In [5]:
print(gw_reviews_url.geturl())
print("Status:",gw_reviews_url.reason)

https://www.airlinequality.com/airline-reviews/germanwings/
Status: OK


Use `BeautifulSoup` to explore and scrape the pages for the relevant info.

In [6]:
gw_reviews = BeautifulSoup(gw_reviews_url.read())

We need to traverse all of the pages in order to extract all of the reviews; this means opening each subsequent page and extracting each review.

The following `while` loop iterates over each subsequent review page, terminating when there are no further pages to scrape.  The airline review information is stored in `reviews`, to be parsed after extracting all of the information.

In [7]:
# create the initial list and condition to keep scraping
reviews = []
keep_going = True

while keep_going:
    
    # store the customer reviews in a list for later parsing
    if len(reviews) == 0:
        # if it is the first page, create the list
        reviews = gw_reviews.find_all("article", {"itemprop" : "review"})
    else:
        # concatenate the next pages reviews
        for review in gw_reviews.find_all("article", {"itemprop" : "review"}):
            reviews.append(review)
    
    # find the next page tag, use it to construct the next page to access
    # if it is the last page, end the loop
    try:
        next_page = gw_reviews.find("a", string = ">>")["href"]
        next_page_url = "https://www.airlinequality.com" + next_page
    except: 
        keep_going = False
        
    # open the next page, but wait 5 seconds to be polite 
    # and not overload the server
    time.sleep(5)
    gw_reviews_url = sneaky_request(next_page_url)
    gw_reviews = BeautifulSoup(gw_reviews_url.read())


Now that the reviews are all extracted, construct a `pandas` dataframe with desired information.

First, double check that all `146` reviews are present.

In [8]:
len(reviews)

146

Iterate through the reviews, building lists of the required information.

Note that this could be done in parallel using a library such as [`joblib`](https://joblib.readthedocs.io/en/latest/), but the dataset is so small that there is no need to do it.

In [None]:
# define a function to help us later

def safe_extract(dict_obj, extracted_tag, , replacement_value = None):
    """
    safe_extract grabs the
    """
    try:
        dict_obj["review_value"].append(extracted_tag.text)
    except:
        dict_obj["review_value"].append(None)

In [35]:
# build a dictionary structure for easily converting to a pandas dataframe
parsed_reviews = {
    "title" : [],
    "review_value" : [],
    "n_user_reviews" : [],
    "reviewer_name" : [],
    "reviewer_country" : [],
    "date_of_review" : [],
    "review_text" : [],
    "aircraft" :[],
    "traveller_type" : [],
    "seat_type" : [],
    "route" : [],
    "date_flown" : [],
    "seat_comfort_rating" : [],
    "cabin_staff_service_rating" : [],
    "inflight_entertainment_rating" : [],
    "ground_service_rating" : [],
    "value_for_money_raying" : [],
    "recommendation" : []
}


# iterate through all reviews, extracting information from each
# and storing in the parsed_reviews dict
for review in reviews:

    # extract review title
    review_title = review.find("h2", {"class" : "text_header"})
    parsed_reviews["title"].append(review_title.text)

    # extract review value out of 10
    review_value = review.find("span", {"itemprop" : "ratingValue"})
    # if there is no value out of 10, enter None instead
    try:
        parsed_reviews["review_value"].append(review_value.text)
    except:
        parsed_reviews["review_value"].append(None)

    # extract number of reviews by the reviewer
    n_reviews = review.find("span", {"class" : "userStatusReviewCount"})
    
    parsed_reviews["n_user_reviews"].append(n_reviews.text)
    
    # extract the reviewer

    # extract the country of the reviewer

    # extract the date of the review

    # extract the review text

    # extract the aircraft

    # extract the type of traveller

    # extract seat type

    # extract the route

    # extract the date flown

    # extract the seat comfort rating out of 5

    # extract the cabin staff service rating out of 5

    # extract the inflight entertainment rating out of 5

    # extract the ground service rating out of 5

    # extract the value for money rating out of 5

    # extract if the review recommended Germanwings or not

AttributeError: 'NoneType' object has no attribute 'text'

In [34]:
parsed_reviews

{'title': ['"Seat was fine with enough legroom"',
  '"crew were smiling and good"',
  '"only two agents available"',
  '"good flight and friendly staff"',
  '"never been treated as badly"',
  '"very cramped"',
  '"staff were friendly"',
  'PLEASE REFER TO EUROWINGS',
  '"great value for money"',
  '"pay very little money"',
  '"friendly and professional"',
  '"little choice of airline"',
  '"were very satisfied"',
  '"no info or staff"',
  '"cabin crew were very friendly"',
  '"professional cabin crew"',
  '"friendly cabin staff"',
  '"they have been unresponsive"',
  '"impressed with Germanwings"',
  '"I was refused some water"',
  '"the service crew were friendly but nothing more"',
  '"no problems or delays, very friendly staff"',
  '"I don\'t get why Germanwings is always late"',
  '"vowing never to book Germanwings again"',
  '"staff friendly - food terrible"',
  '"no explanations, no offers of help"',
  '"delay in Heathrow about 1:30 minutes"',
  'Germanwings customer review',
  

In [9]:
print(reviews[0].prettify())

<article class="comp comp_media-review-rated list-item media position-content review-667978" itemprop="review" itemscope="" itemtype="http://schema.org/Review">
 <meta content="2019-07-01" itemprop="datePublished"/>
 <div class="rating-10" itemprop="reviewRating" itemscope="" itemtype="http://schema.org/Rating">
  <span itemprop="ratingValue">
   7
  </span>
  /
  <span itemprop="bestRating">
   10
  </span>
 </div>
 <div class="body" id="anchor667978">
  <h2 class="text_header">
   "Seat was fine with enough legroom"
  </h2>
  <h3 class="text_sub_header userStatusWrapper">
   <span itemprop="author" itemscope="" itemtype="http://schema.org/Person">
    <a class="userStatus hiflyer" href="/reviewer/reviewer-signup">
     <span class="userStatusReviewCount">
      8 reviews
     </span>
     <svg class="userStatusIcon" version="1.1" viewbox="0 0 307.3 193.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
      <path d="M157.9,95c-13.1,5.5-26.3,11.1-38.5,1

In [10]:
print(reviews[-1].prettify())

<article class="comp comp_media-review-rated list-item media position-content review-252954" itemprop="review" itemscope="" itemtype="http://schema.org/Review">
 <meta content="2008-04-28" itemprop="datePublished"/>
 <div class="rating-10" itemprop="reviewRating" itemscope="" itemtype="http://schema.org/Rating">
  <span itemprop="ratingValue">
   3
  </span>
  /
  <span itemprop="bestRating">
   10
  </span>
 </div>
 <div class="body" id="anchor252954">
  <h2 class="text_header">
   Germanwings customer review
  </h2>
  <h3 class="text_sub_header userStatusWrapper">
   <span itemprop="author" itemscope="" itemtype="http://schema.org/Person">
    <span itemprop="name">
     P Gatineau
    </span>
   </span>
   <time datetime="2008-04-28" itemprop="datePublished">
    28th April 2008
   </time>
  </h3>
  <div class="tc_mobile">
   <div class="text_content" itemprop="reviewBody">
    STR-STN return. Another great flight with Germanwings. They do allocate seats now so you can ensure alread