In [1]:
# Libraries
import bs4
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import re
import pandas as pd

In [2]:
# A helper function to deal with empty elements when extracting the first element of a list
def helper(x):
    """Return the first element of ``x``, or ``None`` when ``x`` is empty.

    ``x`` only needs to support ``len()`` and indexing with ``0``.
    """
    return None if len(x) == 0 else x[0]
    
# A helper function to output a soup object using url
def return_soup(url):
    """Download ``url`` and parse the response body into a soup object.

    Parameters
    ----------
    url : str
        Address passed unchanged to ``uReq`` (``urllib.request.urlopen``).

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in "html.parser".

    Any error raised while opening or reading the URL propagates to the caller.
    """
    # The context manager closes the HTTP response even when read() raises,
    # so connections are not leaked while crawling many pages.
    with uReq(url) as response:
        html = response.read()

    return BeautifulSoup(html, "html.parser")

# A helper function to extract amenities
# Input: a list of strings, a word
# Output: a list of boolean values denoting whether each entry has that word
# Eg helper_amen(['a','ab','c'], 'a') = [True, True, False]
def helper_amen(li, word):
    """Flag which entries of ``li`` contain ``word``.

    Every entry is converted with ``str()`` before the substring test, so
    non-string entries are accepted. Returns a list of booleans with the same
    length and order as ``li``.
    """
    flags = []
    for entry in li:
        flags.append(word in str(entry))
    return flags

In [52]:
# A helper function to read PLANE-specific pages
def from_plane_soup(soup):
    """Collect the user comments of one PLANE-specific page into a DataFrame.

    Parameters
    ----------
    soup
        Parsed page. Only ``find`` / ``find_all`` with ``class_`` and
        ``get_text`` on the returned elements are used.

    Returns
    -------
    pandas.DataFrame or None
        One row per comment with the columns ``Date``, ``Plane``, ``Seat``,
        ``Name`` and ``Comment``. ``None`` when the page has no element of
        class "comment-box" or no element of class "h1-fix".
    """
    def first_match(pattern, text, group=0):
        """Return ``group`` of the first match of ``pattern`` in ``text``, else None."""
        match = re.search(pattern, text)
        return match.group(group) if match else None

    # Explicit checks instead of a bare ``except:``: only the two "element is
    # missing" cases mean "no comments"; anything else (including
    # KeyboardInterrupt) is allowed to propagate.
    comment_section = soup.find(class_="comment-box")
    title = soup.find(class_="h1-fix")
    if comment_section is None or title is None:
        return None

    # Raw elements for dates/seats, names and comments, plus the plane name
    name_plane = title.get_text()
    dates_seats = comment_section.find_all(class_="date")
    name_tags = comment_section.find_all(class_="name")
    comment_tags = comment_section.find_all(class_="comment")

    # Clean up what we have above.
    # Raw strings avoid the invalid "\/" escape sequence; "/" needs no escaping.
    date_texts = [tag.get_text() for tag in dates_seats]
    dates = [first_match(r'[0-9]{4}/[0-9]{2}/[0-9]{2}', text) for text in date_texts]
    # The seat code is the capture group; None when the text has no "Seat ... " part
    seats = [first_match(r'Seat ([A-Z0-9]*) ', text, group=1) for text in date_texts]
    names = [tag.get_text().strip() for tag in name_tags]
    comments = [tag.get_text().strip() for tag in comment_tags]

    # Put everything into a dataframe (the scalar plane name is repeated on every row)
    df = pd.DataFrame({'Date': dates,
                       'Plane': name_plane,
                       'Seat': seats,
                       'Name': names,
                       'Comment': comments})
    return df

In [4]:
# A helper function to read COMPANY-specific pages
def from_company_soup(soup):
    """Build a table of the planes listed on one COMPANY-specific page.

    Returns a DataFrame with one row per plane link and the columns ``Url``,
    ``Category``, ``Food``, ``Wifi``, ``Tv``, ``Headphones`` and ``Elec``.
    Each amenity column is True when the amenity keyword occurs in the string
    form of the plane's "amenities_list" element.
    """
    # Raw elements: category headings (the first "chartsTitle" is skipped)
    # and the "seats" sections that hold the planes
    category_tags = soup.findAll(class_="chartsTitle")[1:]
    plane_sections = soup.findAll(class_="seats")

    # Heading text without its first three characters
    category_names = []
    for tag in category_tags:
        category_names.append(tag.find("h3").get_text()[3:])

    # One list of plane links per section
    urls_by_section = []
    for section in plane_sections:
        links = []
        for plane_tag in section.findAll(class_="aircraft_seats"):
            links.append(plane_tag.find('a', href=True)['href'])
        urls_by_section.append(links)

    # Amenity elements, flattened across sections in page order
    amenity_tags = []
    for section in plane_sections:
        amenity_tags.extend(section.findAll(id='amenities_list'))

    # One row per section, then one row per plane link
    table = pd.DataFrame({'Url': urls_by_section,
                          'Category': category_names})
    table = table.explode('Url')

    # Amenity elements are matched to the exploded rows by position
    table['temp'] = amenity_tags
    for amen in ('food', 'wifi', 'tv', 'headphones', 'elec'):
        table[amen.capitalize()] = helper_amen(table['temp'].tolist(), amen)

    return table.drop('temp', axis=1)

In [5]:
# A helper function to read the base page where all companies are listed
def from_base_soup(soup):
    """Return the link targets found in the airline section of the base page.

    The airline section is the first element of class "browseAirlines"; an
    ``IndexError`` is raised when the page has none. The result is the list of
    ``href`` values of every ``<a>`` in that section that carries an href.
    """
    airline_section = soup.findAll(class_="browseAirlines")[0]

    hrefs = []
    for anchor in airline_section.findAll('a', href=True):
        hrefs.append(anchor['href'])

    return hrefs

# Begin the scraping process

In [None]:
# Read the base page: download the airline index and collect the href of
# every link in its airline section (one entry per listed company page)
url = 'https://www.seatguru.com/browseairlines/browseairlines.php'
list_urls_companies = from_base_soup(return_soup(url))

In [None]:
# Iterate through each company
# `res` collects the DataFrame that from_company_soup returns for each company page
res = []
for url_company in list_urls_companies:
    # Prefix the collected href with the site root (presumably the hrefs are site-relative — TODO confirm)
    url_company = 'https://www.seatguru.com' + url_company
    res.append(from_company_soup(return_soup(url_company)))
    
    # Tracker: print progress after every fifth company
    if len(res) % 5 == 0:
        print(len(res), 'companies collected')
    
# Stack the per-company frames into one table with a fresh 0..n-1 index
df_company = pd.concat(res,axis=0,ignore_index=True)
# NOTE(review): the save below is commented out, yet later cells load
# 'df_company' with pd.read_pickle — confirm the file exists before a fresh run.
# df_company.to_pickle('df_company')

In [60]:
# Iterate through each plane for each company

# A helper function to handle the map
def helper_map(url):
    """Fetch one plane page and return its comments tagged with ``url``.

    ``url`` is appended to the site root to build the page address. Returns
    the DataFrame from ``from_plane_soup`` with an added ``Url`` column holding
    the original ``url`` value, or ``None`` when that function returns ``None``
    (no comments on the page).
    """
    plane_comments = from_plane_soup(return_soup('https://www.seatguru.com' + url))

    # Only tag real results; a missing comment section stays None
    if plane_comments is not None:
        plane_comments['Url'] = url

    return plane_comments

# NOTE(review): df_company is re-loaded from the 'df_company' pickle file
# instead of reusing the in-memory frame; the to_pickle call that would write
# that file is commented out in this notebook — confirm the file exists.
df_company = pd.read_pickle('df_company')
# Fetch and parse every plane page; each entry is a DataFrame, or None when
# the page yielded no comments
res = df_company['Url'].map(helper_map)
# Per the pandas documentation, pd.concat silently drops None entries
# (and raises ValueError only when every entry is None)
df_plane = pd.concat(res.to_list(),axis=0,ignore_index=True)
# df_plane.to_pickle('df_plane')

In [80]:
# Join the two dfs together!
# NOTE(review): both frames are loaded from pickle files whose to_pickle calls
# are commented out in this notebook — confirm the files exist before running.
df_company = pd.read_pickle('df_company')
df_plane = pd.read_pickle('df_plane')
# Inner join (merge's default) on 'Url': adds the company-table columns
# (Category and the amenity flags) to every comment row.
# NOTE(review): if a Url occurs more than once in df_company, its comment rows
# are multiplied by the join — verify that Url is unique there.
df = df_plane.merge(df_company, on=['Url'])
#df.to_pickle('df_seatguru')

In [83]:
# Display the merged comments table
df

Unnamed: 0,Date,Plane,Seat,Name,Comment,Url,Category,Food,Wifi,Tv,Headphones,Elec
0,2019/12/29,Airbus A319 (319),3C,Abdallah E,From Istabul to Athens.\r\nSeat 3C doesnt have...,/airlines/Aegean_Airlines/Aegean_Airlines_Airb...,Narrowbody Jets,True,False,False,False,False
1,2018/06/09,Airbus A320 (320),,Stathis P,I traveled Athens-Zurich with Aegean Airlines ...,/airlines/Aegean_Airlines/Aegean_Airlines_Airb...,Narrowbody Jets,True,False,True,True,False
2,2018/05/25,Airbus A320 (320),15C,SeatGuru User,very tiny leg room as I am quite tall. flight ...,/airlines/Aegean_Airlines/Aegean_Airlines_Airb...,Narrowbody Jets,True,False,True,True,False
3,2018/04/19,Airbus A320 (320),12F,Bryan M,Exit row 12F was nice for the 2.5 hour ZRH-ATH...,/airlines/Aegean_Airlines/Aegean_Airlines_Airb...,Narrowbody Jets,True,False,True,True,False
4,2017/07/22,Airbus A320 (320),12C,SeatGuru User,It is a wonderful seat at no additional cost (...,/airlines/Aegean_Airlines/Aegean_Airlines_Airb...,Narrowbody Jets,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
17692,2017/02/18,Airbus A321 (321),10F,Dominika S,There's only 1 half window for this sest (wind...,/airlines/Wizzair/Wizz_Air_Airbus_A321.php,Narrowbody Jets,True,False,False,False,False
17693,2016/09/14,Airbus A321 (321),25B,SeatGuru User,"None of the seats recline, however, this is a ...",/airlines/Wizzair/Wizz_Air_Airbus_A321.php,Narrowbody Jets,True,False,False,False,False
17694,2017/08/15,Boeing 787-8 (788),,SeatGuru User,Rather comfy despite it being listed at 17 inc...,/airlines/Xiamen_Airlines/Xiamen_Airlines_Boei...,Widebody Jets,True,True,True,True,True
17695,2019/06/16,Boeing 787-9 (789),1A,Abdallah E,I was very satisfied with my flight. The Fligh...,/airlines/Xiamen_Airlines/Xiamen_Airlines_Boei...,Widebody Jets,True,True,True,True,True
