In [9]:
# import re
# import requests
# from bs4 import BeautifulSoup
# from collections import ChainMap 


# def extract(page=1, page_size=50):
#     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}
#     url = f"https://www.airlinequality.com/airline-reviews/kenya-airways/page/{page}/?sortby=post_date%3ADesc&pagesize={page_size}"
    
#     response = requests.get(url, headers)
#     return response.status_code
        
    
# print(extract(page=0, page_size=200))

In [10]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [11]:
def extract(page):
    """Download one page of the Kenya Airways review listing and parse it.

    Args:
        page: Page number substituted into the listing URL. The URL asks for
            100 reviews per page, ordered by post date descending.

    Returns:
        A BeautifulSoup tree built from the response body with the lxml parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            complete within the timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}
    url = f"https://www.airlinequality.com/airline-reviews/kenya-airways/page/{page}/?sortby=post_date%3ADesc&pagesize=100"
    # Pass the headers by keyword: the second positional parameter of
    # requests.get is `params`, so a positional dict would be appended to the
    # query string instead of being sent as HTTP headers.
    # The timeout keeps a stalled connection from blocking the notebook forever.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.content, 'lxml')
    return soup


review_list = []

# Placeholder stored when a field cannot be located in the review markup.
_MISSING = 'NA'

# Banner text that may prefix the review body; it is removed from the stored review.
_VERIFICATION_PREFIXES = ("✅Trip Verified|", "Not Verified|")

# Suffixes of the `review-rating-header <name>` cells holding free-text details.
_DETAIL_FIELDS = ('aircraft', 'type_of_traveller', 'cabin_flown', 'route')

# Suffixes of the `review-rating-header <name>` cells holding star ratings.
_STAR_FIELDS = ('seat_comfort', 'cabin_staff_service', 'food_and_beverages', 'inflight_entertainment',
                'ground_service', 'wifi_and_connectivity', 'value_for_money')


def _rating_header(article, field):
    """Return the `review-rating-header <field>` cell of an article, or None."""
    return article.find('td', class_=f'review-rating-header {field}')


def _parse_general(article):
    """Extract the seven general fields of a review, using 'NA' for any that are missing."""
    # Date when the review was published.
    meta = article.find('meta', itemprop="datePublished")
    date_published = meta.get('content', '') if meta is not None else _MISSING

    # Summary title of the review.
    heading = article.find("h2", class_="text_header")
    summary_title = heading.text if heading is not None else _MISSING

    # Country of origin: the first parenthesised text in the author line.
    author_line = article.find("h3", class_="text_sub_header userStatusWrapper")
    country_match = re.search(r'\((.*?)\)', author_line.text) if author_line is not None else None
    country = country_match.group(1) if country_match else _MISSING

    # The review body carries both the verification banner and the review text,
    # so it is looked up once and used for both fields.
    body = article.find('div', class_='text_content', itemprop='reviewBody')
    if body is None:
        verification_status = _MISSING
        review = _MISSING
    else:
        review = body.get_text(strip=True)
        status_match = re.search(r'(Trip Verified|Not Verified)', review)
        verification_status = status_match.group() if status_match else _MISSING
        for prefix in _VERIFICATION_PREFIXES:
            if review.startswith(prefix):
                review = review[len(prefix):].strip()

    # Overall rating (out of ten), kept as the text found in the page.
    rating = article.find('span', itemprop="ratingValue")
    ratings = rating.get_text(strip=True) if rating is not None else _MISSING

    # Whether the reviewer recommends the airline.
    recommended = _rating_header(article, 'recommended')
    verdict = recommended.find_next('td', class_="review-value") if recommended is not None else None
    recommendation = verdict.text if verdict is not None else _MISSING

    return {
        'date_published': date_published,
        'summary_title': summary_title,
        'country': country,
        'trip_verified': verification_status,
        'review': review,
        'ratings_10': ratings,
        'recommend': recommendation
    }


def _parse_details(article):
    """Extract aircraft, traveller type, cabin and route; absent headers are left out."""
    details = {}
    for field in _DETAIL_FIELDS:
        header = _rating_header(article, field)
        if header is None:
            # No header cell for this field: leave the key out of the record.
            continue
        value_cell = header.find_next('td', class_='review-value')
        details[field] = value_cell.get_text(strip=True) if value_cell is not None else _MISSING
    return details


def _parse_stars(article):
    """Count filled stars per rated category; absent headers are left out."""
    stars = {}
    for field in _STAR_FIELDS:
        header = _rating_header(article, field)
        if header is None:
            # No header cell for this category: leave the key out of the record.
            continue
        stars_cell = header.find_next('td', class_='review-rating-stars')
        if stars_cell is None:
            stars[field] = _MISSING
        else:
            stars[field] = len(stars_cell.find_all('span', class_='star fill'))
    return stars


def transform(soup):
    """Parse every review article in a page and append one record per review to `review_list`.

    Each record always contains the keys date_published, summary_title,
    country, trip_verified, review, ratings_10 and recommend; a value of 'NA'
    means the corresponding element was not found. Detail keys (aircraft,
    type_of_traveller, cabin_flown, route) and star-rating keys are present
    only when the article has the matching `review-rating-header` cell; they
    hold 'NA' when that header exists but its value cell does not.

    Missing elements are detected with explicit None checks rather than by
    catching every exception, so unexpected errors are raised instead of being
    recorded as 'NA'.

    Args:
        soup: Parsed page exposing `find_all`, as returned by `extract`.

    Returns:
        None. The parsed records are appended to the module-level `review_list`.
    """
    for article in soup.find_all('article', itemprop="review"):
        record = {
            **_parse_general(article),
            **_parse_details(article),
            **_parse_stars(article),
        }
        review_list.append(record)
    return

In [12]:
# transform() appends to the module-level review_list, so empty it first:
# otherwise re-running this cell would store the same pages a second time.
review_list.clear()

# Fetch and parse listing pages 1 through 5.
for page_number in range(1, 6):
    print(f'Scraping from page {page_number}')
    raw_data = extract(page=page_number)
    transform(raw_data)

In [13]:
# Build one row per scraped review; a key absent from a record shows up as a
# missing value in that row (see the non-null counts reported by df.info()).
df = pd.DataFrame(review_list)
# Preview the first rows.
df.head()

Unnamed: 0,date_published,summary_title,country,trip_verified,review,ratings_10,recommend,type_of_traveller,cabin_flown,route,seat_comfort,cabin_staff_service,food_and_beverages,inflight_entertainment,ground_service,wifi_and_connectivity,value_for_money,aircraft
0,2023-08-28,"""Dreadful airline""",United Kingdom,Trip Verified,Dreadful airline - after emailing customer ser...,3,no,Business,Business Class,Nairobi to London,1.0,5.0,1.0,1.0,5.0,1.0,1.0,
1,2023-08-26,"""distasteful and shabby treatment2",South Africa,Not Verified,The lady at check-in was very rude and insiste...,2,no,Business,Economy Class,Nairobi to Johannesburg,2.0,2.0,3.0,2.0,1.0,1.0,1.0,
2,2023-08-23,"""avoid Kenya Airways in future""",United Kingdom,Trip Verified,Dismal from start to end. Not all of the staff...,3,no,Family Leisure,Business Class,London to Mauritius via Nairobi,4.0,1.0,1.0,2.0,1.0,1.0,2.0,Boeing 787-8 / 737-800
3,2023-08-22,"""Food was horrible""",United States,Not Verified,Airline was 1 hour late to board passengers an...,1,no,Family Leisure,Economy Class,Nairobi to New York,1.0,4.0,1.0,1.0,1.0,,1.0,
4,2023-08-19,"""really disappointed me""",Uganda,Trip Verified,Kenya Airways has really disappointed me for a...,2,no,Solo Leisure,Economy Class,Nairobi to Entebbe,3.0,3.0,5.0,,4.0,,5.0,


In [14]:
# (number of rows, number of columns) of the scraped frame.
df.shape

(435, 18)

In [15]:
# Column names with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date_published          435 non-null    object 
 1   summary_title           435 non-null    object 
 2   country                 435 non-null    object 
 3   trip_verified           435 non-null    object 
 4   review                  435 non-null    object 
 5   ratings_10              435 non-null    object 
 6   recommend               435 non-null    object 
 7   type_of_traveller       268 non-null    object 
 8   cabin_flown             435 non-null    object 
 9   route                   268 non-null    object 
 10  seat_comfort            394 non-null    float64
 11  cabin_staff_service     393 non-null    float64
 12  food_and_beverages      377 non-null    float64
 13  inflight_entertainment  345 non-null    float64
 14  ground_service          257 non-null    fl

In [9]:
import re

html_string = '<div class="sku-bg"> <span style="color:#000 !important;">ID : </span> RG1101 </div>'

# Drop every tag, then trim the whitespace left around the remaining text.
tag_pattern = re.compile(r'<.*?>')
value = tag_pattern.sub('', html_string).strip()

# Capture the token after "ID :". This is not the cell's last expression,
# so its result is evaluated but not displayed.
re.search(r'ID\s*:\s*(\w+)', value).group(1)

print(value)


ID :  RG1101


In [14]:

# Strip the tags from the first `div.sku-bg` match and capture the token after "ID :".
# NOTE(review): `response` is never defined in this notebook (the cell output is a
# NameError); presumably it is a Scrapy response object, given `.css(...).get()` — confirm.
re.search(r'ID\s*:\s*(\w+)', re.sub(r'<.*?>', '', response.css('div.sku-bg').get()).strip()).group(1)


NameError: name 'response' is not defined

In [15]:
# Strip the tags, then capture the token after "ID :".
# Reads `html_string` (assigned in an earlier cell) because `html_code`, which holds
# the same markup, is only assigned in a later cell and would be undefined
# when the notebook is run top to bottom.
re.search(r'ID\s*:\s*(\w+)', re.sub(r'<.*?>', '', html_string).strip()).group(1)

'RG1101'

In [11]:
import re

html_code = '<div class="sku-bg"> <span style="color:#000 !important;">ID : </span> RG1101 </div>'
# Remove the whole label <span> together with the opening and closing <div>
# tags, leaving only the ID text once the result is stripped.
markup_pattern = re.compile(r'<span.*?>.*?</span>|<div.*?>|</div>')
markup_pattern.sub('', html_code).strip()

'RG1101'

In [13]:
# Same clean-up as the previous cell: delete the label <span> and the <div> tags, then trim.
re.compile(r'<span.*?>.*?</span>|<div.*?>|</div>').sub('', html_code).strip()


'RG1101'