<a href="https://colab.research.google.com/github/Chiemela-tech/Data_annotation_and_web_scraping_of_airline_reviews/blob/main/Webscrapping_for_Reviews_on_Trustpilot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import files

In [None]:
# Base URL and total number of pages to scrape.
# The URL ends with "page=" and carries no page number of its own: the
# scraping loop supplies the number for each request. A number left here
# (e.g. "?page=2") would be fused with the appended one ("?page=21", ...).
base_url = "https://uk.trustpilot.com/review/www.budgetair.com?page="
total_pages = 100

In [None]:
def fill_missing(data, target_length):
    """Return `data` padded at the end with 'N/A' up to `target_length` items."""
    return data + ['N/A'] * (target_length - len(data))


def _stripped_texts(page_soup, tag, css_class):
    """Return the stripped text of every `tag` element carrying `css_class`."""
    return [element.text.strip() for element in page_soup.find_all(tag, {'class': css_class})]


# Drop any query string already present on base_url so the page number is
# passed exactly once via `params` (otherwise "?page=2" + "1" would request
# page 21). Any other query parameters on base_url are dropped as well.
reviews_url = base_url.split('?', 1)[0]

# One DataFrame per scraped page; concatenated once after the loop instead of
# re-copying the accumulated data on every iteration.
page_frames = []

for page in range(1, total_pages + 1):
    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = requests.get(reviews_url, params={'page': page}, timeout=30)
    # `soup` is intentionally left at module level: a later cell reads the
    # <title> of the last page parsed here.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract data for the current page, keyed by output column name.
    # NOTE(review): each field is collected independently across the whole
    # page and short lists are padded at the END, so a review missing one
    # field shifts the following values of that column up by one row.
    # Parsing per review card would avoid this - confirm against the markup.
    columns = {
        'Customer Name': _stripped_texts(soup, 'span', 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'),
        'Review Count': _stripped_texts(soup, 'span', 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l'),
        'Review Title': _stripped_texts(soup, 'h2', 'typography_heading-s__f7029 typography_appearance-default__AAY17'),
        'Review Text': _stripped_texts(soup, 'p', 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'),
        'Date of Experience': _stripped_texts(soup, 'p', 'typography_body-m__xgxZ_ typography_appearance-default__AAY17'),
        # Star ratings are read from the alt text of images whose alt contains 'Rated'.
        'Star Rating': [img['alt'] for img in soup.find_all('img', {'alt': lambda x: x and 'Rated' in x})],
    }

    # Pad every column to the longest one so the DataFrame can be built.
    max_length = max(len(values) for values in columns.values())
    page_frames.append(pd.DataFrame({
        label: fill_missing(values, max_length)
        for label, values in columns.items()
    }))

# pd.concat raises on an empty list, so fall back to an empty DataFrame
# when no pages were requested (total_pages < 1).
all_data = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

# Now 'all_data' contains data from all pages
print(all_data.head())

      Customer Name Review Count  \
0  Unhappy Customer  77K reviews   
1          loh sung  63K reviews   
2            Taahir  34K reviews   
3          Scott G.     1 review   
4    mohinder sohal     1 review   

                                        Review Title  \
0                                   Useless chat bot   
1  Budget air always give the low rate but giving...   
2                                      Great service   
3                   Just book direct through airline   
4                 I booked to Amritsar India and my…   

                                         Review Text  \
0  I wanted to reschedule my flight but unable to...   
1  Budget air always give the low rate ,but never...   
2  Either book through the airline directly (shou...   
3  I booked to Amritsar India and my departure wa...   
4  I booked a return flight from London to Athens...   

                   Date of Experience             Star Rating  
0  Date of experience: 14 August 2023  Rated 

In [None]:
# Derive the Excel sheet name from the <title> of the most recently parsed page.
title_tag = soup.find("title")
# find() returns None when the page has no <title>; fall back to an empty string.
title_text = title_tag.get_text().strip() if title_tag is not None else ""
# Excel sheet names may not contain \ / ? * [ ] : and are limited to 31
# characters, so remove the forbidden characters before truncating.
sheet_safe_title = title_text.translate(str.maketrans("", "", "\\/?*[]:"))
# An empty sheet name is invalid too; 'Sheet1' mirrors pandas' default name.
agencyname = sheet_safe_title[:31].strip() or "Sheet1"

In [None]:
# Save all scraped reviews to one Excel workbook; the sheet is named after the agency
output_file = 'reviews_data.xlsx'
all_data.to_excel(output_file, sheet_name=agencyname, index=False)

# Download the saved workbook from the Colab session
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>