<a href="https://colab.research.google.com/github/AnthonyRobert377/AnthonyRobert377/blob/main/Web_Scraping_from_TrustPilot_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

from datetime import datetime

def extract_reviews(page_url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(page_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    review_articles = soup.find_all('article', attrs={"data-service-review-card-paper": True})

    reviews_data = []
    for article in review_articles:
        review_heading = None
        review_text = None
        review_date = None
        rating = None
        first_name = None
        last_name = None

        # Extract reviewer full name
        name_tag = article.find('span', attrs={"data-consumer-name-typography": True})
        if name_tag:
            full_name = name_tag.get_text(strip=True)
            name_parts = full_name.split(" ", 1)
            first_name = name_parts[0]
            last_name = name_parts[1] if len(name_parts) > 1 else None

        # Extract heading/title
        heading_tag = article.find('h2', attrs={"data-service-review-title-typography": True})
        if heading_tag:
            review_heading = heading_tag.get_text(strip=True)

        # Extract review text
        text_tag = article.find('p', attrs={"data-service-review-text-typography": True})
        if text_tag:
            review_text = text_tag.get_text(strip=True)

        # Extract ISO datetime
        time_tag = article.find('time')
        if time_tag and time_tag.has_attr('datetime'):
            review_date = time_tag['datetime']
            try:
                review_date = datetime.fromisoformat(review_date.replace("Z", "+00:00")).date()
            except ValueError:
                pass

        # Extract rating
        header_div = article.find('div', attrs={"data-service-review-rating": True})
        if header_div:
            rating = header_div.get("data-service-review-rating")

        reviews_data.append({
            "First Name": first_name,
            "Last Name": last_name,
            "Review Heading": review_heading,
            "Review Text": review_text,
            "Review Date": review_date,
            "Rating": rating
        })

    return reviews_data


def extract_all_reviews(base_url, from_page=1, to_page=6):
    all_reviews = []
    for page in range(from_page, to_page + 1):
        page_url = f"{base_url}?page={page}"
        print(f"Scraping: {page_url}")
        all_reviews.extend(extract_reviews(page_url))
        sleep(1)  # Pause to avoid throttling
    return pd.DataFrame(all_reviews)

# Example usage:
base_url = "https://www.trustpilot.com/review/spinfinite.com"
df_reviews = extract_all_reviews(base_url, from_page=1, to_page=4)
print(df_reviews)



Scraping: https://www.trustpilot.com/review/spinfinite.com?page=1
Scraping: https://www.trustpilot.com/review/spinfinite.com?page=2
Scraping: https://www.trustpilot.com/review/spinfinite.com?page=3
Scraping: https://www.trustpilot.com/review/spinfinite.com?page=4
   First Name Last Name                                     Review Heading  \
0      Bhavin      None                                               None   
1    Alistair      None                                               None   
2        Kate      None                                               None   
3       Meegs      None                                               None   
4    Alistair      None                               Spinfinite is decent   
..        ...       ...                                                ...   
91    Brandon    Butler              Been playing on Spinfinite for three…   
92  Francesca      None                      Mission variety keeps things…   
93     Bonnie      None           

In [6]:
from google.colab import files
# Convert the DataFrame to a CSV file and download it
df_reviews.to_csv('reviews.csv', index=False)
files.download('reviews.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>