# Data Scraping using BeautifulSoup (Dynamic)

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import os

# Scrape target: the reviewed domain, the paginated review URL prefix built
# from it, and the CSV file the scraped reviews are written to
site_to_review = "sendle.com"
base_url = f"https://www.trustpilot.com/review/{site_to_review}?page="
csv_filename = "_".join(site_to_review.split(".")) + ".csv"

# Request headers that make the HTTP calls look like they come from a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Helper that turns one review <section> into a record
def _extract_review(review):
    """
    Pull rating, title, text and date out of a single review element.

    Parameters:
    - review: A BeautifulSoup tag for one review section.

    Returns:
    - dict with the keys "rating", "title", "text" and "date", or None when
      any of the four values is missing or empty.
    """
    # Extract rating: the second word of the star image's alt text is parsed as an integer
    rating = None
    rating_tag = review.find("div", class_="star-rating_starRating__sdbkn")
    if rating_tag and rating_tag.img and "alt" in rating_tag.img.attrs:
        rating = int(rating_tag.img["alt"].split()[1])

    # Extract title
    title_tag = review.find("h2", class_="typography_heading-s__RxVny")
    title = title_tag.text.strip() if title_tag else None

    # Extract review text
    text_tag = review.find("p", class_="typography_body-l__v5JLj")
    text = text_tag.text.strip() if text_tag else None

    # Extract date
    date_tag = review.find("time")
    date = date_tag.text.strip() if date_tag else None

    # Only complete reviews are kept
    if rating and title and text and date:
        return {
            "rating": rating,
            "title": title,
            "text": text,
            "date": date
        }
    return None


# Function to scrape reviews from a single page
def scrape_page(page_number, timeout=30):
    """
    Scrape every complete review from one page of the review listing.

    Parameters:
    - page_number (int): Page index appended to the module-level `base_url`.
    - timeout (float): Seconds to wait for the server. Without a timeout a
      stalled connection would block the notebook indefinitely.

    Returns:
    - list[dict]: One dict per review with the keys "rating", "title", "text"
      and "date". Reviews missing any of these are skipped. An empty list is
      returned when the server answers with a non-success status code.

    Network errors raised by `requests.get` (including timeouts) propagate to
    the caller.
    """
    url = f"{base_url}{page_number}"
    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        # An error page carries no review markup; report it rather than
        # silently returning nothing
        print(f"Page {page_number} returned HTTP {response.status_code}. Skipping.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    reviews = []

    # NOTE(review): the class names used here and in _extract_review look
    # build-generated and may change when the site is redeployed - confirm
    # if a page unexpectedly yields no reviews.
    review_sections = soup.find_all("section", class_="styles_reviewContentwrapper__W9Vqf")
    for review in review_sections:
        try:
            record = _extract_review(review)
        except Exception as e:
            # One malformed review must not abort the rest of the page
            print(f"Error extracting review: {e}")
            continue
        if record is not None:
            reviews.append(record)

    return reviews

# Check if the CSV file already exists; it doubles as the scrape cache
if os.path.exists(csv_filename):
    print(f"{csv_filename} already exists. Skipping scraping.")
else:
    # Fetch the first page to find out how many pages there are.
    # The timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(base_url + "1", headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the pagination button with name="pagination-button-last"
    pagination_button = soup.find("a", {"name": "pagination-button-last"})
    if pagination_button:
        # The page count is read from the last word of the button's aria-label
        last_page_number = int(pagination_button["aria-label"].split()[-1])
        print(f"Last page number: {last_page_number}")
    else:
        # No "last page" button found: scrape only the first page
        last_page_number = 1

    # List to store all reviews
    all_reviews = []

    # Scrape all pages
    for page in range(1, last_page_number + 1):
        print(f"Scraping page {page}...")
        page_reviews = scrape_page(page)
        all_reviews.extend(page_reviews)

    if all_reviews:
        # Save all reviews to a CSV file
        with open(csv_filename, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["rating", "title", "text", "date"])
            writer.writeheader()
            writer.writerows(all_reviews)

        print(f"Scraped {len(all_reviews)} reviews and saved to {csv_filename}.")
    else:
        # A header-only file would satisfy the existence check above on every
        # later run and permanently disable scraping, so nothing is written.
        print(f"No reviews were scraped. {csv_filename} was not written.")

sendle_com.csv already exists. Skipping scraping.


# Preprocess the data.

In [3]:

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used by the preprocessing step, in a fixed order
for nltk_resource in ("punkt", "stopwords", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

# Stop-word set and lemmatizer, built on first use and shared by later calls
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


# Function to preprocess review text
def preprocess_text(text):
    """
    Preprocesses the input text by tokenizing, converting to lowercase,
    removing stopwords, and lemmatizing.

    Parameters:
    - text: The review text. None and NaN (what pandas yields for an empty
      CSV cell) are treated as empty; other non-string values are converted
      with str().

    Returns:
    - str: The kept tokens joined by single spaces. Only alphanumeric tokens
      that are not English stop words are kept. Returns "" for missing or
      blank input.
    """
    # Missing values: None, or a float NaN (the only float not equal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Blank input has no tokens; skip the tokenizer entirely
    if not text.strip():
        return ""

    stop_words, lemmatizer = _get_text_resources()

    # Tokenize the text into words
    words = word_tokenize(text.lower())  # Convert to lowercase for consistency

    # Remove stop words and lemmatize
    processed_words = [
        lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words
    ]

    # Join the processed words back into a single string
    return " ".join(processed_words)

def preprocess_csv(input_csv):
    """
    Reads a CSV file, preprocesses the 'text' column, and saves the
    preprocessed data to a new CSV file.

    Parameters:
    - input_csv (str): Path to a CSV file with a 'text' column.

    Output:
    - A new CSV file next to the input, named "<input stem>_processed<ext>",
      containing every original column plus 'processed_text'.

    Raises:
    - ValueError: If the CSV file has no 'text' column.
    """
    # Local import: the cell defining this function does not import os itself
    import os

    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Check if 'text' column exists
    if "text" not in df.columns:
        raise ValueError("The CSV file must have a 'text' column containing the reviews.")

    # Apply preprocessing to the 'text' column. Empty cells are read back as
    # NaN, so they are replaced with "" before the text function sees them.
    print("Preprocessing reviews...")
    df["processed_text"] = df["text"].fillna("").astype(str).apply(preprocess_text)

    # Insert the suffix before the extension only. A plain str.replace would
    # rewrite every ".csv" in the path and would return the input path itself
    # (overwriting the source file) when the suffix is not lowercase ".csv".
    root, ext = os.path.splitext(input_csv)
    output_csv = f"{root}_processed{ext}"
    df.to_csv(output_csv, index=False)
    print(f"Preprocessed data saved to {output_csv}")

# Run the preprocessing step on the scraped reviews when executed as a script/cell
if __name__ == "__main__":
    # CSV produced by the scraping step
    input_csv_file = "sendle_com.csv"
    try:
        preprocess_csv(input_csv_file)
    except Exception as err:
        # Report the failure instead of showing a traceback
        print(f"Error: {err}")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...


Preprocessing reviews...
Preprocessed data saved to sendle_com_processed.csv


# Sentiment Analysis using RoBERTa.

In [4]:
import pandas as pd
from transformers import pipeline
import os

def analyze_sentiment(input_csv):
    """
    Analyzes sentiment of reviews in the input CSV file using a RoBERTa model
    and saves the results to a new CSV file.

    Parameters:
    - input_csv (str): Path to the input CSV file containing a 'processed_text' column.

    Output:
    - A new CSV file named "<input stem>_with_sentiment<ext>" with a
      'sentiment' column added. Values are "NEGATIVE", "NEUTRAL", "POSITIVE",
      "UNKNOWN" for an unmapped label, or empty when a row has no text or the
      model call failed.
    - If that file already exists, nothing is computed and the function returns.

    Raises:
    - ValueError: If the CSV file has no 'processed_text' column.
    """
    # Insert the suffix before the extension only. A plain str.replace would
    # rewrite every ".csv" occurring anywhere in the path.
    root, ext = os.path.splitext(input_csv)
    output_csv = f"{root}_with_sentiment{ext}"

    # Check if the output CSV file already exists
    if os.path.exists(output_csv):
        print(f"{output_csv} already exists. Skipping sentiment analysis.")
        return

    # Load the input CSV file
    df = pd.read_csv(input_csv)

    # Check if 'processed_text' column exists
    if "processed_text" not in df.columns:
        raise ValueError("The CSV file must have a 'processed_text' column.")

    # Initialize the Hugging Face sentiment analysis pipeline
    sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

    # Map RoBERTa labels to human-readable sentiment labels
    label_map = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE"
    }

    # Function to analyze sentiment with error handling
    def analyze_text(text):
        # Empty cells are read back from CSV as NaN (a float); there is
        # nothing to classify, so skip the model call.
        if not isinstance(text, str) or not text.strip():
            return None
        try:
            # Truncate long reviews explicitly; over-length input otherwise
            # raises inside the model and the review is lost.
            # NOTE(review): 512 is the usual RoBERTa-base input limit -
            # confirm against this model's configuration.
            result = sentiment_analyzer(text, truncation=True, max_length=512)[0]
            return label_map.get(result["label"], "UNKNOWN")
        except Exception as e:
            print(f"Error analyzing text: {e}")
            return None

    # Analyze sentiment for each review
    print("Analyzing sentiment...")
    df["sentiment"] = df["processed_text"].apply(analyze_text)

    # Save the results to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Sentiment analysis complete. Results saved to {output_csv}")

# Run sentiment analysis on the preprocessed reviews when executed as a script/cell
if __name__ == "__main__":
    # CSV produced by the preprocessing step
    input_csv_file = "sendle_com_processed.csv"
    try:
        analyze_sentiment(input_csv_file)
    except Exception as err:
        # Report the failure instead of showing a traceback
        print(f"Error: {err}")

  from .autonotebook import tqdm as notebook_tqdm


sendle_com_processed_with_sentiment.csv already exists. Skipping sentiment analysis.


# Theme Detection (static keyword matching)

In [5]:
import pandas as pd
import os
import re

# Static themes and their keywords. Themes are checked in the order listed
# here and the first theme with a matching keyword wins.
_THEME_KEYWORDS = {
    "Delivery Issues": [
        "not picked", "missed", "failed", "lost", "late", "delayed", "undelivered", "error",
        "rescheduled", "problem", "loss", "stolen", "never arrived", "never received",
        "no pick up", "re-schedule", "never showed", "missed pickup", "missed delivery",
        "failed delivery", "lost parcel", "late delivery", "delayed delivery", "undelivered parcel",
        "parcel pickup", "pickup delay", "pickup failed", "not picked up",
        "address wrong", "wrong address", "wrong location", "wrong recipient", "wrong recipient",
        "disappeared", "disappearance", "vanished", "vanishing", "gone", "missing", "missing parcel", "disspear"
    ],
    "Poor Customer Service": [
        "poor", "no response", "unhelpful", "rude", "bad", "terrible", "support", "ignored",
        "unresponsive", "frustrating", "lack", "disappointing", "no manager",
        "manager never called", "follow up", "continuous problems", "zero contact",
        "no customer service", "no customer support", "no customer care", "no customer help",
        "unhelpful response", "refuse to tell", "vague response", "poor service", "useless", "customer service is worse", "no sign"
    ],
    "Unreliable Service": [
        "unreliable", "failed", "chaotic", "inconsistent", "unpredictable", "untrustworthy",
        "unstable", "erratic", "spotty", "hit", "miss", "undependable", "unreliability",
        "unreliable service", "unreliable company", "unreliable delivery", "unreliable courier",
        "ripoff", "failed to deliver", "not shipped", "never arrived", "delayed delivery"
    ],
    "Lack of Communication": [
        "no updates", "no response", "no info", "lack", "silent", "uncommunicative", "no feedback",
        "no follow", "no status", "no clarity", "no notification", "no confirmation", "no tracking",
        "no communication", "no contact", "no call", "no email", "no message", "no text", "no reply",
        "unresponsive", "ignored", "unhelpful", "unreliable", "unprofessional", "untrustworthy",
        "poor communication", "poor response", "poor feedback", "poor follow", "poor status"
    ],
    "Driver Problems": [
        "driver", "rude", "late", "no show", "unprofessional", "lost", "careless", "aggressive",
        "negligent", "confused", "error", "incompetent", "reckless", "unreliable",
        "never shows", "no show driver", "did not show up", "failed pick up",
        "driver never arrived", "driver never showed", "driver never came", "driver never picked up",
        "swearing", "swore", "sworn", "swear", "cursing", "cursed", "curse", "cussing", "cussed",
        "crash", "crashed", "crashing", "accident", "accidental", "accidentally", "collision", "No one came"
    ],
    "Parcel Handling Problems": [
        "lost", "destroyed", "returned", "damaged", "missing", "broken", "opened", "stolen",
        "misplaced", "mishandled", "crushed", "wet", "torn", "tampered", "ruined", "spoiled",
        "mangled", "smashed", "dented", "cracked", "scratched", "shattered", "crumpled", "soaked",
        "smash", "crush", "crumple", "soak", "soaked", "smashed", "crushed", "crumpled", "soaked",
        "thrown", "throw", "toss", "tossed", "throwing", "tossing", "throw away", "toss away",
        "throwing away", "tossing away", "throw out", "toss out", "throwing out", "tossing out", "curbside"
    ],
    "Cost and Value Concerns": [
        "affordable", "expensive", "overpriced", "costly", "pricey", "cheap", "unreasonable",
        "value", "fees", "hidden", "charges", "expensive", "money", "waste", "budget", "pricing",
        "reasonable", "inexpensive", "cost-effective", "cost-efficient", "cost-saving", "charge"
    ],
    "Convenience and Ease": [
        "easy", "convenient", "simple", "smooth", "quick", "efficient", "accessible",
        "flexible", "intuitive", "hassle", "user-friendly", "straightforward", "seamless", "Ease"
    ],
    "Timeliness and Speed": [
        "delayed", "late", "slow", "fast", "timely", "prompt", "speedy", "on time",
        "expedited", "rush", "quick", "efficient", "time-sensitive", "punctual", "swift",
        "timeframe", "time", "time-consuming", "time-wasting", "time-management", "time-saving",
        "time-efficient", "time-critical", "time-sensitive", "time-consuming", "time-wasting",
        "wait", "waiting", "waiting time", "waiting period", "waiting list", "waiting room",
        "waiting area", "waiting line", "waiting game", "waiting period", "waiting time", "after", "takes days", "days", "how quickly"
    ],
    "Service Variability": [
        "varies", "inconsistent", "unpredictable", "spotty", "mixed", "hit", "miss",
        "dependent", "disparity", "uneven", "fluctuating", "varying", "inconclusive", "inconclusive"
    ],
    "Frustration and Stress": [
        "frustrating", "stressful", "annoying", "irritating", "infuriating", "maddening",
        "exasperating", "disappointing", "upsetting", "aggravating",
        "stress", "frustration", "disappointment", "worst", "bad", "terrible", "horrible", "awful",
        "hate", "dislike", "displeased", "unhappy", "regret", "regretful",
        "angry", "anger", "mad", "madness", "irritated", "irritation", "annoyed", "annoyance",
        "shocking", "Dont use", "ridiculous", "No word", "never shop", "t use", "joke", "fooled"
    ],
    "Shipping Issues": [
        "picked up", "dropped off", "pickup", "drop off", "pickup failed", "parcel", "not picked",
        "pick-up error", "late pickup", "missing pickup", "not picked up", "drop off issue",
        "high volume", "pick-up delay", "never picked up", "outsourced my packages", "haven't received", "delivered never", "never got scanned"
    ],
    "Positive Experiences": [
        "amazing", "great", "fantastic", "wonderful", "satisfying", "pleasant",
        "impressive", "awesome", "top-notch", "smooth", "outstanding", "worth", "happy", "pleased",
        "excellent", "love", "recommend", "recommendation", "recommendable",
        "joy", "joyful", "satisfied", "satisfaction", "happy", "happiness", "pleased", "pleasure",
        "impressed", "impressive", "awesome", "awesomeness", "top-notch", "outstanding", "worth", "that's all I need",
        # The comma after "good price" is required: without it Python joins
        # the two adjacent literals into one keyword, "good priceworked hard".
        "excellent", "is good", "love", "recommend", "recommendation", "recommendable", "quickly", "helped", "perfect", "as ordered", "Thank you", "good price",
        "worked hard", "help resolve issues", "is helpful", "helpful staff", "helpful team", "Good", "wont use any other", "fair price", "sweet", "perfect", "Easier to use", "was then able"
    ],
    "Positive Feedback": [
        "amazing", "great", "fantastic", "wonderful", "satisfying", "pleased", "impressive",
        "awesome", "top-notch", "outstanding", "happy", "worth", "excellent", "is good", "best", "love", "incredible", "Reliable", "friendly ", "well", "Darling", "Accurate"
    ],
    "Negative Experiences": [
        "frustrating", "infuriating", "upsetting", "terrible", "awful", "horrible", "worst", "dislike",
        "hate", "displeased", "unhappy", "disappointment", "stress", "maddening", "irritating",
        "annoying", "exasperating", "scam"
    ]
}


def _compile_theme_patterns(theme_keywords):
    """Build one case-insensitive, word-bounded regex per theme, keeping theme order."""
    return [
        (
            theme,
            re.compile(
                r'\b(?:' + "|".join(re.escape(keyword) for keyword in keywords) + r')\b',
                flags=re.IGNORECASE,
            ),
        )
        for theme, keywords in theme_keywords.items()
    ]


def _first_matching_theme(text, theme_patterns):
    """Return the first theme whose pattern occurs in `text`, else "No Theme Detected"."""
    for theme, pattern in theme_patterns:
        if pattern.search(text):
            return theme
    return "No Theme Detected"


def detect_themes_static(input_csv):
    """
    Label each review with the first static theme whose keywords appear in it.

    Parameters:
    - input_csv (str): Path to a CSV file with a 'text' column.

    Output:
    - A new CSV file named "<input stem>_with_static_themes<ext>" containing
      the original columns plus 'combined_text' (the review text with missing
      values replaced by "") and 'theme'.
    - If that file already exists, nothing is computed and the function returns.

    Raises:
    - ValueError: If the CSV file has no 'text' column.
    """
    # Insert the suffix before the extension only. A plain str.replace would
    # rewrite every ".csv" occurring anywhere in the path.
    root, ext = os.path.splitext(input_csv)
    output_csv = f"{root}_with_static_themes{ext}"

    # Check if the output CSV file already exists
    if os.path.exists(output_csv):
        print(f"{output_csv} already exists. Skipping theme detection.")
        return

    # Load the CSV file
    df = pd.read_csv(input_csv)

    # Check if 'text' column exists
    if "text" not in df.columns:
        raise ValueError("The CSV file must have a 'text' column containing the reviews.")

    # Only the 'text' column is scanned (the title is not included). Missing
    # values become "" and every value is a string so the regex search
    # cannot fail on a non-string cell.
    df["combined_text"] = df["text"].fillna("").astype(str)

    # One compiled alternation per theme replaces a fresh regex per keyword
    # per review; a theme matches exactly when any of its keywords matches.
    theme_patterns = _compile_theme_patterns(_THEME_KEYWORDS)

    # Assign the first matching theme to each review
    df["theme"] = [
        _first_matching_theme(text, theme_patterns) for text in df["combined_text"]
    ]

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Static theme detection complete. Results saved to {output_csv}")

# Run static theme detection on the sentiment-annotated reviews when executed as a script/cell
if __name__ == "__main__":
    # CSV produced by the sentiment analysis step
    input_csv_file = "sendle_com_processed_with_sentiment.csv"

    try:
        detect_themes_static(input_csv_file)
    except Exception as err:
        # Report the failure instead of showing a traceback
        print(f"Error: {err}")

Static theme detection complete. Results saved to sendle_com_processed_with_sentiment_with_static_themes.csv
