In [None]:
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time


def setup_request():
    """Build the browser-like HTTP headers sent with each page request.

    Returns:
        dict: A new header-name -> value mapping on every call.
    """
    header_pairs = (
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("DNT", "1"),
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
    )
    return dict(header_pairs)


def get_restaurant_info(soup):
    """Extract restaurant-level details from a parsed restaurant page.

    Every field is scraped on a best-effort basis: when an element is missing
    or its text has an unexpected shape, the field keeps its default
    (``'N/A'`` for the name, ``''`` for everything else).

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        dict: String values under the keys ``name``, ``price_level``,
        ``cuisine_type``, ``total_rating``, ``total_reviews``, ``food_rating``,
        ``service_rating``, ``value_rating``, ``atmosphere_rating``,
        ``ranking``, ``city``, ``address`` and ``phone_no``.
    """
    # Only "element absent / text in an unexpected shape" errors are tolerated.
    # A bare ``except`` would also swallow KeyboardInterrupt and real bugs.
    missing = (AttributeError, IndexError, ValueError)
    bubbles_suffix = ' of 5 bubbles'
    rating_keys = ('food_rating', 'service_rating', 'value_rating', 'atmosphere_rating')

    restaurant_info = {
        'name': '',
        'price_level': '',
        'cuisine_type': '',
        'total_rating': '',
        'total_reviews': '',
        'food_rating': '',
        'service_rating': '',
        'value_rating': '',
        'atmosphere_rating': '',
        'ranking': '',
        'city': '',
        'address': '',
        'phone_no': ''
    }

    try:
        restaurant_info['name'] = soup.find('h1').text.strip()
    except missing:
        restaurant_info['name'] = 'N/A'

    try:
        # First comma-separated part is stored as the price level, the rest as cuisines.
        info_parts = soup.find('span', class_='rRtyp').text.strip().split(', ')
        restaurant_info['price_level'] = info_parts[0]
        restaurant_info['cuisine_type'] = ', '.join(info_parts[1:])
    except missing as e:
        logging.warning(f"Could not extract price level / cuisine: {e}")

    detail_cards = soup.find_all('div', attrs={'data-automation': 'OVERVIEW_TAB_ELEMENT'})
    if detail_cards:
        rating_info = detail_cards[0]
        try:
            restaurant_info['total_rating'] = rating_info.find('span', class_='biGQs').text.strip()
            reviews_text = rating_info.find('div', class_='KxBGd').text.strip()
            # Drop the label and thousands separators so callers can int() the count.
            restaurant_info['total_reviews'] = reviews_text.replace(' reviews', '').replace(',', '')
        except missing as e:
            logging.warning(f"Could not extract overall rating / review count: {e}")

        try:
            rating_container = rating_info.find('div', class_='khxWm')
            if rating_container:
                rating_category = rating_container.find_all('div', class_='YwaWb')
                if len(rating_category) >= 4:
                    # The first four category cards map, in order, onto rating_keys.
                    for key, category in zip(rating_keys, rating_category):
                        title = category.find('svg', class_='UctUV').find('title')
                        restaurant_info[key] = title.text.strip().replace(bubbles_suffix, '')
        except Exception as e:
            logging.error(f"Error extracting detailed ratings: {str(e)}")

        try:
            ranking_tag = rating_info.find_all('a', class_='BMQDV')
            if len(ranking_tag) > 1:
                words = ranking_tag[1].find('span').text.strip().split()
                restaurant_info['ranking'] = words[0].replace('#', '')
                # Everything after the word "in" is stored as the city.
                in_index = words.index('in')
                restaurant_info['city'] = ' '.join(words[in_index + 1:])
        except missing as e:
            logging.warning(f"Could not extract ranking / city: {e}")

    if len(detail_cards) > 2:
        location_info = detail_cards[2]
        try:
            restaurant_info['address'] = location_info.find('span', class_='biGQs').text.strip()
        except missing as e:
            logging.warning(f"Could not extract address: {e}")

        try:
            phone_link = location_info.find('a', attrs={'aria-label': 'Call'})
            if phone_link:
                restaurant_info['phone_no'] = phone_link.get('href').replace('tel:', '')
        except Exception as e:
            logging.error(f"Error extracting phone number: {str(e)}")

    return restaurant_info


def scrape_reviews(soup):
    """Parse every review card found on an already-downloaded page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        list[dict]: One dict per review card with the string keys ``rating``,
        ``title``, ``text`` and ``date``; a value is ``''`` when the
        corresponding element is absent from the card.
    """
    bubbles_suffix = ' of 5 bubbles'
    reviews = []

    # No sleep in this loop: the page is already in memory, so pausing per
    # review only slows parsing down. Request pacing belongs to the caller.
    for card in soup.find_all('div', attrs={'data-automation': 'reviewCard'}):
        review_data = {'rating': '', 'title': '', 'text': '', 'date': ''}

        rating_icon = card.find('svg', class_='UctUV')
        # The score text is read from the icon's <title> child; check that too
        # so an icon without one leaves the rating empty instead of raising.
        rating_title = rating_icon.find('title') if rating_icon is not None else None
        if rating_title is not None:
            review_data['rating'] = rating_title.text.strip().replace(bubbles_suffix, '')

        title_element = card.find('div', attrs={'data-test-target': 'review-title'})
        if title_element is not None:
            review_data['title'] = title_element.text.strip()

        text_element = card.find('div', attrs={'data-test-target': 'review-body'})
        if text_element is not None:
            review_data['text'] = text_element.text.strip()

        date_element = card.find('div', class_='neAPm')
        if date_element is not None:
            child_divs = date_element.find_all('div')
            if child_divs:
                review_data['date'] = child_divs[0].text.strip().replace('Written ', '')

        reviews.append(review_data)

    return reviews


def generate_review_urls(base_url, total_reviews, reviews_per_page=10):
    """List the URL of every review page for a restaurant.

    Args:
        base_url: URL of the first review page; it must contain ``Reviews-``
            for the later pages to differ from it.
        total_reviews: Total number of reviews to cover.
        reviews_per_page: Step between page offsets.

    Returns:
        list[str]: ``base_url`` for offset 0, then one URL per further offset
        with ``or<offset>-`` inserted after ``Reviews-``.
    """
    def url_for(offset):
        # The first page keeps the plain URL; later pages carry their offset.
        if offset == 0:
            return base_url
        return base_url.replace("Reviews-", f"Reviews-or{offset}-")

    return [url_for(offset) for offset in range(0, total_reviews, reviews_per_page)]


def save_to_csv(restaurant_info, reviews, filename):
    """Write one CSV row per review, each prefixed with the restaurant details.

    Args:
        restaurant_info: Mapping with the thirteen restaurant keys listed in
            ``info_keys`` below.
        reviews: Iterable of mappings with ``rating``, ``title``, ``text``
            and ``date``.
        filename: Destination path; an existing file is overwritten.
    """
    info_keys = ('name', 'price_level', 'cuisine_type', 'total_rating',
                 'total_reviews', 'food_rating', 'service_rating', 'value_rating',
                 'atmosphere_rating', 'ranking', 'city', 'address', 'phone_no')
    review_keys = ('rating', 'title', 'text', 'date')
    header = ['RESTAURANT_NAME', 'PRICE_LEVEL', 'CUISINE_TYPE', 'TOTAL_RATING',
              'TOTAL_REVIEWS', 'FOOD_RATING', 'SERVICE_RATING', 'VALUE_RATING',
              'ATMOSPHERE_RATING', 'RANKING', 'CITY', 'ADDRESS', 'PHONE_NO',
              'RATING', 'REVIEW_TITLE', 'REVIEW_TEXT', 'REVIEW_DATE']

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for review in reviews:
            # Restaurant columns are repeated on every review row.
            restaurant_cells = [restaurant_info[key] for key in info_keys]
            review_cells = [review[key] for key in review_keys]
            writer.writerow(restaurant_cells + review_cells)


def main():
    """Scrape one restaurant's details and all of its review pages into a CSV.

    The landing page supplies the restaurant details and the total review
    count; that count determines how many paginated review URLs are fetched.
    Network failures, including HTTP error statuses, are logged rather than
    raised. Nothing is saved when the review count could not be scraped.
    """
    base_url = 'https://www.tripadvisor.com.my/Restaurant_Review-g298313-d15006574-Reviews-Mean_Mince-Petaling_Jaya_Petaling_District_Selangor.html'
    headers = setup_request()
    # requests applies no timeout by default, so a stalled connection would hang forever.
    request_timeout = 30

    try:
        response = requests.get(base_url, headers=headers, timeout=request_timeout)
        # Treat 403/404/5xx as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        time.sleep(5)

        restaurant_info = get_restaurant_info(soup)
        try:
            total_reviews = int(restaurant_info['total_reviews'])
        except ValueError:
            # The count stays '' when it could not be scraped; int('') raises.
            logging.error(f"Could not determine the review count for {base_url}; nothing saved.")
            return

        print(f"Total reviews found: {total_reviews}")
        urls = generate_review_urls(base_url, total_reviews)

        all_reviews = []

        for url in urls:
            print(f"Scraping: {url}")
            res = requests.get(url, headers=headers, timeout=request_timeout)
            res.raise_for_status()
            page_soup = BeautifulSoup(res.content, 'html.parser')
            reviews = scrape_reviews(page_soup)
            all_reviews.extend(reviews)
            time.sleep(3)  # pause between page requests

        save_to_csv(restaurant_info, all_reviews, 'mince_meat_reviews_all.csv')
        print("✅ All reviews scraped and saved!")

    except requests.exceptions.RequestException as e:
        logging.error(f"Error during requests to {base_url} : {str(e)}")


if __name__ == "__main__":
    main()

In [None]:
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time


def setup_request():
    """Build the browser-like HTTP headers sent with each page request.

    Returns:
        dict: A new header-name -> value mapping on every call.
    """
    header_pairs = (
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("DNT", "1"),
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
    )
    return dict(header_pairs)


def get_restaurant_info(soup):
    """Extract restaurant-level details from a parsed restaurant page.

    Every field is scraped on a best-effort basis: a missing element no
    longer aborts the scrape with ``AttributeError``. The affected field
    keeps its default (``'N/A'`` for the name, ``''`` otherwise) and a
    warning is logged.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        dict: String values under the keys ``name``, ``price_level``,
        ``cuisine_type``, ``total_rating``, ``total_reviews``, ``food_rating``,
        ``service_rating``, ``value_rating``, ``atmosphere_rating``,
        ``ranking``, ``city``, ``address`` and ``phone_no``.
    """
    # Errors that mean "element absent or text in an unexpected shape".
    missing = (AttributeError, IndexError, ValueError)
    bubbles_suffix = ' of 5 bubbles'
    rating_keys = ('food_rating', 'service_rating', 'value_rating', 'atmosphere_rating')

    restaurant_info = {
        'name': '',
        'price_level': '',
        'cuisine_type': '',
        'total_rating': '',
        'total_reviews': '',
        'food_rating': '',
        'service_rating': '',
        'value_rating': '',
        'atmosphere_rating': '',
        'ranking': '',
        'city': '',
        'address': '',
        'phone_no': ''
    }

    try:
        restaurant_info['name'] = soup.find('h1').text.strip()
    except missing:
        restaurant_info['name'] = 'N/A'

    # General info: first comma-separated part is stored as the price level,
    # the rest as cuisines.
    try:
        info_parts = soup.find('span', class_='KxBGd').text.strip().split(', ')
        restaurant_info['price_level'] = info_parts[0]
        restaurant_info['cuisine_type'] = ', '.join(info_parts[1:])
    except missing as e:
        logging.warning(f"Could not extract price level / cuisine: {e}")

    # Rating and review info
    detail_cards = soup.find_all(
        'div', attrs={'data-automation': 'OVERVIEW_TAB_ELEMENT'})
    if detail_cards:
        rating_info = detail_cards[0]
        try:
            restaurant_info['total_rating'] = rating_info.find(
                'span', class_='uuBRH').text.strip()
            reviews_text = rating_info.find('div', class_='KxBGd').text.strip()
            restaurant_info['total_reviews'] = reviews_text.replace(' reviews', '')
        except missing as e:
            logging.warning(f"Could not extract overall rating / review count: {e}")

        # Detailed ratings
        try:
            rating_container = rating_info.find('div', class_='khxWm')
            if rating_container:
                rating_category = rating_container.find_all(
                    'div', class_='YwaWb')
                if len(rating_category) >= 4:
                    # The first four category cards map, in order, onto rating_keys.
                    for key, category in zip(rating_keys, rating_category):
                        title = category.find('svg', class_='UctUV').find('title')
                        restaurant_info[key] = title.text.strip().replace(bubbles_suffix, '')
        except Exception as e:
            logging.error(f"Error extracting detailed ratings: {str(e)}")

        # Ranking and city info
        try:
            ranking_tag = rating_info.find_all('a', class_='ffHql')
            if len(ranking_tag) > 1:
                words = ranking_tag[1].find('span').text.strip().split()
                restaurant_info['ranking'] = words[0].replace('#', '')
                # Everything after the word "in" is stored as the city.
                in_index = words.index('in')
                restaurant_info['city'] = ' '.join(words[in_index + 1:])
        except missing as e:
            logging.warning(f"Could not extract ranking / city: {e}")

    # Address and phone info
    if len(detail_cards) > 2:
        location_info = detail_cards[2]
        try:
            restaurant_info['address'] = location_info.find(
                'span', class_='biGQs').text.strip()
        except missing as e:
            logging.warning(f"Could not extract address: {e}")

        # Phone number
        try:
            phone_link = location_info.find('a', attrs={'aria-label': 'Call'})
            if phone_link:
                restaurant_info['phone_no'] = phone_link.get(
                    'href').replace('tel:', '')
        except Exception as e:
            logging.error(f"Error extracting phone number: {str(e)}")

    return restaurant_info


def scrape_reviews(soup):
    """Parse every review card found on an already-downloaded page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        list[dict]: One dict per review card with the string keys ``rating``,
        ``title``, ``text`` and ``date``; a value is ``''`` when the
        corresponding element is absent from the card.
    """
    bubbles_suffix = ' of 5 bubbles'
    reviews = []

    # No sleep in this loop: the page is already in memory, so pausing per
    # review only slows parsing down. Request pacing belongs to the caller.
    for card in soup.find_all('div', attrs={'data-automation': 'reviewCard'}):
        review_data = {'rating': '', 'title': '', 'text': '', 'date': ''}

        rating_icon = card.find('svg', class_='UctUV')
        # The score text is read from the icon's <title> child; check that too
        # so an icon without one leaves the rating empty instead of raising.
        rating_title = rating_icon.find('title') if rating_icon is not None else None
        if rating_title is not None:
            review_data['rating'] = rating_title.text.strip().replace(bubbles_suffix, '')

        title_element = card.find('div', attrs={'data-test-target': 'review-title'})
        if title_element is not None:
            review_data['title'] = title_element.text.strip()

        text_element = card.find('div', attrs={'data-test-target': 'review-body'})
        if text_element is not None:
            review_data['text'] = text_element.text.strip()

        date_element = card.find('div', class_='neAPm')
        if date_element is not None:
            child_divs = date_element.find_all('div')
            if child_divs:
                review_data['date'] = child_divs[0].text.strip().replace('Written ', '')

        reviews.append(review_data)

    return reviews


def save_to_csv(restaurant_info, reviews, filename):
    """Write one CSV row per review, each prefixed with the restaurant details.

    Args:
        restaurant_info: Mapping with the thirteen restaurant keys listed in
            ``info_keys`` below.
        reviews: Iterable of mappings with ``rating``, ``title``, ``text``
            and ``date``.
        filename: Destination path; an existing file is overwritten.
    """
    info_keys = ('name', 'price_level', 'cuisine_type', 'total_rating',
                 'total_reviews', 'food_rating', 'service_rating', 'value_rating',
                 'atmosphere_rating', 'ranking', 'city', 'address', 'phone_no')
    review_keys = ('rating', 'title', 'text', 'date')
    header = ['RESTAURANT_NAME', 'PRICE_LEVEL', 'CUISINE_TYPE', 'TOTAL_RATING',
              'TOTAL_REVIEWS', 'FOOD_RATING', 'SERVICE_RATING', 'VALUE_RATING',
              'ATMOSPHERE_RATING', 'RANKING', 'CITY', 'ADDRESS', 'PHONE_NO',
              'RATING', 'REVIEW_TITLE', 'REVIEW_TEXT', 'REVIEW_DATE']

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header first, then the review rows
        writer.writerow(header)
        for review in reviews:
            # Restaurant columns are repeated on every review row.
            restaurant_cells = [restaurant_info[key] for key in info_keys]
            review_cells = [review[key] for key in review_keys]
            writer.writerow(restaurant_cells + review_cells)


def main():
    """Scrape one restaurant page (details plus its visible reviews) into a CSV.

    Network failures, including HTTP error statuses, are logged rather than
    raised; in that case no CSV is written.
    """
    url = 'https://www.tripadvisor.com.my/Restaurant_Review-g298317-d16090258-Reviews-Spade_s_Burger-Subang_Jaya_Petaling_District_Selangor.html'
    headers = setup_request()

    try:
        # requests applies no timeout by default, so a stalled connection
        # would hang forever.
        response = requests.get(url, headers=headers, timeout=30)
        # Treat 403/404/5xx as failures instead of parsing an error page and
        # reporting success on an empty result.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        time.sleep(10)

        restaurant_info = get_restaurant_info(soup)
        reviews = scrape_reviews(soup)

        save_to_csv(restaurant_info, reviews,
                    'spades_burger_reviews.csv')
        print("All information saved successfully")

    except requests.exceptions.RequestException as e:
        logging.error(f"Error during requests to {url} : {str(e)}")


if __name__ == "__main__":
    main()

In [None]:
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time


def setup_request():
    """Build the browser-like HTTP headers sent with each page request.

    Returns:
        dict: A new header-name -> value mapping on every call.
    """
    header_pairs = (
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("DNT", "1"),
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
    )
    return dict(header_pairs)


def get_restaurant_name(soup):
    """Return the stripped text of the page's first ``<h1>``, or ``'N/A'``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        str: The heading text, or ``'N/A'`` when there is no usable heading.
    """
    try:
        return soup.find('h1').text.strip()
    except AttributeError:
        # A missing heading makes find() return None. Catching only this keeps
        # KeyboardInterrupt and unrelated bugs visible, unlike a bare except.
        return 'N/A'


def scrape_reviews(soup):
    """Parse every review card found on an already-downloaded page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        list[dict]: One dict per review card with the string keys ``rating``,
        ``title``, ``text`` and ``date``; a value is ``''`` when the
        corresponding element is absent from the card.
    """
    bubbles_suffix = ' of 5 bubbles'
    reviews = []

    # No sleep in this loop: the page is already in memory, so pausing per
    # review only slows parsing down. Request pacing belongs to the caller.
    for card in soup.find_all('div', attrs={'data-automation': 'reviewCard'}):
        review_data = {'rating': '', 'title': '', 'text': '', 'date': ''}

        rating_icon = card.find('svg', class_='UctUV')
        # The score text is read from the icon's <title> child; check that too
        # so an icon without one leaves the rating empty instead of raising.
        rating_title = rating_icon.find('title') if rating_icon is not None else None
        if rating_title is not None:
            review_data['rating'] = rating_title.text.strip().replace(bubbles_suffix, '')

        title_element = card.find('div', attrs={'data-test-target': 'review-title'})
        if title_element is not None:
            review_data['title'] = title_element.text.strip()

        text_element = card.find('div', attrs={'data-test-target': 'review-body'})
        if text_element is not None:
            review_data['text'] = text_element.text.strip()

        date_element = card.find('div', class_='neAPm')
        if date_element is not None:
            child_divs = date_element.find_all('div')
            if child_divs:
                review_data['date'] = child_divs[0].text.strip().replace('Written ', '')

        reviews.append(review_data)

    return reviews


def save_to_csv(restaurant_name, reviews, filename):
    """Write one CSV row per review, each prefixed with the restaurant name.

    Args:
        restaurant_name: Value repeated in the first column of every row.
        reviews: Iterable of mappings with ``rating``, ``title``, ``text``
            and ``date``.
        filename: Destination path; an existing file is overwritten.
    """
    columns = ['RESTAURANT_NAME', 'RATING', 'REVIEW_TITLE', 'REVIEW_TEXT', 'REVIEW_DATE']
    review_keys = ('rating', 'title', 'text', 'date')

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)
        # Rows are produced lazily, one per review, in input order.
        writer.writerows(
            [restaurant_name] + [review[key] for key in review_keys]
            for review in reviews
        )


def main():
    """Scrape the reviews visible on one restaurant page into a CSV.

    Network failures, including HTTP error statuses, are logged rather than
    raised; in that case no CSV is written.
    """
    url = 'https://www.tripadvisor.com.my/Restaurant_Review-g298570-d27935260-Reviews-Woodfire_Kl_ttdi-Kuala_Lumpur_Wilayah_Persekutuan.html'
    headers = setup_request()

    try:
        # requests applies no timeout by default, so a stalled connection
        # would hang forever.
        response = requests.get(url, headers=headers, timeout=30)
        # Treat 403/404/5xx as failures instead of parsing an error page and
        # reporting success on an empty result.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        time.sleep(5)

        restaurant_name = get_restaurant_name(soup)
        reviews = scrape_reviews(soup)

        save_to_csv(restaurant_name, reviews, 'woodfire_kl_reviews.csv')
        print(f"✅ Reviews for '{restaurant_name}' saved successfully!")

    except requests.exceptions.RequestException as e:
        logging.error(f"Error during requests to {url} : {str(e)}")


if __name__ == "__main__":
    main()

In [None]:
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time


def setup_request():
    """Build the browser-like HTTP headers sent with each page request.

    Returns:
        dict: A new header-name -> value mapping on every call.
    """
    header_pairs = (
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("DNT", "1"),
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
    )
    return dict(header_pairs)


def get_restaurant_name(soup):
    """Return the stripped text of the page's first ``<h1>``, or ``'N/A'``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        str: The heading text, or ``'N/A'`` when there is no usable heading.
    """
    try:
        return soup.find('h1').text.strip()
    except AttributeError:
        # A missing heading makes find() return None. Catching only this keeps
        # KeyboardInterrupt and unrelated bugs visible, unlike a bare except.
        return 'N/A'


def scrape_reviews(soup):
    """Parse every review card found on an already-downloaded page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        list[dict]: One dict per review card with the string keys ``rating``,
        ``title``, ``text`` and ``date``; a value is ``''`` when the
        corresponding element is absent from the card.
    """
    bubbles_suffix = ' of 5 bubbles'
    reviews = []

    # No sleep in this loop: the page is already in memory, so pausing per
    # review only slows parsing down. Request pacing belongs to the caller.
    for card in soup.find_all('div', attrs={'data-automation': 'reviewCard'}):
        review_data = {'rating': '', 'title': '', 'text': '', 'date': ''}

        rating_icon = card.find('svg', class_='UctUV')
        # The score text is read from the icon's <title> child; check that too
        # so an icon without one leaves the rating empty instead of raising.
        rating_title = rating_icon.find('title') if rating_icon is not None else None
        if rating_title is not None:
            review_data['rating'] = rating_title.text.strip().replace(bubbles_suffix, '')

        title_element = card.find('div', attrs={'data-test-target': 'review-title'})
        if title_element is not None:
            review_data['title'] = title_element.text.strip()

        text_element = card.find('div', attrs={'data-test-target': 'review-body'})
        if text_element is not None:
            review_data['text'] = text_element.text.strip()

        date_element = card.find('div', class_='neAPm')
        if date_element is not None:
            child_divs = date_element.find_all('div')
            if child_divs:
                review_data['date'] = child_divs[0].text.strip().replace('Written ', '')

        reviews.append(review_data)

    return reviews


def get_next_page(soup):
    """Return the ``href`` of the page's "next" pagination link, if any.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The link's ``href`` value, or ``None`` when the link or its ``href``
        attribute is absent.
    """
    try:
        # Find the "Next" page link by its button classes
        next_button = soup.find('a', class_='ui_button nav next primary')
        if next_button is None:
            return None
        return next_button.get('href')
    except AttributeError:
        # Covers a missing/invalid soup object. A bare except here would also
        # hide KeyboardInterrupt and genuine bugs.
        return None


def save_to_csv(restaurant_name, reviews, filename):
    """Write one CSV row per review, each prefixed with the restaurant name.

    Args:
        restaurant_name: Value repeated in the first column of every row.
        reviews: Iterable of mappings with ``rating``, ``title``, ``text``
            and ``date``.
        filename: Destination path; an existing file is overwritten.
    """
    columns = ['RESTAURANT_NAME', 'RATING', 'REVIEW_TITLE', 'REVIEW_TEXT', 'REVIEW_DATE']
    review_keys = ('rating', 'title', 'text', 'date')

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)
        # Rows are produced lazily, one per review, in input order.
        writer.writerows(
            [restaurant_name] + [review[key] for key in review_keys]
            for review in reviews
        )


def main():
    """Follow the "next" links from a restaurant page and save all reviews to CSV.

    Each page is fetched and parsed, and its reviews are collected. Scraping
    stops when no next link is found. Network failures, including HTTP error
    statuses, are logged rather than raised; in that case no CSV is written.
    """
    url = 'https://www.tripadvisor.com.my/Restaurant_Review-g298570-d25153683-Reviews-Burger_And_Lobster_Klcc-Kuala_Lumpur_Wilayah_Persekutuan.html'
    headers = setup_request()

    all_reviews = []
    page_number = 1

    try:
        while url:
            print(f"Scraping page {page_number}...")
            # requests applies no timeout by default, so a stalled connection
            # would hang forever.
            response = requests.get(url, headers=headers, timeout=30)
            # Treat 403/404/5xx as failures instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            restaurant_name = get_restaurant_name(soup)
            reviews = scrape_reviews(soup)

            all_reviews.extend(reviews)  # Append new reviews to the list

            # Check if there's a next page
            url = get_next_page(soup)
            if url:
                # Ensure we get the full URL if it's a relative link
                if not url.startswith('http'):
                    base_url = 'https://www.tripadvisor.com.my'
                    url = base_url + url
                page_number += 1
                time.sleep(2)  # Respectful delay between requests
            else:
                print("No more pages found.")
                break

        # After scraping all pages, save the reviews to a CSV file
        save_to_csv(restaurant_name, all_reviews, 'burger_and_lobster_kl_all_reviews.csv')
        print(f"✅ All reviews for '{restaurant_name}' saved successfully!")

    except requests.exceptions.RequestException as e:
        logging.error(f"Error during requests to {url} : {str(e)}")


if __name__ == "__main__":
    main()



In [1]:
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time

def setup_request():
    """Build the browser-like HTTP headers sent with each page request.

    Returns:
        dict: A new header-name -> value mapping on every call.
    """
    header_pairs = (
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("DNT", "1"),
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
    )
    return dict(header_pairs)

def get_restaurant_name(soup):
    """Return the stripped text of the page's first ``<h1>``, or ``'N/A'``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        str: The heading text, or ``'N/A'`` when there is no usable heading.
    """
    try:
        return soup.find('h1').text.strip()
    except AttributeError:
        # A missing heading makes find() return None. Catching only this keeps
        # KeyboardInterrupt and unrelated bugs visible, unlike a bare except.
        return 'N/A'

def scrape_reviews(soup):
    """Parse every review card found on an already-downloaded page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        list[dict]: One dict per review card with the string keys ``rating``,
        ``title``, ``text`` and ``date``; a value is ``''`` when the
        corresponding element is absent from the card.
    """
    bubbles_suffix = ' of 5 bubbles'
    reviews = []

    # No sleep in this loop: the page is already in memory, so pausing per
    # review only slows parsing down. Request pacing belongs to the caller.
    for card in soup.find_all('div', attrs={'data-automation': 'reviewCard'}):
        review_data = {'rating': '', 'title': '', 'text': '', 'date': ''}

        rating_icon = card.find('svg', class_='UctUV')
        # The score text is read from the icon's <title> child; check that too
        # so an icon without one leaves the rating empty instead of raising.
        rating_title = rating_icon.find('title') if rating_icon is not None else None
        if rating_title is not None:
            review_data['rating'] = rating_title.text.strip().replace(bubbles_suffix, '')

        title_element = card.find('div', attrs={'data-test-target': 'review-title'})
        if title_element is not None:
            review_data['title'] = title_element.text.strip()

        text_element = card.find('div', attrs={'data-test-target': 'review-body'})
        if text_element is not None:
            review_data['text'] = text_element.text.strip()

        date_element = card.find('div', class_='neAPm')
        if date_element is not None:
            child_divs = date_element.find_all('div')
            if child_divs:
                review_data['date'] = child_divs[0].text.strip().replace('Written ', '')

        reviews.append(review_data)

    return reviews

def save_to_csv(restaurant_name, reviews, filename):
    """Write one CSV row per review, each prefixed with the restaurant name.

    Args:
        restaurant_name: Value repeated in the first column of every row.
        reviews: Iterable of mappings with ``rating``, ``title``, ``text``
            and ``date``.
        filename: Destination path; an existing file is overwritten.
    """
    columns = ['RESTAURANT_NAME', 'RATING', 'REVIEW_TITLE', 'REVIEW_TEXT', 'REVIEW_DATE']
    review_keys = ('rating', 'title', 'text', 'date')

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)
        # Rows are produced lazily, one per review, in input order.
        writer.writerows(
            [restaurant_name] + [review[key] for key in review_keys]
            for review in reviews
        )

def scrape_all_reviews(base_url):
    """Collect the reviews from every page of a restaurant's review listing.

    Pages are requested one after another until a page yields no reviews, a
    page has no next-page link, or a request fails (the failure is logged).

    Args:
        base_url: URL of the first review page; it must contain ``Reviews-``
            so that page offsets can be inserted.

    Returns:
        list[dict]: All review dicts gathered before the loop stopped.
    """
    headers = setup_request()  # identical for every page, so build them once
    reviews_per_page = 10
    offset = 0
    all_reviews = []

    while True:
        # The previous "&start=N" suffix was glued onto a URL that has no
        # query string, so it could not select a page. The offset is instead
        # inserted into the path ("...-Reviews-or10-..."), the scheme
        # generate_review_urls uses earlier in this notebook.
        # NOTE(review): confirm this is still TripAdvisor's paging scheme.
        if offset == 0:
            url = base_url
        else:
            url = base_url.replace("Reviews-", f"Reviews-or{offset}-")

        try:
            # requests applies no timeout by default; a stalled connection
            # would otherwise hang the loop forever.
            response = requests.get(url, headers=headers, timeout=30)
            # Treat 403/404/5xx as failures instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            time.sleep(5)

            # Scrape reviews from this page
            reviews = scrape_reviews(soup)
            if not reviews:
                break  # Stop if no reviews are found (i.e., we've reached the last page)

            all_reviews.extend(reviews)

            # Check if there's a next page
            next_page = soup.find('a', class_='unMKR')
            if next_page:
                offset += reviews_per_page
            else:
                break  # No more pages, stop the loop

        except requests.exceptions.RequestException as e:
            logging.error(f"Error during requests to {url} : {str(e)}")
            break

    return all_reviews

def main():
    """Scrape a restaurant's name and all of its reviews, then save them to CSV.

    The landing page is fetched once for the restaurant name; the reviews are
    then collected page by page via ``scrape_all_reviews``. Failures of the
    landing-page request, including HTTP error statuses, are logged rather
    than raised.
    """
    base_url = 'https://www.tripadvisor.com.my/Restaurant_Review-g298313-d7281339-Reviews-MyBurgerLab-Petaling_Jaya_Petaling_District_Selangor.html'

    try:
        headers = setup_request()
        # requests applies no timeout by default, so a stalled connection
        # would hang forever.
        response = requests.get(base_url, headers=headers, timeout=30)
        # Treat 403/404/5xx as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        time.sleep(5)

        restaurant_name = get_restaurant_name(soup)

        # Scrape all reviews across pages
        all_reviews = scrape_all_reviews(base_url)

        # NOTE(review): the URL above is MyBurgerLab but the output file is
        # named after Woodfire. This looks like a copy-paste leftover; confirm
        # the intended filename before relying on it.
        save_to_csv(restaurant_name, all_reviews, 'woodfire_kl_reviews_2a.csv')
        print(f"✅ Reviews for '{restaurant_name}' saved successfully!")

    except requests.exceptions.RequestException as e:
        logging.error(f"Error during requests to {base_url} : {str(e)}")

if __name__ == "__main__":
    main()


✅ Reviews for 'myBurgerLab' saved successfully!


In [None]:
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time


def setup_request():
    """Build the browser-like HTTP headers sent with each page request.

    Returns:
        dict: A new header-name -> value mapping on every call.
    """
    header_pairs = (
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("DNT", "1"),
        ("Connection", "keep-alive"),
        ("Upgrade-Insecure-Requests", "1"),
    )
    return dict(header_pairs)


def get_restaurant_name(soup):
    """Return the stripped text of the page's first ``<h1>``, or ``'N/A'``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        str: The heading text, or ``'N/A'`` when there is no usable heading.
    """
    try:
        return soup.find('h1').text.strip()
    except AttributeError:
        # A missing heading makes find() return None. Catching only this keeps
        # KeyboardInterrupt and unrelated bugs visible, unlike a bare except.
        return 'N/A'


def scrape_reviews(soup):
    """Parse every review card found on an already-downloaded page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.

    Returns:
        list[dict]: One dict per review card with the string keys ``rating``,
        ``title``, ``text`` and ``date``; a value is ``''`` when the
        corresponding element is absent from the card.
    """
    bubbles_suffix = ' of 5 bubbles'
    reviews = []

    # No sleep in this loop: the page is already in memory, so pausing per
    # review only slows parsing down. Request pacing belongs to the caller.
    for card in soup.find_all('div', attrs={'data-automation': 'reviewCard'}):
        review_data = {'rating': '', 'title': '', 'text': '', 'date': ''}

        rating_icon = card.find('svg', class_='UctUV')
        # The score text is read from the icon's <title> child; check that too
        # so an icon without one leaves the rating empty instead of raising.
        rating_title = rating_icon.find('title') if rating_icon is not None else None
        if rating_title is not None:
            review_data['rating'] = rating_title.text.strip().replace(bubbles_suffix, '')

        title_element = card.find('div', attrs={'data-test-target': 'review-title'})
        if title_element is not None:
            review_data['title'] = title_element.text.strip()

        text_element = card.find('div', attrs={'data-test-target': 'review-body'})
        if text_element is not None:
            review_data['text'] = text_element.text.strip()

        date_element = card.find('div', class_='neAPm')
        if date_element is not None:
            child_divs = date_element.find_all('div')
            if child_divs:
                review_data['date'] = child_divs[0].text.strip().replace('Written ', '')

        reviews.append(review_data)

    return reviews


def get_next_page(soup):
    """Return the ``href`` of the page's "next" pagination link, if any.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The link's ``href`` value, or ``None`` when the link or its ``href``
        attribute is absent.
    """
    try:
        # Find the "Next" page link by its class
        next_button = soup.find('a', class_='BrOJk')  # Replace with your found class
        if next_button is None:
            return None
        return next_button.get('href')
    except AttributeError:
        # Covers a missing/invalid soup object. A bare except here would also
        # hide KeyboardInterrupt and genuine bugs.
        return None


def save_to_csv(restaurant_name, reviews, filename):
    """Write one CSV row per review, each prefixed with the restaurant name.

    Args:
        restaurant_name: Value repeated in the first column of every row.
        reviews: Iterable of mappings with ``rating``, ``title``, ``text``
            and ``date``.
        filename: Destination path; an existing file is overwritten.
    """
    columns = ['RESTAURANT_NAME', 'RATING', 'REVIEW_TITLE', 'REVIEW_TEXT', 'REVIEW_DATE']
    review_keys = ('rating', 'title', 'text', 'date')

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)
        # Rows are produced lazily, one per review, in input order.
        writer.writerows(
            [restaurant_name] + [review[key] for key in review_keys]
            for review in reviews
        )


def main():
    """Follow the "next" links from a restaurant page and save all reviews to CSV.

    Each page is fetched and parsed, and its reviews are collected. Scraping
    stops when no next link is found. Network failures, including HTTP error
    statuses, are logged rather than raised; in that case no CSV is written.
    """
    url = 'https://www.tripadvisor.com.my/Restaurant_Review-g298570-d25153683-Reviews-Burger_And_Lobster_Klcc-Kuala_Lumpur_Wilayah_Persekutuan.html'
    headers = setup_request()

    all_reviews = []
    page_number = 1

    try:
        while url:
            print(f"Scraping page {page_number}...")
            # requests applies no timeout by default, so a stalled connection
            # would hang forever.
            response = requests.get(url, headers=headers, timeout=30)
            # Treat 403/404/5xx as failures instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            restaurant_name = get_restaurant_name(soup)
            reviews = scrape_reviews(soup)

            all_reviews.extend(reviews)  # Append new reviews to the list

            # Check if there's a next page
            next_page = get_next_page(soup)
            if next_page:
                # Ensure we get the full URL if it's a relative link
                if not next_page.startswith('http'):
                    base_url = 'https://www.tripadvisor.com.my'
                    next_page = base_url + next_page
                url = next_page
                page_number += 1
                time.sleep(2)  # Respectful delay between requests
            else:
                print("No more pages found.")
                break

        # After scraping all pages, save the reviews to a CSV file
        save_to_csv(restaurant_name, all_reviews, 'burger_and_lobster_kl_all_reviews_2a.csv')
        print(f"✅ All reviews for '{restaurant_name}' saved successfully!")

    except requests.exceptions.RequestException as e:
        logging.error(f"Error during requests to {url} : {str(e)}")


if __name__ == "__main__":
    main()


In [25]:
import pandas as pd
import os

def list_and_combine_csv_files(input_folder, output_file, include_files):
    """Concatenate selected CSV files from a folder into a single CSV file.

    Only files that exist in ``input_folder``, end in ``.csv`` and are named
    in ``include_files`` are read. They are combined in sorted filename
    order, so the output does not depend on the directory listing order.
    Requested names missing from the folder are reported, not silently
    skipped.

    Args:
        input_folder: Directory to look for CSV files in.
        output_file: Path of the combined CSV to write (without the index).
        include_files: Filenames to combine; duplicates are harmless.

    Returns:
        None. Nothing is written when no requested file is found.
    """
    folder_files = os.listdir(input_folder)

    print(f"Files found in folder: {folder_files}")  # Debugging step

    wanted = set(include_files)
    # os.listdir returns names in arbitrary order; sort for a reproducible result.
    selected = sorted(
        name for name in folder_files
        if name.endswith('.csv') and name in wanted
    )

    not_found = sorted(wanted.difference(folder_files))
    if not_found:
        print(f"Warning: requested files not found in folder (skipped): {not_found}")

    all_data = []
    for file_name in selected:
        print(f"Adding file: {file_name}")  # Optional: print which file is being added
        all_data.append(pd.read_csv(os.path.join(input_folder, file_name)))

    if not all_data:
        print("No matching CSV files found to combine.")
        return

    combined_data = pd.concat(all_data, ignore_index=True)
    combined_data.to_csv(output_file, index=False)
    print(f"✅ Selected CSV files combined successfully into '{output_file}'.")

# Usage: combine the per-restaurant review CSVs found in one folder into a single file.
input_folder = r'C:\Users\tiemi\Downloads'  # Raw string so the backslashes are not treated as escape sequences
output_file = 'tripadvisor_reviews_burgershops_kl.csv'    # No folder given, so it is written relative to the current working directory
include_files = ['myburgerlab_all_reviews.csv', 'burger_and_lobster_kl_all_reviews.csv', 'burger_bakar_abang_burn_reviews.csv', 'burger_ji_legend_all_reviews.csv', 'burger_on_16_reviews.csv', 'fuelshack_kl_all_reviews.csv', 'gastro_sentral_all_reviews.csv', 'mince_meat_reviews.csv', 'myburgerlab_all_reviews.csv', 'spades_burger_reviews.csv', 'woodfire_kl_all_reviews.csv']  # Filenames to combine. NOTE(review): 'myburgerlab_all_reviews.csv' is listed twice; the list is only used for a membership test, so the file is still read once.

list_and_combine_csv_files(input_folder, output_file, include_files)

Files found in folder: ['.ipynb_checkpoints', '08622551.pdf', '1-s2.0-S0378437119309999-main.pdf', '1. Regular Expressions - Literals.ipynb', '1.6.2-packet-tracer----configure-basic-router-settings---physical-mode.pka', '14.3.5-packet-tracer---basic-router-configuration-review.pka', '15.6.1-packet-tracer---configure-ipv4-and-ipv6-static-and-default-routes.pka', '1982   2021 Employed persons by industry and state.csv', '1_SMART_Financial_Goals_Worksheet - SN01082115.docx', '1_SMART_Financial_Goals_Worksheet - SN01082115.pdf', '1_SMART_Financial_Goals_Worksheet.docx', '2. Regular Expressions - Metacharacters.ipynb', '2.2.13 Packet Tracer - Point-to-Point Single-Area OSPFv2 Configuration.pdf', '2.2.13 Packet Tracer - Point-to-Point Single-Area OSPFv2 Configuration_COMPLETE.pdf', '2.3.11 Packet Tracer - Determine the DR and BDR.pdf', '2.3.11 Packet Tracer - Determine the DR and BDR_COMPLETE.docx', '2.3.11 Packet Tracer - Determine the DR and BDR_COMPLETE.pdf', '2.4.11 Packet Tracer - Modif