# In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import random
from concurrent.futures import ThreadPoolExecutor, as_completed


# Path of the Excel workbook that lists the URLs to scrape (placeholder: replace it).
file_Path = r'Your file path!'
# To load the URLs from a SQL database instead, replace the read_excel call below.
# NOTE: this read runs at import time, so the workbook, sheet and column named
# here must exist before the module can even be loaded.
URL_list = pd.read_excel(file_Path, sheet_name = 'your sheet name')['your columns'].to_list()

# Pool of desktop browser User-Agent strings (Chrome, Firefox and Edge on
# Windows 10); get_soup picks one at random for every request attempt.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.864.48 Safari/537.36 Edg/91.0.864.48",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.818.51 Safari/537.36 Edg/90.0.818.51",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.774.63 Safari/537.36 Edg/89.0.774.63"
]

# Both HTTP and HTTPS traffic is routed through a SOCKS5 proxy on
# 127.0.0.1:8443. The "socks5h" scheme asks the proxy to resolve hostnames;
# SOCKS support in requests needs the optional `requests[socks]` extra.
proxies = {
    "http": "socks5h://127.0.0.1:8443",
    "https": "socks5h://127.0.0.1:8443",
}

def get_soup(url, max_retries=3, timeout=15):
    """Download *url* through the configured proxy and parse the HTML.

    Each attempt sends a freshly chosen User-Agent from ``USER_AGENTS``.
    An attempt counts as failed when the request raises (connection error,
    proxy error, timeout, malformed URL) or when the server answers with an
    HTTP error status; failures are printed and retried.

    Args:
        url: Address of the page to fetch.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the connection and for each read before
            the attempt is abandoned. Without it a stalled connection would
            block the calling worker thread indefinitely.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser``, or ``None``
        when every attempt failed.
    """
    for _ in range(max_retries):
        headers = {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept-Language': 'en-US,en;q=0.9',
            # NOTE(review): advertising 'br' assumes a Brotli decoder package
            # is installed next to requests; otherwise a Brotli-compressed
            # body cannot be decoded -- confirm for the target environment.
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.google.com/',
            'Connection': 'keep-alive',
        }
        try:
            resp = requests.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
            # Treat 4xx/5xx answers as failures so they are retried instead of
            # being parsed as if they were the requested page.
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            continue
        return BeautifulSoup(resp.text, 'html.parser')
    return None

def _text_or_dash(tag):
    """Return the stripped text of a parsed tag, or "-" when the tag is absent."""
    return tag.text.strip() if tag is not None else "-"


def process_url(url):
    """Scrape one seller page and return its data as a flat row.

    Args:
        url: Address of the seller page; it is stored in the row under
            ``'seller'``.

    Returns:
        A dict with the keys ``seller``, ``sellerName``, ``30daysRating``,
        ``90daysRating``, ``365daysRating``, ``lifetimeRating``, ``reviews``,
        ``5starPercentage`` ... ``1starPercentage`` (in that order). Any
        element that is not found on the page is reported as ``"-"``;
        ``reviews`` is a list of strings and is empty when nothing matched.
        When the page could not be downloaded at all, the same keys are
        returned with placeholder values, so every row has one schema and
        still identifies its URL.
    """
    # Output key -> id of the <span> that holds the value.
    rating_span_ids = {
        "30daysRating": "effective-timeperiod-rating-thirty-description",
        "90daysRating": "effective-timeperiod-rating-ninety-description",
        "365daysRating": "effective-timeperiod-rating-year-description",
        "lifetimeRating": "effective-timeperiod-rating-lifetime-description",
    }
    # Output key -> id of the <span> inside the rating histogram table.
    star_span_ids = {
        "5starPercentage": "percentFiveStar",
        "4starPercentage": "percentFourStar",
        "3starPercentage": "percentThreeStar",
        "2starPercentage": "percentTwoStar",
        "1starPercentage": "percentOneStar",
    }

    soup = get_soup(url)
    if soup is None:
        # Download failed: emit a placeholder row that keeps the URL.
        row = {"seller": url, "sellerName": "-"}
        row.update(dict.fromkeys(rating_span_ids, "-"))
        row["reviews"] = []
        row.update(dict.fromkeys(star_span_ids, "-"))
        return row

    row = {
        "seller": url,
        "sellerName": _text_or_dash(soup.find('h1', {'id': "seller-name"})),
    }
    for key, span_id in rating_span_ids.items():
        row[key] = _text_or_dash(soup.find('span', {'id': span_id}))

    # NOTE(review): this matches only spans whose id is exactly "-text";
    # confirm that is the id the target pages use for review bodies.
    row["reviews"] = [
        review.text.strip() for review in soup.find_all('span', {'id': "-text"})
    ]

    # The percentages are looked up inside the histogram table only; when the
    # table is missing, all five fall back to "-".
    rating_table = soup.find('table', {'id': 'ratingHistogram'})
    for key, span_id in star_span_ids.items():
        cell = None
        if rating_table is not None:
            cell = rating_table.find('span', {'id': span_id})
        row[key] = _text_or_dash(cell)

    return row

def main():
    """Scrape every URL in ``URL_list`` concurrently and save the rows to Excel.

    The URLs are processed by ``process_url`` on a pool of 20 worker threads.
    Rows are written in the order of ``URL_list``, not in completion order.
    A URL whose worker raised is reported on stdout and still gets a row
    (holding only its URL), so one failure cannot discard the whole run.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {executor.submit(process_url, url): url for url in URL_list}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results[url] = future.result()
            except Exception as exc:
                # Boundary of the worker pool: report and keep going so the
                # remaining URLs are still collected.
                print(f"{url} generated an exception: {exc}")

    # Ensure the results follow the order of URL_list. A URL whose worker
    # raised has no entry in `results`; fall back to a row with just the URL
    # (pandas fills the other columns with NaN) instead of raising KeyError.
    ordered_results = [results.get(url, {"seller": url}) for url in URL_list]
    df = pd.DataFrame(ordered_results)
    df.to_excel(r'your file path!')

# Start the scrape only when this file is executed directly as a script.
if __name__ == "__main__":
    main()