# Scraping

## Metacritic

Scrapes user reviews from https://metacritic.com for each game listed in `metacritic_games.txt` (one game slug per line).

In [None]:
import os
import sys
# Absolute path of the parent of the current working directory.
module_path = os.path.abspath(os.pardir)
print("INIT module_path: ", module_path)
# Register that directory on the import path only once, so re-running the
# cell does not keep appending duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Directory the scraper below reads its input list from and writes its CSV to.
DATA_DIR = f"{module_path}/data"

from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import bs4
import time
import pandas as pd
from tqdm.notebook import tqdm

def scrape_metacritic_reviews(reviews_per_game: int = 200, games_file: str = "metacritic_games.txt", out_file: str = "metacritic_reviews.csv"):
    """Scrape Metacritic user reviews for every game listed in ``games_file``.

    Each non-blank line of ``DATA_DIR/games_file`` is used as the game slug in
    ``https://www.metacritic.com/game/<slug>/user-reviews``. The page is
    scrolled in headless Chrome until it stops growing or at least
    ``reviews_per_game`` review cards are present, then parsed with
    BeautifulSoup.

    Args:
        reviews_per_game: Maximum number of reviews recorded per game.
        games_file: Name of the slug list inside ``DATA_DIR``.
        out_file: Name of the CSV written inside ``DATA_DIR`` with the columns
            ``site, user, review_target, review, score, max_score``.
    """
    # Read the slugs before starting the browser so a missing file does not
    # leave a Chrome process behind. The handle must NOT be called `out_file`:
    # that would rebind the parameter and break the CSV path further down.
    with open(f"{DATA_DIR}/{games_file}", "r", encoding="utf-8") as games_fh:
        games = [line.strip() for line in games_fh if line.strip()]

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    rows_list: list[list] = []

    def scrape_game(i: int, game: str):
        """Load one game's user-review page and append its reviews to rows_list."""
        driver.get(f"https://www.metacritic.com/game/{game}/user-reviews")
        # Scroll until the page stops growing or enough review cards are loaded.
        SCROLL_PAUSE_TIME = 0.5
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            loaded = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='product-review']")
            if new_height == last_height or len(loaded) >= reviews_per_game:
                break
            last_height = new_height

        # Use BeautifulSoup to do actual parsing, since it's way faster
        soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
        all_reviews = soup.find_all("div", {"data-testid": "product-review"})[:reviews_per_game]
        recorded = 0
        for elem in all_reviews:
            user_elem = elem.find("a", "c-siteReviewHeader_username")
            score_box = elem.find("div", "c-siteReviewScore")
            quote_box = elem.find("div", "c-siteReview_quote")
            score_elem = score_box.find("span") if score_box is not None else None
            quote_elem = quote_box.find("span") if quote_box is not None else None
            if user_elem is None or score_elem is None or quote_elem is None:
                # Skip an incomplete card instead of aborting the whole run.
                continue
            rows_list.append(["metacritic", user_elem.text.strip(), game, quote_elem.text.strip(), score_elem.text.strip(), "10"])
            recorded += 1
        print(f"    {game:<40} {f'{recorded} reviews':<20} ({i + 1}/{len(games)}) games")

    start_pc = time.perf_counter()
    print(f"Scraping Metacritic ({len(games)} games):")
    print(f"  games_file: {games_file}")
    print(f"  out_file: {out_file}")
    try:
        for i, game in enumerate(games):
            scrape_game(i, game)
    finally:
        # Always shut the browser down, even when a page fails to load or parse.
        driver.quit()
    end_pc = time.perf_counter()
    print(f"Total review count: {len(rows_list)}")
    print(f"Time elapsed: {end_pc - start_pc} seconds")

    df = pd.DataFrame(rows_list, columns=['site', 'user', 'review_target', 'review', 'score', 'max_score'])
    df.to_csv(f"{DATA_DIR}/{out_file}", index=False)

# Run the Metacritic scraper with its default arguments when this cell executes.
scrape_metacritic_reviews()

INIT module_path:  c:\Users\Alan\Desktop\Open_Source\BERT-TLSA-paper
Scraping Metacritic...
    minecraft                                200 reviews          (1/110) games
    a-short-hike                             110 reviews          (2/110) games
    shovel-knight                            88 reviews           (3/110) games
    celeste                                  200 reviews          (4/110) games
    hollow-knight                            150 reviews          (5/110) games
    doki-doki-literature-club-plus           42 reviews           (6/110) games
    undertale                                200 reviews          (7/110) games
    mortal-kombat-11                         200 reviews          (8/110) games
    guilty-gear-strive-                      38 reviews           (9/110) games
    guilty-gear-xrd-revelator-               21 reviews           (10/110) games
    injustice-2                              160 reviews          (11/110) games
    street-fighter-6      

## Steam

Scrapes user reviews from https://steamcommunity.com for each game listed in `steam_games.csv`.

In [None]:
import os
import sys
# Absolute path of the parent of the current working directory.
module_path = os.path.abspath(os.pardir)
print("INIT module_path: ", module_path)
# Register that directory on the import path only once, so re-running the
# cell does not keep appending duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Directory the Steam scrapers read their input from and write their CSVs to.
DATA_DIR = f"{module_path}/data"

from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import bs4
import time
import pandas as pd
from tqdm.notebook import tqdm

def scrape_steam_games(count: int = 100, file: str = "steam_games.csv"):
    """Collect game names and app ids from the Steam store search page.

    The search page is scrolled in headless Chrome until it stops growing or
    at least ``count`` result links (``a[data-ds-appid]``) are present. The
    first ``count`` results are written to ``DATA_DIR/file``.

    Args:
        count: Maximum number of games to record.
        file: Name of the CSV written inside ``DATA_DIR`` with the columns
            ``name, app_id``.
    """
    start_pc = time.perf_counter()

    print(f"Scraping top {count} Steam games...")
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get("https://store.steampowered.com/search?category1=998&supportedlang=english&ndl=1")

        # Scroll until the page stops growing or enough results are loaded.
        SCROLL_PAUSE_TIME = 0.5
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            loaded = driver.find_elements(By.CSS_SELECTOR, "a[data-ds-appid]")
            if new_height == last_height or len(loaded) >= count:
                break
            last_height = new_height

        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading or scrolling fails.
        driver.quit()

    # Use BeautifulSoup to do actual parsing, since it's way faster
    soup = bs4.BeautifulSoup(page_source, "html.parser")
    all_games = soup.find_all("a", {"data-ds-appid": True})[:count]

    rows_list: list[list] = []
    for game in all_games:
        title_elem = game.select_one("div > div > span.title")
        if title_elem is None:
            # Skip a result without a title instead of aborting the run.
            continue
        rows_list.append([title_elem.text, game["data-ds-appid"]])

    df = pd.DataFrame(rows_list, columns=['name', 'app_id'])
    df.to_csv(f"{DATA_DIR}/{file}", index=False)

    end_pc = time.perf_counter()
    print(f"Total games count: {len(rows_list)}")
    print(f"Time elapsed: {end_pc - start_pc} seconds")


def scrape_steam_reviews(reviews_per_game: int = 200, games_file: str = "steam_games.csv", out_file: str = "steam_reviews.csv"):
    """Scrape Steam community reviews for every game listed in ``games_file``.

    ``DATA_DIR/games_file`` must be a CSV with ``name`` and ``app_id`` columns.
    For each row the top-rated reviews page of the app is scrolled in headless
    Chrome until it stops growing or at least ``reviews_per_game`` review cards
    are present. Results are written to ``DATA_DIR/out_file`` after every game:
    the first game overwrites the file and writes the header, later games
    append.

    Args:
        reviews_per_game: Maximum number of reviews recorded per game.
        games_file: Name of the input CSV inside ``DATA_DIR``.
        out_file: Name of the CSV written inside ``DATA_DIR`` with the columns
            ``site, user, review_target, review, score, max_score`` where
            ``score`` is 1 for "Recommended" and 0 otherwise.
    """
    # Read the input before starting the browser so a missing file does not
    # leave a Chrome process behind.
    games_df = pd.read_csv(f"{DATA_DIR}/{games_file}")

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    total_review_count = 0

    def scrape_game(i: int, game: str, app_id: str):
        """Scrape one app; ``i`` is the 0-based position of the game in the run."""
        nonlocal total_review_count
        driver.get(f"https://steamcommunity.com/app/{app_id}/reviews/?p=1&browsefilter=toprated")

        try:
            filter_content = driver.find_element(By.CSS_SELECTOR, ".contentcheck_btns_ctn > button[data-panel]")
            filter_content.click()
            time.sleep(0.5)
        except NoSuchElementException:
            # The content-check button is only present on some pages; its
            # absence is expected and not an error.
            pass

        curr_rows_list = []
        # Only parse when we are still on a reviews URL after loading.
        if "/reviews" in driver.current_url:
            # Scroll until the page stops growing or enough cards are loaded.
            SCROLL_PAUSE_TIME = 0.5
            LOAD_INTERVAL = 0.2
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

                time.sleep(SCROLL_PAUSE_TIME)

                # Idle while the "loading more content" indicator is still shown:
                is_loading = True
                while is_loading:
                    time.sleep(LOAD_INTERVAL)
                    wait_elem = driver.find_element(By.CSS_SELECTOR, ".apphub_GetMoreContentWait")
                    is_loading = wait_elem.get_attribute("style").strip() != "display: none;"

                new_height = driver.execute_script("return document.body.scrollHeight")
                loaded = driver.find_elements(By.CSS_SELECTOR, ".apphub_Card")
                if new_height == last_height or len(loaded) >= reviews_per_game:
                    break
                last_height = new_height

            # Use BeautifulSoup to do actual parsing, since it's way faster
            soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
            for elem in soup.find_all("div", "apphub_Card")[:reviews_per_game]:
                user_elem = elem.select_one(".apphub_CardContentAuthorName")
                title_elem = elem.select_one(".reviewInfo > .title")
                text_content_elem = elem.select_one(".apphub_CardTextContent")
                if user_elem is None or title_elem is None or text_content_elem is None:
                    # Skip an incomplete card instead of aborting the whole run.
                    continue
                review_score = 1 if title_elem.text == "Recommended" else 0

                for s in text_content_elem.find_all('div'):
                    # Remove any non-review info, like early access and posted date
                    s.extract()
                review = text_content_elem.text

                curr_rows_list.append(["steam", user_elem.text.strip(), game, review.strip(), review_score, "1"])
        print(f"    {game:<40} {f'{len(curr_rows_list)} reviews':<20} ({i + 1}/{len(games_df)}) games")
        total_review_count += len(curr_rows_list)

        df = pd.DataFrame(curr_rows_list, columns=['site', 'user', 'review_target', 'review', 'score', 'max_score'])
        # The first game of the run overwrites the file and writes the header;
        # every later game appends rows only.
        is_first = i == 0
        df.to_csv(f"{DATA_DIR}/{out_file}", index=False, header=is_first, mode="w" if is_first else "a")

    start_pc = time.perf_counter()
    print(f"Scraping Steam ({len(games_df)} games):")
    print(f"  games_file: {games_file}")
    print(f"  out_file: {out_file}")
    try:
        # Use the row position, not the DataFrame index label, so the
        # header/overwrite decision does not depend on how the CSV is indexed.
        for position, (_, row) in enumerate(games_df.iterrows()):
            scrape_game(position, row["name"], row["app_id"])
    finally:
        # Always shut the browser down, even when a page fails to load or parse.
        driver.quit()

    end_pc = time.perf_counter()
    print(f"Total review count: {total_review_count}")
    print(f"Time elapsed: {end_pc - start_pc} seconds")

# Optional first step, left disabled: regenerates the games CSV that
# scrape_steam_reviews() reads by default ("steam_games.csv").
# scrape_steam_games(1000)
# Scrape reviews for the games in the existing CSV using the default arguments.
scrape_steam_reviews()

INIT module_path:  c:\Users\Alan\Desktop\Open_Source\BERT-TLSA-paper
Scraping Steam (110 games):
  games_file: steam_games.csv
  out_file: steam_reviews.csv
    Schedule I                               200 reviews          (1/110) games
    Counter-Strike 2                         200 reviews          (2/110) games
    R.E.P.O.                                 200 reviews          (3/110) games
    RuneScape: Dragonwilds                   200 reviews          (4/110) games
    Marvel Rivals                            200 reviews          (5/110) games
    The Last of Us™ Part II Remastered       200 reviews          (6/110) games
    Warframe                                 200 reviews          (7/110) games
    Baldur's Gate 3                          200 reviews          (8/110) games
    Last Epoch                               200 reviews          (9/110) games
    Tom Clancy's Rainbow Six® Siege          200 reviews          (10/110) games
    Tempest Rising                        

## MyAnimeList

Scrapes reviews for the shows listed in `myanimelist_shows.txt` from https://myanimelist.net, via the Jikan API (https://api.jikan.moe).

In [None]:
import os
import sys
# Absolute path of the parent of the current working directory.
module_path = os.path.abspath(os.pardir)
print("INIT module_path: ", module_path)
# Register that directory on the import path only once, so re-running the
# cell does not keep appending duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Directory the MyAnimeList scraper reads its show list from and writes its CSV to.
DATA_DIR = f"{module_path}/data"

import time
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

def scrape_myanimelist_reviews(shows_file: str = "myanimelist_shows.txt", out_file: str = "myanimelist_reviews.csv"):
    """Fetch MyAnimeList reviews for every show listed in ``shows_file``.

    Each non-blank line of ``DATA_DIR/shows_file`` is searched through the
    Jikan API; the first search hit is taken as the show and up to five pages
    of its reviews are downloaded. Every show is fetched exactly once, one
    after another, with a pause between shows.

    Args:
        shows_file: Name of the show list inside ``DATA_DIR`` (one title per line).
        out_file: Name of the CSV written inside ``DATA_DIR`` with the columns
            ``site, user, review_target, review, score, max_score``.
    """
    columns = ["site", "user", "review_target", "review", "score", "max_score"]
    review_list = []
    anime_watched = []  # shows for which at least one review page was fetched

    with open(f"{DATA_DIR}/{shows_file}", "r") as file:
        anime_list = [line.strip() for line in file if line.strip()]

    print("Scraping MyAnimeList shows...")

    def fetch_anime_reviews(anime):
        """Return ``(reviews, fetched)`` for a single show title.

        ``fetched`` is True when at least one review page was downloaded
        successfully, even if that page contained no reviews.
        """
        print(f"Searching for anime: {anime}")
        # Pass the title via `params` so spaces, '&', '#', ... are URL-encoded.
        response = requests.get("https://api.jikan.moe/v4/anime", params={"q": anime, "limit": 1})
        if response.status_code != 200:
            print(f"  Search failed for '{anime}' (HTTP {response.status_code}); skipping")
            return [], False
        results = response.json().get("data") or []
        if not results:
            print(f"  No search results for '{anime}'; skipping")
            return [], False
        anime_id = results[0]["mal_id"]

        reviews = []
        fetched = False
        for page in range(1, 6):
            reviews_url = f"https://api.jikan.moe/v4/anime/{anime_id}/reviews?page={page}"
            review_response = requests.get(reviews_url)
            if review_response.status_code != 200:
                break  # Stop if the request fails
            fetched = True
            for review in review_response.json()["data"]:
                reviews.append({
                    "site": "MyAnimeList",
                    "user": review["user"]["username"],
                    "review_target": anime,
                    "review": review["review"],
                    "score": review["score"],
                    "max_score": 10,
                })
        return reviews, fetched

    # Deliberately sequential. The earlier version ran the whole list once and
    # then submitted the whole-list worker again for every show, so each show
    # was scraped N+1 times and its reviews were duplicated in the output.
    for anime in anime_list:
        reviews, fetched = fetch_anime_reviews(anime)
        review_list.extend(reviews)
        if fetched and anime not in anime_watched:
            anime_watched.append(anime)
        time.sleep(2)  # Pause between shows to avoid "too many requests" errors

    # Explicit columns keep the CSV header stable even when nothing was fetched.
    df = pd.DataFrame(review_list, columns=columns)
    df.to_csv(f"{DATA_DIR}/{out_file}", index=False)

    print(df.head(380))
    print("\n Number of reviews: ", len(review_list))
    print("\n Number of Animes: ", len(anime_watched))

# Run the MyAnimeList scraper with its default arguments when this cell executes.
scrape_myanimelist_reviews()

INIT module_path:  c:\Users\Alan\Desktop\Open_Source\BERT-TLSA-paper
Searching for anime: Shingeki no Kyojin
Searching for anime: Spirited away


KeyboardInterrupt: 

## Rotten Tomatoes

Scrapes user reviews from https://www.rottentomatoes.com for each movie listed in `rotten_tomatoes_movies.txt`.

In [7]:
import os
import sys
# Absolute path of the parent of the current working directory.
module_path = os.path.abspath(os.pardir)
print("INIT module_path: ", module_path)
# Register that directory on the import path only once, so re-running the
# cell does not keep appending duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Directory the Rotten Tomatoes scraper reads its movie list from and writes its CSV to.
DATA_DIR = f"{module_path}/data"

import requests 
from bs4 import BeautifulSoup
import pandas as pd 

def scrape_rotten_tomatoes_reviews(movies_file: str = "rotten_tomatoes_movies.txt", out_file: str = "rotten_tomatoes_reviews.csv"):
    """Scrape audience reviews from Rotten Tomatoes for every movie in ``movies_file``.

    Each line of ``DATA_DIR/movies_file`` is used as the movie slug in
    ``https://www.rottentomatoes.com/m/<slug>/reviews?type=user``. At most the
    first 200 ``audience-review-row`` blocks of the returned page are recorded.
    The result is written to ``DATA_DIR/out_file`` with the columns
    ``site, user, review_target, review, score, max_score``.
    """
    with open(f"{DATA_DIR}/{movies_file}", "r") as handle:
        movies = handle.read().splitlines()

    def text_or(node, fallback):
        # Stripped text of a parsed element, or the fallback when it is missing.
        if node:
            return node.get_text(strip=True)
        return fallback

    records = []
    print("Scraping Rotten Tomatoes movies...")
    for movie in movies:
        page = requests.get(f"https://www.rottentomatoes.com/m/{movie}/reviews?type=user")
        parsed = BeautifulSoup(page.content, 'html.parser')
        rows = parsed.find_all('div', {'class': 'audience-review-row'})

        # Display name: slug with underscores turned into spaces, title-cased.
        display_name = movie.replace('_', ' ').title()

        scraped = 0
        for row in rows[:200]:
            body_node = row.find('p', class_="audience-reviews__review")
            stars_node = row.find('rating-stars-group')
            author_node = row.find('a', class_='audience-reviews__name')

            records.append({
                'site': "Rotten Tomatoes",
                'user': text_or(author_node, " "),
                'review_target': display_name,
                'review': text_or(body_node, "No review text"),
                # The rating is read from the element's `score` attribute.
                'score': stars_node.get('score') if stars_node else " ",
                'max_score': 5,
            })
            scraped += 1
        print(f"Scraped movie: {movie} ({scraped})")

    frame = pd.DataFrame(records)
    frame.to_csv(f"{DATA_DIR}/{out_file}", index=False)

    print(frame.head(100))
    print("Number of reviews: ", len(records))
    print("Number of movies: ", len(movies))

# Run the Rotten Tomatoes scraper with its default arguments when this cell executes.
scrape_rotten_tomatoes_reviews()

INIT module_path:  c:\Users\Alan\Desktop\Open_Source\BERT-TLSA-paper
Scraping Rotten Tomatoes movies...
Scraped movie: dune_2021 (20)
Scraped movie: the_matrix (0)
Scraped movie: inception (20)
Scraped movie: interstellar_2014 (20)
Scraped movie: the_dark_knight_2008 (0)
Scraped movie: the_shawshank_redemption (0)
Scraped movie: fight_club (20)
Scraped movie: pulp_fiction (20)
Scraped movie: forrest_gump (20)
Scraped movie: the_godfather (20)
Scraped movie: godfather_part_ii (20)
Scraped movie: the_dark_knight_rises (20)
Scraped movie: schindlers_list (20)
Scraped movie: the_lord_of_the_rings_the_return_of_the_king_2003 (0)
Scraped movie: x_men_days_of_future_past (20)
Scraped movie: x_men_apocalypse (20)
Scraped movie: toxic_avenger (20)
Scraped movie: big_hero_6 (20)
Scraped movie: last_days_in_vietnam (20)
Scraped movie: barbie (20)
Scraped movie: marvels_the_avengers (20)
Scraped movie: big (20)
Scraped movie: dil-chahta-hai (20)
Scraped movie: the_incredible_hulk (20)
Scraped movi