<h2 style = "font-family: Serif; font-size: 35px; font-style: normal; letter-spacing: 3px; background-color: #f6f6f6; color: #0D0D0F; border-radius: 100px 100px; text-align: center;"> 2.1 WEB SCRAPING TO OBTAIN DATA </h2>

This file contains the code for scraping Steam reviews of the game [**Black Myth: Wukong**](https://steamcommunity.com/app/2358720/reviews/). It is **Section 2.1** of the main file `end-to-end-sentiment-analysis.ipynb`.

# Import Libraries

In [None]:
# Import Modules for Web Scraping
import selenium
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time

# Information about the Game & Steam Review Page

In [None]:
# Target page: the Steam community review listing of a single game

# Address template of the review page; the "{}" slot receives the Steam app id
url_template = "https://steamcommunity.com/app/{}/reviews/?p=1&browsefilter=mostrecent&filterLanguage=english"

# Steam app id of the chosen game (replace it to scrape another game's reviews)
game_id = 2358720

# Fill the app id into the template to obtain the address the browser will open
url = str.format(url_template, game_id)

# Show the final address so it can be checked by eye before scraping
print(url)

# Setup

In [None]:
# RUN THIS FIRST TO SETUP

# Location of the msedgedriver executable
webdriver_path = "___.exe" # Insert your own webdriver path into '___'

# Language passed to the browser through the --lang switch
language = "en-US"

# Browser options for Edge
options = Options()
options.use_chromium = True  # Ensures the use of Chromium-based Edge
options.add_argument(f"--lang={language}")

# Driver service that launches the executable given above
service = Service(executable_path=webdriver_path)

# Start Edge with the prepared service and options
driver = webdriver.Edge(service=service, options=options)

# Use the full screen for the browser window, then open the review page
driver.maximize_window()
driver.get(url)

In [None]:
# OPTIONAL: RUN THIS ONLY TO DISCARD THE BROWSER OPENED BY THE SETUP CELL
# NOTE: the scraping cell further down still drives the page through `driver`
# and calls driver.quit() itself in its `finally` block, so do not run this
# cell before scraping; re-run the setup cell if the browser was closed here.

# Close the browser and end the WebDriver session
driver.quit()

# Scrape Reviews (Data)

In [None]:
# DEFINE FUNCTIONS TO SCRAPE

# Define functions to scrape data (reviews) from the review page
# Define a function to get the current scroll position
def get_current_scroll_position(driver):
    '''
    Return the current vertical scroll offset of the page.

    The value is whatever the browser reports for `window.pageYOffset`
    when the JavaScript snippet is executed through `driver`.
    '''
    script = "return window.pageYOffset;"
    offset = driver.execute_script(script)
    return offset

# Define a function to scroll the review page to the bottom
def scroll_to_bottom(driver, pause=1):
    '''
    Scroll the review page to the bottom, then wait briefly.

    Parameters:
        driver: object exposing `execute_script` (the Selenium WebDriver).
        pause: number of seconds to sleep after scrolling. Defaults to 1,
            which is the delay this function always used; pass a larger
            value on slow connections or 0 to skip the wait.

    Returns:
        None
    '''
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Give the page time to react to the scroll before the caller inspects it
    if pause > 0:
        time.sleep(pause)

# Define a function to get the user ID associated with the reviews
def get_steam_id(card):
    '''
    Get the User ID associated with a review card.

    The ID is taken from the last path segment of the profile link found
    inside the card's "apphub_friend_block" element.

    Parameters:
        card: the Selenium element of one review card.

    Returns:
        The last non-empty path segment of the profile URL, as a string.
    '''
    profile_link = card.find_element(By.XPATH, './/div[@class="apphub_friend_block"]/div/a[2]')
    profile_url = profile_link.get_attribute('href')

    # Drop any trailing slash before splitting so the last segment is always
    # the ID. Indexing [-2] was only correct when the URL ended with "/";
    # without it every user collapsed to the same segment before the ID.
    steam_id = profile_url.rstrip('/').split('/')[-1]
    return steam_id

# Define a function to scrape the data (review)
def scrape_review_data(card):
    '''
    Scrape the data (review) from one review card.

    Parameters:
        card: the Selenium element of one review card.

    Returns:
        A tuple (review_content, thumb_text, review_length, play_hours,
        date_posted):
            review_content - text of the card's content block with the
                             posted-date text and all newlines removed
            thumb_text     - text of the 2nd div inside "reviewInfo"
                             (Recommended / Not Recommended)
            review_length  - number of characters in review_content,
                             not counting spaces
            play_hours     - text of the 3rd div inside "reviewInfo"
            date_posted    - text of the "date_posted" div
    '''
    # Find the date posted of the review
    date_posted_element = card.find_element(By.XPATH, './/div[@class="apphub_CardTextContent"]/div[@class="date_posted"]')
    date_posted = date_posted_element.text.strip()

    # The content block's text also contains the posted date, so remove it.
    # Only the first occurrence is removed: the same text appearing later
    # inside the review itself belongs to the review and must be kept.
    card_text_content_element = card.find_element(By.CLASS_NAME, "apphub_CardTextContent")
    review_content = card_text_content_element.text.strip()
    review_content = review_content.replace(date_posted, "", 1)

    # NOTE: newlines are removed without inserting a space, so words on
    # adjacent lines end up joined together
    review_content = review_content.replace("\n", "")

    # Find the length of the review (spaces are not counted)
    review_length = len(review_content.replace(" ", ""))

    # Thumbs up (Recommended) OR thumbs down (NOT Recommended)
    thumb_text = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[2]').text

    # Number of hours played
    play_hours = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[3]').text

    return review_content, thumb_text, review_length, play_hours, date_posted

In [None]:
# START SCRAPING

# Retrieve reviews from the review page
# Initialize and set variables
reviews = []            # One tuple per review, as returned by scrape_review_data
steam_ids_set = set()   # IDs of users whose review was already stored (avoids duplicates)
max_scroll_attempts = 5 # Scrolls without movement before we decide the page has ended

# Stop getting data (reviews) if we reach end of the review page
try:
    last_position = get_current_scroll_position(driver)

    # Loop in which we collect the reviews
    running = True
    while running:
        cards = driver.find_elements(By.CLASS_NAME, 'apphub_Card') # Can be found by 'inspect'

        # Only the last 10 cards are inspected on each pass
        for card in cards[-10:]:
            steam_id = get_steam_id(card)

            # Skip the card if the review of this ID was already added
            if steam_id in steam_ids_set:
                continue

            # Remember the ID; without this the duplicate check above never fires
            steam_ids_set.add(steam_id)
            reviews.append(scrape_review_data(card))

        # Scroll down; if the position does not change for
        # max_scroll_attempts scrolls in a row, treat it as the end of the page
        for _ in range(max_scroll_attempts):
            scroll_to_bottom(driver)
            current_position = get_current_scroll_position(driver)

            if current_position != last_position:
                # The page moved, so go back and read the cards again
                last_position = current_position
                break

            # No movement: wait before trying to scroll again
            time.sleep(3)
        else:
            # Every attempt left the page where it was: stop the outer loop
            running = False

except Exception as e:
    # Report the problem; reviews collected so far stay in 'reviews'
    print(e)

finally:
    # Always close the browser, whether scraping finished or failed
    driver.quit()

# Create DataFrame & Save As CSV File

In [None]:
# INSERT SCRAPED DATA INTO A DATAFRAME & DROP DUPLICATED ROWS

# Column names, in the same order as the fields of each scraped review
column_names = ['ReviewText', 'RecommendedOrNot', 'ReviewLength', 'PlayHours', 'DatePosted']

# Build the DataFrame from the scraped reviews and drop duplicated rows
df = pd.DataFrame(reviews, columns=column_names).drop_duplicates()

In [None]:
# Save our dataset into a CSV file named 'wukong-dataset.csv'
# index=False leaves the DataFrame's row index out of the file
df.to_csv('wukong-dataset.csv', index=False)