## Introduction

To perform sentiment analysis on user reviews for the selected code repositories, we need a CSV file containing each user review and its corresponding game title. This notebook produces that file by scraping specific game review URLs on https://backloggd.com/.

In [1]:
# imports needed
import time
from pathlib import Path

import pandas as pd
import requests
import soupsieve as sv # library used especially for accessing specific elements in retrieved html responses
from bs4 import BeautifulSoup # library used for html web scraping!


# the following Selenium imports are used before creating soup objects since things like the review body texts are not loaded in with the html 
# but after (with javascript)
# these allow us to access the full page source with beautiful soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Configuration: URL pieces and the empty result table.
# A review page URL is assembled as BASE_URL + <game slug> + REVIEW_URL.
BASE_URL = 'https://www.backloggd.com/games/'
REVIEW_URL = 'reviews/'

# One URL slug (with trailing slash) per game whose reviews are scraped.
GAME_URLS = [
    'daggerfall-unity/',
    'luanti/',
    'openra/',
    'supertux/',
    'battle-for-wesnoth/',
    'mindustry/',
    'shattered-pixel-dungeon/',
    'triplea/',
    'openttd/',
    'nethack/',
]

num_games = len(GAME_URLS)

# Empty data frame that will hold one (game title, review text) pair per row.
cols = ['Title', 'Review']
df_games = pd.DataFrame(columns=cols)

In [3]:
# Selenium setup. NOTE - this cell was generated with the assistance of Google Gemini.

# Configure Chrome to run without a GUI (headless); every flag below is passed
# to the browser as a command-line argument.
options = Options()
for chrome_flag in ("--headless", "--disable-gpu", "--no-sandbox"):
    options.add_argument(chrome_flag)

# Start the WebDriver that the page-fetching helper uses.
driver = webdriver.Chrome(options=options)
# Note: adjust the initialization above if you are not using Chrome or need to
# specify a driver path.

def fetch_page_with_selenium(url, pause_seconds=5, max_scrolls=None, web_driver=None):
    """Load ``url`` in the browser, scroll until the page stops growing, and return its HTML.

    The page is scrolled to the bottom repeatedly; after each scroll the
    function waits ``pause_seconds`` and re-reads ``document.body.scrollHeight``.
    Scrolling stops as soon as the height is unchanged between two reads, or
    once ``max_scrolls`` scrolls have been performed.

    Args:
        url: Address of the page to load.
        pause_seconds: Seconds to wait after the initial load and after every
            scroll so that JavaScript-loaded content can appear. Defaults to 5.
        max_scrolls: Upper bound on the number of scrolls. ``None`` (the
            default) means no bound, so a page that keeps growing is scrolled
            indefinitely.
        web_driver: Selenium WebDriver to use. ``None`` (the default) falls
            back to the notebook-level ``driver``.

    Returns:
        The browser's ``page_source`` (a string of HTML) after scrolling ends.
    """
    browser = driver if web_driver is None else web_driver

    # 1. Navigate to the URL and give the initial content time to load.
    browser.get(url)
    time.sleep(pause_seconds)

    last_height = browser.execute_script("return document.body.scrollHeight")
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        # 2. Scroll to the bottom of the page.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1

        # 3. Wait for any newly triggered content to load.
        time.sleep(pause_seconds)

        # 4. An unchanged height means the last scroll revealed nothing new.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break

        last_height = new_height

    # The source now includes whatever content the scrolling caused to load.
    return browser.page_source

In [4]:
# Smoke test: confirm every review page answers a plain HTTP GET with status 200
# before starting the much slower Selenium scrape.
success = True
for game in GAME_URLS:
    url = BASE_URL + game + REVIEW_URL
    try:
        # requests has no default timeout; without one a single unresponsive
        # server would hang this cell indefinitely.
        response = requests.get(url, timeout=10)
    except requests.RequestException as error:
        # Connection failures and timeouts are reported like a bad status code
        # so the remaining URLs are still checked.
        print("error accessing site:", url, "-", error)
        success = False
        continue

    if response.status_code != 200:
        print("error accessing site:", url)
        success = False

if success:
    print("Accessing all sites succeeded.")

Accessing all sites succeeded.


In [5]:
# Isolate specific HTML elements with CSS selectors and extract their text.
# A selector is not guaranteed to match anything, so every lookup is checked for
# "no match" before its text is read, which avoids attribute errors.
# Note: the selectors below were chosen by inspecting the site's HTML hierarchy
# to find where the text we need lives.

def scrape_game_data(page_source):
    """Extract the game title and all review texts from a review page's HTML.

    Args:
        page_source: HTML of the page as a string.

    Returns:
        A two-item list ``[title, reviews]``. ``title`` is the text of the
        first element matching ``p.reviews-game-title a``, or
        ``"No title found"`` when nothing matches. ``reviews`` is a list with
        the stripped text of every ``.review-body .card-text`` element, or
        ``["No reviews found"]`` when nothing matches.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    # select_one returns None when the selector matches nothing; reading
    # ``.text`` from that would raise AttributeError.
    title_element = sv.select_one('p.reviews-game-title a', soup)
    title = title_element.text if title_element is not None else "No title found"

    all_reviews = [
        element.text.strip()
        for element in sv.select('.review-body .card-text', soup)
    ]
    if not all_reviews:
        all_reviews = ["No reviews found"]

    return [title, all_reviews]

In [6]:
# Collect the title and reviews from each game's Backloggd review page.
# Rows are gathered in a plain list and converted to a data frame once at the
# end: this avoids growing the frame row by row, and re-running the cell
# rebuilds df_games instead of appending duplicate rows to it.
review_rows = []

try:
    for game in GAME_URLS:
        url = BASE_URL + game + REVIEW_URL

        page_source = fetch_page_with_selenium(url)
        title, all_reviews = scrape_game_data(page_source)

        # One row per review, each paired with its game's title.
        review_rows.extend([title, review] for review in all_reviews)
finally:
    # Always close the Selenium driver, even if a page fails, to prevent cpu
    # slowdown and memory leakage from a leftover browser process.
    driver.quit()

df_games = pd.DataFrame(review_rows, columns=cols)

print("Finished loading data frame object with data.")

Finished loading data frame object with data.


In [7]:
# Convert the data frame to a CSV file. The output folder is created first
# because to_csv fails if the target directory does not exist yet (for example
# on a fresh checkout).
output_path = Path("output_data/backloggd_game_reviews.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_games.to_csv(output_path, index=False)
print("All Done! :3")

All Done! :3
