In [1]:
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
# --- Set Up ---
# Base URL of the forum topic; the landing URL and every page URL are built from it.
TOPIC_URL = 'https://forums.warframe.com/topic/1453670-dev-workshop-isleweaver-valkyr-rework'

# Start Driver
driver = webdriver.Chrome()
try:
    # Access Reference Page
    url = f'{TOPIC_URL}/#comment-13297757'
    driver.get(url)

    # Find max page number.
    # The numeric input matched below carries the highest page number in its
    # `max` attribute. Wait for it to appear instead of sleeping a fixed time.
    xpath = '//input[contains(@type, "number") and contains(@class, "ipsField_fullWidth")]'
    page_input = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )
    max_page = int(page_input.get_attribute('max'))
finally:
    # Always release the browser, even when the lookup above fails.
    driver.quit()

# List of pages with comments.
# Pages are numbered 1..max_page inclusive, so the upper bound is max_page + 1.
pages = [f'{TOPIC_URL}/page/{idx}/#comments' for idx in range(1, max_page + 1)]

# --- Extract data from every page ---
# Automatically open Inspector (F12).
# Original author's observation: without this flag the author names and the
# post dates could not be retrieved (cause not understood).
options = webdriver.ChromeOptions()
options.add_argument("--auto-open-devtools-for-tabs")

# One list per column; the four lists must always stay the same length.
data_dict = {'author': [], 'likes': [], 'date': [], 'comment': []}

for url in pages:
    # Start a new driver for every page to avoid the Cloudflare human check
    # (the original author notes this is still allowed by robots.txt).
    driver = webdriver.Chrome(options=options)
    try:
        # Access Reference Page
        driver.get(url)

        # Find comment boxes
        xpath = '//article[contains(@class, "cPost ipsBox ipsResponsive_pull")]'
        items = driver.find_elements(By.XPATH, xpath)

        # Extract data
        for item in items:
            # Comment's text without leading and trailing whitespaces
            text_list = [paragraph.text for paragraph in item.find_elements(By.TAG_NAME, 'p')]
            comment = ' '.join(text_list).strip()

            # Author and date from authorPane class
            authorPane = item.find_element(By.CLASS_NAME, 'cAuthorPane_content')
            name = authorPane.find_element(By.TAG_NAME, 'span').text
            date = authorPane.find_element(By.TAG_NAME, 'time').text

            # Likes
            try:
                # Locate Like Counter
                likeCountButton = item.find_element(By.CLASS_NAME, 'ipsReact_reactCount')
                likes = likeCountButton.find_element(By.TAG_NAME, 'span').text
            except NoSuchElementException:
                # No counter found. Only this case is treated as "zero likes";
                # any other error (or Ctrl-C) propagates instead of being hidden.
                # NOTE(review): the value is a string when the counter exists
                # and the int 0 otherwise -- confirm downstream code handles both.
                likes = 0

            # Record the post only once every field was read, so a failure
            # half-way through a post never leaves columns of unequal length.
            data_dict['author'].append(name)
            data_dict['likes'].append(likes)
            data_dict['date'].append(date)
            data_dict['comment'].append(comment)
    finally:
        # Close the browser even if this page failed or the run was interrupted.
        driver.quit()

    # Pause between pages before opening the next browser.
    time.sleep(5)


KeyboardInterrupt: 

In [4]:
import pickle
from pathlib import Path

import pandas as pd

# Transform to dataframe (one column per key of data_dict)
df = pd.DataFrame.from_dict(data_dict)

# Store data.
# Create the output directory first; open(..., "wb") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
output_path = Path("data/forum_posts.pkl")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "wb") as f:
    pickle.dump(df, f)

# Display results
df.head()

Unnamed: 0,author,likes,date,comment
0,[DE]Sam,102,May 9,"VALKYR\nTenno, A berserker’s fury fills the ai..."
1,oswarlan,352,May 10,"Honestly, losing the invulnerability on Hyster..."
2,StageNameFrank,49,May 10,"finally, valkyr's slide attack isn't the only ..."
3,18K-,10,May 10,Love it!
4,dice,18,May 10,Valkyr's Warcry is now recastable. Hooray!


In [5]:
# Show the value currently bound to `url`. After the scraping loop has run,
# this is the loop variable, i.e. the last page the loop started processing.
url

'https://forums.warframe.com/topic/1453670-dev-workshop-isleweaver-valkyr-rework/page/99/#comments'