# Goal is to create parallel corpus for translation model training. Data is scraped from Steam store. 

# Function for getting list of games (appids)

In [1]:
import requests
from bs4 import BeautifulSoup

# Define the URL for the Steam search page
url = "https://store.steampowered.com/search/?ndl=1"

# List to store the retrieved values
data_ds_appids = []

# Parameters for the search request
params = {
    "query": "",
    "start": 0,
    "count": 25
}

# Loop through the desired number of pages
for page in range(2):
    # Update the 'start' parameter based on the current page
    params["start"] = 25 * page

    # Send a GET request to the Steam search page with the specified parameters
    data = requests.get(url, params=params)

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(data.content, "html.parser")

    # Find the <div> element that contains the search results
    game_appid = soup.find('div', {'class': 'search_results', 'id': 'search_results'})

    # Find all <a> elements within the search results
    game_appid_elements = game_appid.find_all('a')

    # Loop through each <a> element
    for element in game_appid_elements:
        # Retrieve the value of the 'data-ds-appid' attribute
        data_ds_appid = element.get('data-ds-appid')
        
        # Check if the attribute exists
        if data_ds_appid is not None:
            # Add the value to the list
            data_ds_appids.append(data_ds_appid)


# Function for getting game description's language variations

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import html2text

# Regular expression pattern to match markdown image syntax
p = re.compile(r'!\[\]\(.*?\)')

def scrape_steam_content(language, id_appid):
    # Construct the URL for the Steam app page based on the app ID
    url = 'https://store.steampowered.com/app/' + str(id_appid)

    # Specify the Accept-Language header for the desired language
    headers = {'Accept-Language': language}

    # Send a GET request to the app page with the specified headers
    response = requests.get(url, headers=headers)
    #print(url)

    if response.status_code == 200:
        # Create a BeautifulSoup object to parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the <div> element that contains the game description
        game_description = soup.find('div', {'class': 'game_area_description', 'id': 'game_area_description'})

        if game_description:
            # Convert HTML to plain text using html2text library
            h = html2text.HTML2Text()
            h.body_width = 0
            h.ignore_links = True
            h.ignore_emphasis = True
            plain_text = h.handle(str(game_description))

            # Remove markdown image syntax
            plain_text = p.sub(r'', plain_text)

            # Replace markdown header syntax
            plain_text = plain_text.replace(u"# ", "")
            plain_text = plain_text.replace(u"##", "")

            # Split the text into lines
            lines = plain_text.split('\n')

            # Remove the first line (typically the game's title)
            lines = lines[1:]

            modified_lines = []
            prev_line_empty = False

            for i in range(len(lines)):
                line = lines[i].strip()
                if line != '':
                    modified_lines.append(line)

                    # Add a period after an empty line followed by a capitalized line
                    if prev_line_empty and i < len(lines) - 1 and lines[i + 1].strip() != '' and lines[i + 1].strip()[0].isupper():
                        modified_lines.append('.')
                    prev_line_empty = False
                else:
                    prev_line_empty = True

            modified_text = ' '.join(modified_lines)

            # Remove spaces before periods
            modified_text = modified_text.replace(u" .", ".")

            # print(modified_text)

            return modified_text
        else:
            print('Game description not found.')
    else:
        print('Failed to scrape content. Status Code:', response.status_code)


# Scraping the content into DataFrame

In [3]:
from langdetect import detect
import pandas as pd

# List of languages to scrape content in
languages = ['cs', 'en']  # Add more languages here if desired - 'es', 'fr', 'de', 'it'

# Create an empty DataFrame to store the scraped content
scraped_content = pd.DataFrame(columns=['game_appid', 'language', 'content'])

# Loop through each app ID
for appid_r in data_ds_appids:
    print('Scraping content of game', appid_r)

    # Loop through each language
    for language in languages:
        # Scrape the content for the current app ID and language
        temp_scrape = scrape_steam_content(language, appid_r)

        if temp_scrape == '':
            print('No content')
            break
        elif detect(temp_scrape) == language:
            print(language)
            # Append the scraped content to the DataFrame
            scraped_content = pd.concat(
                [scraped_content, pd.DataFrame({'game_appid': appid_r, 'language': language, 'content': temp_scrape}, index=[0])]   
            )
        else:
            print('No ',language,' version')
            break


Scraping content of game 730
cs
en
Scraping content of game 671860
cs
en
Scraping content of game 1551360
cs
en
Scraping content of game 1172470
cs
en
Scraping content of game 518790
cs
en
Scraping content of game 271590
cs
en
Scraping content of game 2108330
cs
en
Scraping content of game 281990
No  cs  version
Scraping content of game 381210
No  cs  version
Scraping content of game 236390
cs
en
Scraping content of game 548430
No  cs  version
Scraping content of game 1150440
No  cs  version
Scraping content of game 570
cs
en
Scraping content of game 252490
No  cs  version
Scraping content of game 1938090
No  cs  version
Scraping content of game 703080
No  cs  version
Scraping content of game 1599340
No  cs  version
Scraping content of game 1091500
cs
en
Scraping content of game 306130
cs
en
Scraping content of game 294100
No  cs  version
Scraping content of game 916440
No  cs  version
Scraping content of game 359550
No  cs  version
Scraping content of game 990080
No  cs  version
Scrap

In [4]:
# Note
# remove non-character symbols ?
    # * ■ ⭐

# Output the results

In [5]:
# Pivot the table
pivoted_content = scraped_content.pivot(index='game_appid', columns='language', values='content').reset_index()
pivoted_content

language,game_appid,cs,en
0,1091500,Cyberpunk 2077 je akční dobrodružné RPG v otev...,"Cyberpunk 2077 is an open-world, action-advent..."
1,1172470,"Vítězte stylově ve hře Apex Legends, bezplatné...","Conquer with character in Apex Legends, a free..."
2,1222670,Dejte průchod své představivosti a vytvořte je...,Unleash your imagination and create a unique w...
3,1245620,"NOVÉ FANTASY AKČNÍ RPG.. Povstaň, Poskvrněný, ...","THE NEW FANTASY ACTION RPG.. Rise, Tarnished, ..."
4,1248130,"Staň se moderním farmářem! Zemědělství, chov z...",Take on the role of a modern farmer! Agricultu...
5,1551360,Tvoje ultimátní dobrodružství Horizonu čeká! P...,Your Ultimate Horizon Adventure awaits! Explor...
6,2108330,"Brzděte jako poslední v EA SPORTS™ F1® 23, ofi...","Be the last to brake in EA SPORTS™ F1® 23, the..."
7,227300,"Procestuje Evropou jako král silnic, jako řidi...","Travel across Europe as king of the road, a tr..."
8,236390,"War Thunder je obrovská free-to-play, cross-pl...",War Thunder is the most comprehensive free-to-...
9,271590,"Když se mladý zlodějíček, bankovní lupič na od...","When a young street hustler, a retired bank ro..."


In [6]:
# Export DataFrame to CSV file
pivoted_content.to_csv('pivoted_content.csv', index=False)

# Export languages (en, cs) to separate text files
pivoted_content['cs'].to_csv('cs_content.txt', index=False, header=False)
pivoted_content['en'].to_csv('en_content.txt', index=False, header=False)