Import libraries


In [132]:
import requests
from bs4 import BeautifulSoup
import re
import os
import csv

Get the HTML from a URL

In [133]:
def get_html(url: str, timeout: float = 30) -> str:
    """Fetch a page and return its body as text.

    An error status is only reported on stdout, not raised: whatever body the
    server sent is still returned to the caller.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, a stalled connection would block the scrape indefinitely.

    Returns:
        The response body decoded as text.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        print(f"Code: {response.status_code}, url: {url}")
    return response.text

In [134]:
# Smoke test: fetch one search page filtered by tag 492 and display the parsed document.
url = "https://store.steampowered.com/search/?tags=492"
BeautifulSoup(get_html(url), "html.parser")

<!DOCTYPE html>

<html class="responsive" lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="#171a21" name="theme-color"/>
<title>Steam Search</title>
<link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="https://store.akamai.steamstatic.com/public/shared/css/motiva_sans.css?v=Rc2hpzg2Ex3T&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://store.akamai.steamstatic.com/public/shared/css/shared_global.css?v=wy0wYJxBlt-2&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://store.akamai.steamstatic.com/public/shared/css/buttons.css?v=CrrkDubPqLcq&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://store.akamai.steamstatic.com/public/css/v6/store.css?v=sNEfGU2Bn0SL&amp;l=english" rel="stylesheet" type="text/css"/>
<link href="https://store.akamai.steamstatic.com/public/css/v6/browse.css?v=wWw5t

Data to be extracted:
- Game's id
- Title
- Release date
- Overall review
- Positive Review Percentage
- Number of reviews
- Tags

In [135]:
def write_csv(data: dict, fname: str):
    """Append one game record to a CSV file.

    The header row is written first when the file does not exist yet or exists
    but is empty, so the output always starts with the column names.

    Args:
        data: Record keyed by the column names in ``fields``. Missing keys are
            written as empty cells; keys outside ``fields`` make
            ``csv.DictWriter`` raise ``ValueError``.
        fname: Path of the CSV file to append to.
    """
    fields = ["id", "title", "released", "overall", "positive%", "reviews", "tags"]
    needs_header = not os.path.isfile(fname) or os.path.getsize(fname) == 0

    # newline="" lets the csv module control line endings (otherwise Windows
    # gets a blank line after every row); utf-8 keeps non-ASCII titles writable
    # regardless of the platform's default encoding.
    with open(fname, "a", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        if needs_header:
            writer.writeheader()
        writer.writerow(data)

Data extraction:

In [136]:
def get_steam_games(html: str) -> list:
    """Return every ``<a>`` element in ``html`` that links to a Steam app page.

    Args:
        html: Markup of a Steam search results page.

    Returns:
        The ``find_all`` result: the anchors whose ``href`` starts with
        ``https://store.steampowered.com/app/``. Other links are left out.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Dots are escaped so they only match a literal "." in the host name.
    app_link = re.compile(r"^https://store\.steampowered\.com/app/")
    return soup.find_all("a", href=app_link)

In [137]:
def get_id(game):
    """Return the app id stored in a search result's ``data-ds-appid`` attribute.

    Args:
        game: A search-result element exposing ``.get(attribute_name)``.

    Returns:
        The attribute value, or ``""`` when the attribute is absent or ``game``
        has no usable ``.get`` (for example when it is ``None``).
    """
    try:
        app_id = game.get("data-ds-appid")
    except (AttributeError, TypeError):
        # Narrow on purpose: a bare except would also swallow KeyboardInterrupt.
        return ""
    # .get() signals a missing attribute with None; use the same "" fallback.
    return "" if app_id is None else app_id

In [138]:
def get_review_data(game):
    """Return the overall review label, positive-review share and review count.

    The values are parsed from the ``data-tooltip-html`` attribute of the span
    inside the result's review-score ``div``. The tooltip is split on ``<br>``
    and ``%``: the first piece is the label, the digits of the second are the
    percentage, and the digits of the third are the review count (separators
    such as commas are dropped).

    Args:
        game: A search-result element exposing ``.find(name, class_=...)``.

    Returns:
        ``(overall_review, positive_share, number_of_reviews)`` where
        ``positive_share`` is a fraction (percentage / 100) and
        ``number_of_reviews`` an int. When the review block is missing or its
        tooltip does not have the expected three pieces, ``("", "", "")``.
    """
    no_data = ("", "", "")
    try:
        tooltip = game.find(
            "div", class_="col search_reviewscore responsive_secondrow"
        ).span.get("data-tooltip-html")
        # Parsing sits inside the try as well: a missing attribute (None) or a
        # tooltip without a percentage must give blanks, not abort the scrape.
        pieces = re.split("<br>|%", tooltip)
        overall_review = pieces[0]
        positive_review_percentage = int("".join(re.findall(r"\d+", pieces[1]))) / 100
        number_of_reviews = int("".join(re.findall(r"\d+", pieces[2])))
    except (AttributeError, TypeError, IndexError, ValueError):
        return no_data
    return overall_review, positive_review_percentage, number_of_reviews

In [139]:
# Preview: print the id and parsed review data of the first 10 results of a tag-filtered search.
url = "https://store.steampowered.com/search/results/?query=&start=0&count=100&dynamic_data=&tags=492,4191,1684,4106,4305,597"
for i, game in enumerate(get_steam_games(get_html(url))):
    # Stop after ten rows to keep the cell output short.
    if i >= 10:
        break
    print(i, get_id(game), get_review_data(game))

0 2059660 ('Overwhelmingly Positive', 0.97, 508)
1 1518220 ('Very Positive', 0.85, 132)
2 1122720 ('Overwhelmingly Positive', 0.97, 7001)
3 1015890 ('Mostly Positive', 0.75, 418)
4 1995330 ('Positive', 0.87, 16)
5 1509750 ('Positive', 1.0, 11)
6 2336290 ('Positive', 0.83, 24)
7 2305490 ('Positive', 0.91, 48)
8 2392040 ('Very Positive', 0.8, 113)
9 1601850 ('Very Positive', 0.81, 86)


In [140]:
def get_hover_data(id):
    """Return a game's title, release date and user tags from its hover page.

    The data comes from ``https://store.steampowered.com/apphoverpublic/{id}``.
    Each field is looked up independently; a field whose element is missing is
    returned as ``""`` and the page URL is printed so it can be checked by hand.

    Args:
        id: Steam app id, interpolated into the hover-page URL.

    Returns:
        ``(title, released, tags)``. ``released`` is the text after the last
        ``":"`` in the release span, and ``tags`` is the text of every
        ``app_tag`` div joined with ``", "`` (``""`` when there are none).
    """
    url = f"https://store.steampowered.com/apphoverpublic/{id}"
    print("Getting data from " + url)
    soup = BeautifulSoup(get_html(url), "html.parser")

    # Explicit None checks instead of bare excepts: a missing element is the
    # expected failure here, and Ctrl-C must still be able to stop a long scrape.

    # Get game's title
    title_node = soup.find("h4", class_="hover_title")
    if title_node is None:
        title = ""
        print(url)
    else:
        title = title_node.text.strip()

    # Get game's released date
    release_div = soup.find("div", class_="hover_release")
    release_span = release_div.span if release_div is not None else None
    if release_span is None:
        released = ""
        print(url)
    else:
        released = release_span.text.split(":")[-1].strip()

    # Get user tags (find_all gives an empty result, not an error, when none exist)
    tags = ", ".join(tag.text for tag in soup.find_all("div", class_="app_tag"))

    return title, released, tags

In [141]:
def scraper(to_file: str, start: int = 0, count: int = 100, limit: int|None = None, tags: str = ""):
    all_games = []
    run_scrapper = True
    while run_scrapper:
        if limit is not None and start + count >= limit:
            if start + count >= limit:
                count = limit - start
                run_scrapper = False
        url = f"https://store.steampowered.com/search/results/?query=&start={start}&count={count}&tags={tags}"
        print(url)
        games = get_steam_games(get_html(url))
        if games:
            all_games.extend(games)
            start += count
        else:
            run_scrapper = False

    print(f"Scraper found {len(all_games)} games")
    for game in all_games:
        id = get_id(game)
        overall_review, poitive_review_percantage, number_of_reviews = get_review_data(game)
        title, released, tags = get_hover_data(id)
        write_csv({
            "id": id,
            "title": title,
            "released": released,
            "overall": overall_review,
            "positive%": poitive_review_percantage,
            "reviews": number_of_reviews,
            "tags": tags
            }, fname=to_file)

Get reviews

In [144]:
# Run the scrape with no tag filter, stopping at search offset 1,000, appending rows to result1.csv.
scraper(to_file= "result1.csv", 
        tags="",
        limit=1_000)
# Tag list used by the earlier preview cell; as the last expression it is only echoed as the cell output.
"492,4191,1684,4106,4305,597"

https://store.steampowered.com/search/results/?query=&start=0&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=100&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=200&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=300&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=400&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=500&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=600&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=700&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=800&count=100&tags=
https://store.steampowered.com/search/results/?query=&start=900&count=100&tags=
Scraper found 982 games
Getting data from https://store.steampowered.com/apphoverpublic/730
Getting data from https://store.steampowered.com/apphoverpublic/553850
Getting data from https://store.steampo

'492,4191,1684,4106,4305,597'

https://store.steampowered.com/apphoverpublic/892970


In [153]:
import requests


def get_reviews(appid, params=None, timeout=30):
    """Request one page of user reviews for an app and return the decoded JSON.

    Args:
        appid: Steam app id; a string or an integer, appended to the
            ``appreviews`` URL.
        params: Query parameters for the request. Defaults to a fresh
            ``{"json": 1}`` on every call, so no dict is shared between calls.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection cannot block forever.

    Returns:
        The response body parsed by ``response.json()``.
    """
    if params is None:
        params = {"json": 1}
    url = "https://store.steampowered.com/appreviews/"
    response = requests.get(
        url=f"{url}{appid}", params=params, timeout=timeout
    )
    return response.json()


def get_n_reviews(appid, n=100):
    """Collect up to ``n`` reviews for ``appid`` by following the API cursor.

    Pages of at most 100 reviews are requested through ``get_reviews``; each
    response's ``cursor`` is sent back with the next request. Paging ends once
    ``n`` reviews have been asked for or a page holds fewer than 100 reviews.

    Args:
        appid: Steam app id, forwarded unchanged to ``get_reviews``.
        n: Maximum number of reviews to request.

    Returns:
        A list with the ``reviews`` entries of every fetched page, in order.
    """
    query = dict(
        json=1,
        filter="all",
        language="english",
        day_range=9223372036854775807,  # 2**63 - 1
        review_type="all",
        purchase_type="all",
    )
    collected = []
    next_cursor = "*"  # cursor value sent with the first request
    remaining = n

    while remaining > 0:
        query["cursor"] = next_cursor.encode()
        query["num_per_page"] = min(100, remaining)
        remaining -= 100

        page = get_reviews(appid, query)
        next_cursor = page["cursor"]
        batch = page["reviews"]
        collected += batch

        # A page shorter than 100 ends the paging.
        if len(batch) < 100:
            break

    return collected

In [156]:
# Sanity check: count the reviews returned for one app with the default n=100.
len(get_n_reviews("591680"))

100