In [1]:
import requests
import time
import logging 
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import re

In [2]:
from pydantic import BaseModel, Field, HttpUrl, field_validator
from typing import Dict, List, Optional, Union

In [3]:
def get_date():
    """Return today's calendar date taken from the local system clock."""
    today = datetime.now().date()
    return today

In [4]:
class GameMetaData(BaseModel):
    """Minimal identifying record for a Steam application.

    Only ``appid`` and ``name`` are required. ``date_added`` is filled in by
    ``get_date`` when the caller does not supply it, and ``dne`` defaults to
    ``False``.
    """

    appid: int = Field(..., description="Steam Application ID")
    name: str = Field(..., description="game's name")
    # The original passed both `...` (required) and `default_factory`, which is
    # contradictory; only the factory is kept so the field is clearly optional.
    # NOTE(review): get_date() returns a `date`, while the annotation is
    # `datetime`; pydantic does not validate defaults unless validate_default
    # is enabled, so the stored default is a plain date -- confirm intent.
    date_added: Optional[datetime] = Field(
        default_factory=get_date, description="Date the appid was added"
    )
    dne: Optional[bool] = Field(
        False, description="Flag that the app id doesn't exist in the database"
    )

In [5]:
class GameMetaDataList(BaseModel):
    """Wrapper model that validates a sequence of ``GameMetaData`` records."""

    games_metadata: List[GameMetaData] = Field(
        ...,
        description="list of games metadata",
    )

In [6]:
class GameDetails(BaseModel):
    """Detailed statistics for one game.

    Instances are built from the per-app dictionaries collected by
    ``SteamSpyMetadataFetcher`` (see ``GameDetailsList``). Numeric values that
    arrive as strings (e.g. ``price='0'``) rely on pydantic's default lax
    coercion.
    """

    appid: int = Field(..., description="Steam Application ID")
    name: str = Field(..., description="game's name")
    developer: str = Field(..., description="comma separated list of the developers of the game")
    publisher: str = Field(..., description="comma separated list of the publishers of the game")
    score_rank: str = Field("", description="score rank of the game based on user reviews")
    positive: int = Field(..., description="number of positive reviews ")
    negative: int = Field(..., description="number of negative reviews ")
    # A stray trailing comma used to turn this default into a one-element tuple
    # holding the FieldInfo, which dropped the "required" marker and the
    # description. The field is now required as written.
    userscore: float = Field(..., description="User score of the game")
    owners: str = Field(..., description="owners of this application on Steam as a range.")
    average_forever: int = Field(..., description="average playtime since March 2009. In minutes")
    average_2weeks: int = Field(..., description="average playtime in the last two weeks. In minutes.")
    median_forever: int = Field(..., description="median playtime since March 2009. In minutes")
    median_2weeks: int = Field(..., description="median playtime in the last two weeks. In minutes")
    price: Optional[int] = Field(..., description="current US price in cents")
    initialprice: Optional[int] = Field(..., description="original US price in cents.")
    discount: Optional[str] = Field(..., description="current discount in percents.")
    # The default used to be "" which is not a valid int; 0 keeps the field
    # optional while matching the declared type.
    ccu: int = Field(0, description="peak CCU yesterday")
    languages: Optional[str] = Field(None, description="list of supported languages.")
    genre: Optional[str] = Field(None, description="list of genres.")
    tags: Optional[dict[str, int]] = Field(None, description="game's tags with votes in JSON array.")

In [7]:
class GameDetailsList(BaseModel):
    """Wrapper model that validates a sequence of ``GameDetails`` records."""

    games: List[GameDetails] = Field(
        ...,
        description="list of games",
    )

In [8]:
class SteamGameMetadata(BaseModel):
    """Store-page metadata for a single Steam application.

    Instances are built by ``SteamStoreMetadata.process_steam_data`` from the
    ``data`` section of a store ``appdetails`` response. Two "before"
    validators normalise inputs: ``required_age`` strings are reduced to their
    first run of digits, and a list-valued ``requirements`` is replaced by an
    empty dict.
    """

    type: str = Field(..., description="game's type")
    name: str = Field(..., description="game's name")
    appid: int = Field(..., description="Steam Application ID")
    required_age: Optional[Union[int, str]] = Field(..., description="Required age to play the game")
    is_free: bool = Field(..., description="is the game free")
    dlc: Optional[list[int]] = Field(..., description="list of dlc id's associated with the game")
    controller_support: Optional[str] = Field(..., description="Type of controller support for the game, if available")
    about_the_game: Optional[str] = Field(..., description="About the game")
    detailed_description: Optional[str] = Field(..., description="game's description")
    short_description: Optional[str] = Field(..., description="Short description of the game")
    supported_languages: Optional[str] = Field(..., description="Languages the game supports")
    reviews: Optional[str] = Field(..., description="Reviews or acclaim summary of the game")
    header_image: HttpUrl = Field(..., description="Url to the header image of the game ")
    capsule_image: HttpUrl = Field(..., description="Url to the thumbnail of the game")
    website: Optional[HttpUrl | str] = Field(..., description="The games website ")
    requirements: Optional[Dict] = Field(..., description="PC system requirements for the game")
    developers: Optional[List[str]] = Field(default=[], description="List of developers")
    publishers: Optional[List[str]] = Field(..., description="List of publishers ")
    price_overview: Optional[Dict] = Field(..., description="Price overview of the game")
    platform: Optional[dict] = Field(..., description="Indicates the platforms it is available on ")
    metacritic: Optional[int] = Field(..., description="Metacritic score of the game if there")
    categories: Optional[list] = Field(default=[], description="Categories of the game")
    genres: Optional[list] = Field(default=[], description="Genres the game belongs to")
    recommendations: int = Field(..., description="Recommendations numbers")
    achievements_number: int = Field(..., description="Total number of attainable achievements")
    release_date: Optional[str] = Field(..., description="Date for when the game will release ")
    coming_soon: bool = Field(..., description="Indicates if the game release is upcoming")

    @field_validator("required_age", mode="before")
    @classmethod
    def validate_required_age(cls, v):
        """Normalise ``required_age`` to an int (or ``None``).

        Strings are searched for their first run of digits (so ``"17+"``
        becomes ``17``). ``None`` is passed through because the field is
        declared ``Optional``; previously it was rejected here.

        Raises:
            ValueError: for a string without digits or any other non-int type.
        """
        if v is None:
            return v
        if isinstance(v, str):
            match = re.search(r"\d+", v)
            if match is None:
                raise ValueError(f"Invalid value for required age {v}")
            return int(match.group())
        if not isinstance(v, int):
            raise ValueError(f"Invalid value for required age {v}")
        return v

    @field_validator("requirements", mode="before")
    @classmethod
    def validate_requirements(cls, v):
        """Replace a list-valued ``requirements`` with an empty dict.

        Any non-list value is returned unchanged for normal validation.
        """
        if isinstance(v, list):
            return {}
        return v
        
class SteamGameMetadataList(BaseModel):
    """Wrapper model that validates a sequence of ``SteamGameMetadata`` records."""

    game_metadata: List[SteamGameMetadata] = Field(
        ...,
        description="list of games",
    )

In [9]:
# Base endpoints used by the API client classes in this notebook.
STEAMSPY_BASE_URL: str = "https://steamspy.com/api.php"
# Use HTTPS so store requests are not sent in clear text (was plain http://).
STEAM_BASE_SEARCH_URL: str = "https://store.steampowered.com"
STEAM_TOP_GAMES: str = "https://api.steampowered.com/ISteamChartsService/GetMostPlayedGames/v1/"

In [10]:
class ApiClient:
    """Small JSON-over-HTTP GET client with retries and capped exponential backoff.

    A single ``requests.Session`` is held per instance so that connections are
    reused across calls; subclasses call ``get_request`` to fetch data.
    """

    def __init__(self):
        self.session = requests.Session() #reuse TCP connections
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _retry_after_seconds(header_value, default):
        """Return the ``Retry-After`` delay in whole seconds, or ``default`` if unusable.

        Header values arrive as strings; only the integer-seconds form is
        understood. A missing header or the HTTP-date form yields ``default``.
        """
        try:
            seconds = int(header_value)
        except (TypeError, ValueError):
            return default
        return max(seconds, 0)

    def get_request(self, url:str,  parameters= None, max_retries = 5, wait_time =5, wait_time_multiplier = 4, timeout = 30):
        """Send a GET request with retries and exponential backoff.

        Args:
            url: Endpoint to request.
            parameters: Optional query-string parameters.
            max_retries: Maximum number of attempts before giving up.
            wait_time: Base delay in seconds for the backoff schedule.
            wait_time_multiplier: Factor applied to the delay after each attempt.
            timeout: Per-request timeout in seconds passed to the session.

        Returns:
            The decoded JSON body on HTTP 200; ``None`` on any other non-429
            status (no retry) or once all attempts are exhausted.
        """
        headers = {"User-Agent": "YourCustomUserAgent/1.0", "DNT": "1"}
        for attempt in range(1, max_retries + 1):
            server_delay = 0
            try:
                # Go through the session (not requests.get) so the pooled
                # connections created in __init__ are actually used.
                response = self.session.get(
                    url=url, headers=headers, params=parameters, timeout=timeout
                )
                if response.status_code == 200:
                    return response.json()
                if response.status_code != 429:
                    self.logger.error(f"Request failed with status {response.status_code}: {response.text}")
                    return None
                # 429: too many requests. The header is a string, so convert it
                # before using it as a sleep duration.
                server_delay = self._retry_after_seconds(
                    response.headers.get("Retry-After"), wait_time
                )
            except Exception as e:
                # Network errors and undecodable bodies are treated as retryable.
                self.logger.error(f"Request failed with status {e}")

            if attempt == max_retries:
                # No point sleeping when there is no further attempt.
                break
            backoff = min(wait_time * (wait_time_multiplier ** (attempt - 1)), 60)  # Cap sleep at 60 sec
            sleep_time = max(backoff, server_delay)
            self.logger.info(f"Retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)

        self.logger.error(f"Giving up on {url} after {max_retries} attempts")
        return None
    

In [11]:
class SteamSpytop1002Weeks(ApiClient):
    """Fetch SteamSpy's ``top100in2weeks`` listing."""

    def __init__(self):
        super().__init__()
        self.url = STEAMSPY_BASE_URL
        self.params = {'request':'top100in2weeks'}

    def run(self):
        """Request the listing and return the per-game dictionaries.

        Returns:
            The ``values()`` view of the decoded response. When the request
            fails an empty view is returned, so callers can iterate without a
            ``None`` check (previously this raised ``AttributeError``).
        """
        data = self.get_request(self.url, self.params)
        if not data:
            self.logger.error("Failed to fetch data from SteamSpy")
            return {}.values()

        return data.values()

In [12]:
# Fetch the SteamSpy top-100-in-two-weeks listing; `data` is reused by later cells.
client =SteamSpytop1002Weeks()
data = client.run()

In [13]:
# Inspect the raw result: a dict_values view of per-game dictionaries.
data

dict_values([{'appid': 570, 'name': 'Dota 2', 'developer': 'Valve', 'publisher': 'Valve', 'score_rank': '', 'positive': 2003648, 'negative': 453057, 'userscore': 0, 'owners': '200,000,000 .. 500,000,000', 'average_forever': 62824, 'average_2weeks': 792, 'median_forever': 823, 'median_2weeks': 1162, 'price': '0', 'initialprice': '0', 'discount': '0', 'ccu': 351407}, {'appid': 730, 'name': 'Counter-Strike: Global Offensive', 'developer': 'Valve', 'publisher': 'Valve', 'score_rank': '', 'positive': 7513340, 'negative': 1141217, 'userscore': 0, 'owners': '100,000,000 .. 200,000,000', 'average_forever': 37924, 'average_2weeks': 479, 'median_forever': 7765, 'median_2weeks': 398, 'price': '0', 'initialprice': '0', 'discount': '0', 'ccu': 964715}, {'appid': 578080, 'name': 'PUBG: BATTLEGROUNDS', 'developer': 'PUBG Corporation', 'publisher': 'KRAFTON, Inc.', 'score_rank': '', 'positive': 1493677, 'negative': 1026537, 'userscore': 0, 'owners': '50,000,000 .. 100,000,000', 'average_forever': 1487

In [14]:
# Print each game record on its own line for easier reading.
# NOTE: the loop variable `d` stays in the notebook namespace after this cell.
for d in data:
    print(d)

{'appid': 570, 'name': 'Dota 2', 'developer': 'Valve', 'publisher': 'Valve', 'score_rank': '', 'positive': 2003648, 'negative': 453057, 'userscore': 0, 'owners': '200,000,000 .. 500,000,000', 'average_forever': 62824, 'average_2weeks': 792, 'median_forever': 823, 'median_2weeks': 1162, 'price': '0', 'initialprice': '0', 'discount': '0', 'ccu': 351407}
{'appid': 730, 'name': 'Counter-Strike: Global Offensive', 'developer': 'Valve', 'publisher': 'Valve', 'score_rank': '', 'positive': 7513340, 'negative': 1141217, 'userscore': 0, 'owners': '100,000,000 .. 200,000,000', 'average_forever': 37924, 'average_2weeks': 479, 'median_forever': 7765, 'median_2weeks': 398, 'price': '0', 'initialprice': '0', 'discount': '0', 'ccu': 964715}
{'appid': 578080, 'name': 'PUBG: BATTLEGROUNDS', 'developer': 'PUBG Corporation', 'publisher': 'KRAFTON, Inc.', 'score_rank': '', 'positive': 1493677, 'negative': 1026537, 'userscore': 0, 'owners': '50,000,000 .. 100,000,000', 'average_forever': 14871, 'average_2we

In [15]:
# Validate the raw records; GameMetaData declares only appid/name (plus defaults).
games_metadata = GameMetaDataList(games_metadata=data)

In [16]:
# Collect the application ids that drive the detail fetchers below.
app_ids = [g.appid for g in games_metadata.games_metadata]

In [17]:
class SteamSpyMetadataFetcher(ApiClient):
    """Fetch SteamSpy ``appdetails`` for many app ids, in threaded batches."""

    def __init__(self, batch_size = 100, num_workers = 4 ):
        super().__init__()
        self.batch_size = batch_size
        self.url = STEAMSPY_BASE_URL
        self.num_workers = num_workers

    def fetch_metadata(self, app_id):
        """Fetch metadata for a single appid.

        Returns:
            The decoded response, or ``None`` when the request failed.
        """
        parameters = {'request':'appdetails', 'appid':app_id}
        data = self.get_request(self.url, parameters)
        if not data:
            # The original referenced an undefined name `appid` here, so any
            # failed fetch raised NameError and aborted the whole batch.
            self.logger.warning(f"Failed to fetch metadata for {app_id}")
            return None
        return data

    def process_batch(self, app_ids):
        """Fetch metadata for a batch of appIDS in parallel.

        Failed fetches (``None``) are dropped from the returned list.
        """
        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            results = list(executor.map(self.fetch_metadata, app_ids))
        return [result for result in results if result]

    def run(self, app_ids):
        """Fetch all ``app_ids`` in slices of ``batch_size``.

        Returns:
            A ``GameDetailsList`` built from every successfully fetched record.
        """
        all_data = []

        for i in range(0, len(app_ids), self.batch_size):
            batch = app_ids[i:i+self.batch_size]
            self.logger.info(f"Processing batch: {batch}")
            batch_data= self.process_batch(batch)
            if batch_data:
                all_data.extend(batch_data)

        return GameDetailsList(games=all_data)

In [18]:
# Fetch SteamSpy details for every collected app id.
# NOTE: this rebinds `client` and `data` from the earlier cells.
client = SteamSpyMetadataFetcher()
data = client.run(app_ids)

In [19]:
# Inspect the validated GameDetailsList returned by the fetcher.
data

appid=570 name='Dota 2' developer='Valve' publisher='Valve' score_rank='' positive=2003648 negative=453057 userscore=0.0 owners='200,000,000 .. 500,000,000' average_forever=62824 average_2weeks=792 median_forever=823 median_2weeks=1162 price=0 initialprice=0 discount='0' ccu=351407 languages='English, Bulgarian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese - Portugal, Portuguese - Brazil, Romanian, Russian, Simplified Chinese, Spanish - Spain, Swedish, Thai, Traditional Chinese, Turkish, Ukrainian, Spanish - Latin America, Vietnamese' genre='Action, Strategy, Free To Play' tags={'Free to Play': 59946, 'MOBA': 20165, 'Multiplayer': 15367, 'Strategy': 14253, 'e-sports': 11783, 'Team-Based': 10962, 'Competitive': 8287, 'Action': 7920, 'Online Co-Op': 7465, 'PvP': 6049, 'Difficult': 5349, 'Co-op': 4317, 'RTS': 4109, 'RPG': 3792, 'Tower Defense': 3777, 'Fantasy': 3750, 'Character Customization': 2931, 'Replay Value

In [76]:
class SteamStoreMetadata(ApiClient):
    """Fetch store ``appdetails`` records and convert them to ``SteamGameMetadata``."""

    def __init__(self, batch_size = 100, num_workers = 4 ):
        super().__init__()
        self.batch_size = batch_size
        self.url = STEAM_BASE_SEARCH_URL
        self.num_workers = num_workers

    def parser(self, text):
        """Strip markup from ``text`` and return newline-joined plain text.

        Returns ``None`` for empty or missing input.
        """
        if text:
            soup = BeautifulSoup(text, "lxml")
            plain_text = soup.get_text('\n', strip=True)
            return plain_text
        return None

    def process_steam_data(self, data:dict):
        """Map a raw store ``data`` dict onto ``SteamGameMetadata``.

        Returns:
            The validated model, or ``None`` when a required key is missing or
            the mapped values fail validation. Both cases are logged so that
            one bad record cannot abort a whole threaded batch.
        """
        try:
            fields = {
                'type':  data['type'],
                'name':  data['name'],
                'appid': data['steam_appid'],
                'required_age': data['required_age'],
                'is_free':data['is_free'],
                'dlc': data.get('dlc',[]),
                'controller_support': data.get('controller_support',None),
                'about_the_game': self.parser(data.get('about_the_game','')),
                'detailed_description': self.parser(data.get('detailed_description','')),
                'short_description': self.parser(data.get('short_description','')),
                'supported_languages':self.parser(data.get('supported_languages','')),
                'reviews': self.parser(data.get('reviews','')),
                'header_image': data['header_image'],
                'capsule_image': data['capsule_image'],
                'website': data.get('website',''),
                'requirements': data['pc_requirements'],
                'developers': data.get('developers', None),
                'publishers': data.get('publishers',None),
                'price_overview': data.get('price_overview', None),
                'platform': data['platforms'],
                'metacritic': data.get('metacritic',{}).get('score',0),
                'categories':data.get('categories',None),
                'genres': data.get('genres',None),
                'recommendations': data.get("recommendations", {}).get("total", 0),
                'achievements_number':data.get("achievements", {}).get("total", 0),
                'release_date': data['release_date']['date'],
                'coming_soon': data['release_date']['coming_soon'],
            }
            return SteamGameMetadata(**fields)

        except KeyError as ke:
            self.logger.error(f"Required key missing from Steam payload: {ke}")
        except ValueError as err:
            # pydantic's ValidationError is a ValueError subclass.
            self.logger.error(f"Steam payload failed validation: {err}")
        return None

    def fetch_metadata(self, app_id:int):
        """Fetch metadata for a single appid.

        Returns:
            A ``SteamGameMetadata`` whose ``appid`` equals ``app_id``, or
            ``None`` when the request failed, the store reported no success,
            the record could not be processed, or the returned record belongs
            to a different app id.
        """
        url = f"{self.url}/api/appdetails/"
        parameters = {'appids':app_id}
        payload = self.get_request(url, parameters)

        # The response is keyed by the app id as a string; guard every lookup
        # so an unexpected shape is logged instead of raising in a worker thread.
        entry = payload.get(str(app_id)) if isinstance(payload, dict) else None

        if isinstance(entry, dict) and entry.get('success') and 'data' in entry:
            record = self.process_steam_data(entry['data'])
            if record and record.appid == app_id:
                return record
            self.logger.warning(f"unsuccessful for pulling {app_id} data ")
            return None

        self.logger.warning(f"Failed to fetch metadata for {app_id}")
        return None

    def process_batch(self, app_ids):
        """Fetch metadata for a batch of appIDS in parallel.

        Failed fetches (``None``) are dropped from the returned list.
        """
        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            results = list(executor.map(self.fetch_metadata, app_ids))
        return [result for result in results if result]

    def run(self, app_ids):
        """Fetch all ``app_ids`` in slices of ``batch_size``.

        Returns:
            A list of the successfully built ``SteamGameMetadata`` objects.
        """
        all_data = []

        for i in range(0, len(app_ids), self.batch_size):
            batch = app_ids[i:i+self.batch_size]
            self.logger.info(f"Processing batch: {batch}")
            batch_data= self.process_batch(batch)
            if batch_data:
                all_data.extend(batch_data)
        return all_data

In [77]:
# Fetch Steam store metadata for every collected app id.
# NOTE: this rebinds `client` and `data`; `data` is now a list of SteamGameMetadata.
client = SteamStoreMetadata()
data = client.run(app_ids)

  soup = BeautifulSoup(text, "lxml")
unsuccessful for pulling 901583 data 
Failed to fetch metadata for 582010
unsuccessful for pulling 100 data 
Failed to fetch metadata for 377160


In [78]:
# Inspect the first store record that was fetched successfully.
data[0]

SteamGameMetadata(type='game', name='Dota 2', appid=570, required_age=0, is_free=True, dlc=[1241930, 652720], controller_support=None, about_the_game="The most-played game on Steam.\nEvery day, millions of players worldwide enter battle as one of over a hundred Dota heroes. And no matter if it's their 10th hour of play or 1,000th, there's always something new to discover. With regular updates that ensure a constant evolution of gameplay, features, and heroes, Dota 2 has truly taken on a life of its own.\nOne Battlefield. Infinite Possibilities.\nWhen it comes to diversity of heroes, abilities, and powerful items, Dota boasts an endless array—no two games are the same. Any hero can fill multiple roles, and there's an abundance of items to help meet the needs of each game. Dota doesn't provide limitations on how to play, it empowers you to express your own style.\nAll heroes are free.\nCompetitive balance is Dota's crown jewel, and to ensure everyone is playing on an even field, the core

In [74]:
# Spot-check one app id. fetch_metadata returns None when the returned record's
# appid differs from the requested one.
# NOTE(review): the recorded output suggests the store answered with appid 80
# for a request of 100 -- confirm.
client = SteamStoreMetadata()
d = client.fetch_metadata(100)

unsuccessful for pulling 100 data 


type='game' name='Counter-Strike: Condition Zero' appid=80 required_age=0 is_free=False dlc=[] controller_support=None about_the_game="With its extensive Tour of Duty campaign, a near-limitless number of skirmish modes, updates and new content for Counter-Strike's award-winning multiplayer game play, plus over 12 bonus single player missions, Counter-Strike: Condition Zero is a tremendous offering of single and multiplayer content." detailed_description="With its extensive Tour of Duty campaign, a near-limitless number of skirmish modes, updates and new content for Counter-Strike's award-winning multiplayer game play, plus over 12 bonus single player missions, Counter-Strike: Condition Zero is a tremendous offering of single and multiplayer content." short_description="With its extensive Tour of Duty campaign, a near-limitless number of skirmish modes, updates and new content for Counter-Strike's award-winning multiplayer game play, plus over 12 bonus single player missions, Counter-St

In [54]:
# NOTE(review): relies on `d` from another cell, and the execution counts are
# out of order (In [54] here vs In [74] above); re-run on a fresh kernel to
# confirm this still works.
d.developers

['Mediatonic']

In [55]:
class SteamTop100(ApiClient):
    """Fetch the most-played-games listing from the ``STEAM_TOP_GAMES`` endpoint."""

    def __init__(self):
        super().__init__()
        self.url = STEAM_TOP_GAMES

    def run(self):
        """Request the listing and return the values of the decoded response.

        Returns:
            The ``values()`` view of the response dict. When the request fails
            an empty view is returned, so callers can iterate without a
            ``None`` check (previously this raised ``AttributeError``).
        """
        data = self.get_request(self.url)
        if not data:
            # This endpoint is Steam's own API, not SteamSpy.
            self.logger.error("Failed to fetch data from Steam")
            return {}.values()

        return data.values()

In [56]:
# Fetch the most-played listing. NOTE: this rebinds `client` and `data` again.
client= SteamTop100()
data = client.run()

In [59]:
# Print the `rollup_date` of each top-level response entry
# (an epoch timestamp in the recorded output).
for d in data:
    print(d['rollup_date'])

1742428800


In [62]:
def get_utc_timestamp(timestamp):
    """Format a POSIX timestamp as a ``YYYY-MM-DD`` date string in UTC.

    Args:
        timestamp: Seconds since the Unix epoch.

    Returns:
        The UTC calendar date of ``timestamp`` as a string.
    """
    # Local import keeps this cell self-contained; utcfromtimestamp() is
    # deprecated since Python 3.12 in favour of an explicit UTC tzinfo.
    from datetime import timezone

    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d')

class SteamGameRank(BaseModel):
    """One entry of the most-played ranking, stamped with the scrape date."""

    appid: int = Field(
        ...,
        description="Steam Application ID",
    )
    rank: int = Field(
        ...,
        description="game's current rank",
    )
    last_week_rank: int = Field(
        ...,
        description="game's rank last week",
    )
    peak_in_game: int = Field(
        ...,
        description="peak number of players in the last 24 hours's",
    )
    scrape_data: datetime = Field(..., description="Date of the scrape")
    dne: Optional[bool] = Field(
        False,
        description="Flag that the app id doesn't exist in the database",
    )
        
class GameTop100Rank(BaseModel):
    """Collection of ``SteamGameRank`` entries taken from one ranking response."""

    ranks: List[SteamGameRank] = Field(...,description= "list of games metadata")

    @classmethod
    def from_raw_data(cls, raw_data):
        """Parse raw data for pydantic model.

        Only the first element of ``raw_data`` is used; it must provide
        ``rollup_date`` (epoch seconds) and ``ranks`` (an iterable of dicts
        that are expanded into ``SteamGameRank``).

        Returns:
            A ``GameTop100Rank``. Empty or missing ``raw_data`` yields a model
            with no ranks (previously this raised ``UnboundLocalError``).
        """
        first = next(iter(raw_data or ()), None)
        if first is None:
            return cls(ranks=[])

        scrape_data = get_utc_timestamp(first["rollup_date"])
        ranks = [
            SteamGameRank(**game, scrape_data=scrape_data) for game in first['ranks']
        ]
        return cls(ranks=ranks)

In [64]:
# Build the validated ranking from the response fetched by SteamTop100 above.
top100 = GameTop100Rank.from_raw_data(data)