## Install Scrapy via pip


In [1]:
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.ma

## Create directory

In [3]:
import os

# Creates a directory if it doesn’t already exist and navigates into it.
project_name = "my_scrapy_project"
os.makedirs(project_name, exist_ok=True)
os.chdir(project_name)

## Start project

In [4]:
!scrapy startproject gamalytic

New Scrapy project 'gamalytic', using template directory '/usr/local/lib/python3.11/dist-packages/scrapy/templates/project', created in:
    /content/my_scrapy_project/gamalytic

You can start your first spider with:
    cd gamalytic
    scrapy genspider example example.com


## Create spider


In [5]:
!scrapy genspider gamalytic_spider gamalytic.com

Created spider 'gamalytic_spider' using template 'basic' 


## Code for spider that scrapes data with mane features from Gamalytic website

In [6]:
%%writefile /content/my_scrapy_project/gamalytic/gamalytic/spiders/gamalytic_spider.py
import scrapy
import json
from datetime import datetime

class GamalyticAPISpider(scrapy.Spider):
    name = 'gamalytic_spider'
    base_url = 'https://api.gamalytic.com/steam-games/list?fields=name%2CfirstReleaseDate%2CearlyAccessExitDate%2CearlyAccess%2CcopiesSold%2Cprice%2Crevenue%2CavgPlaytime%2CreviewScore%2CpublisherClass%2Cpublishers%2Cdevelopers%2Cid%2CsteamId&limit=100'  # Увеличиваем лимит до 100
    start_urls = [base_url]
    max_items = 1000
    item_count = 0

    custom_settings = {
        'FEED_EXPORT_INDENT': 4,
        'FEED_FORMAT': 'json',
        'FEED_URI': 'games.json',
    }

    def format_unix_time(self, unix_time):
        if unix_time:
            return datetime.fromtimestamp(unix_time / 1000).strftime('%d %b, %Y')
        return None

    def parse(self, response):
        try:

            data = json.loads(response.text)

            if isinstance(data, dict) and 'result' in data:
                items = data['result']
                for item in items:
                    if self.item_count >= self.max_items:
                        return

                    game_data = {
                        'name': item.get('name'),
                        'first_release_date': self.format_unix_time(item.get('firstReleaseDate')),
                        'early_access_exit_date': self.format_unix_time(item.get('earlyAccessExitDate')),
                        'early_access': item.get('earlyAccess'),
                        'copies_sold': item.get('copiesSold'),
                        'price': item.get('price'),
                        'revenue': item.get('revenue'),
                        'avg_playtime': item.get('avgPlaytime'),
                        'review_score': item.get('reviewScore'),
                        'publisher_class': item.get('publisherClass'),
                        'publishers': item.get('publishers'),
                        'developers': item.get('developers'),
                        'id': item.get('id'),
                        'steam_id': item.get('steamId'),
                    }

                    game_page_url = f'https://api.gamalytic.com/game/{item.get("steamId")}?include_pre_release_history=true'
                    yield scrapy.Request(
                        url=game_page_url,
                        callback=self.parse_game_page,
                        meta={'game_data': game_data}
                    )

                    self.item_count += 1

                if self.item_count < self.max_items and 'next' in data:
                    next_page = data['next'].get('page')
                    next_url = f'{self.base_url}&page={next_page}'
                    yield scrapy.Request(url=next_url, callback=self.parse)
            else:
                self.logger.error(f"Ожидался словарь с ключом 'result', но получен: {type(data)}")
        except json.JSONDecodeError as e:
            self.logger.error(f"Ошибка при декодировании JSON: {e}")

    def parse_game_page(self, response):
        try:
            game_page_data = json.loads(response.text)
            game_data = response.meta['game_data']

            game_data.update({
                'tags': game_page_data.get('tags'),
                'genres': game_page_data.get('genres'),
                'features': game_page_data.get('features'),
                'languages': game_page_data.get('languages'),
                'countryData': game_page_data.get('countryData'),
                'audienceOverlap': game_page_data.get('audienceOverlap'),
                'playtimeData': game_page_data.get('playtimeData'),
                'totalRevenue': game_page_data.get('totalRevenue'),
                'players': game_page_data.get('players'),
                'owners': game_page_data.get('owners'),
                'steamPercent': game_page_data.get('steamPercent'),
                'accuracy': game_page_data.get('accuracy'),
                'estimateDetails': game_page_data.get('estimateDetails'),
                'wishlists': game_page_data.get('wishlists'),
                'dlc_count': len(game_page_data.get('dlc', [])),
            })

            yield game_data
        except json.JSONDecodeError as e:
            self.logger.error(f"Error with decoding JSON: {e}")


Writing /content/my_scrapy_project/gamalytic/gamalytic/spiders/gamalytic_spider.py


## Run spider and store data in games.json

In [7]:
!scrapy runspider /content/my_scrapy_project/gamalytic/gamalytic/spiders/gamalytic_spider.py -o games.json

2025-03-17 12:09:34 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-03-17 12:09:34 [scrapy.utils.log] INFO: Versions: lxml 5.3.1.0, libxml2 2.12.9, cssselect 1.3.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2025-03-17 12:09:34 [scrapy.addons] INFO: Enabled addons:
[]
2025-03-17 12:09:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2025-03-17 12:09:34 [scrapy.extensions.telnet] INFO: Telnet Password: 401d47cef57dacfb
  exporter = cls(crawler)
2025-03-17 12:09:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2025-03-17 12:09:34 [scrapy.crawler] INFO

## Create another spider to store data about more amount of games, but with less features

In [9]:
%%writefile /content/my_scrapy_project/gamalytic/gamalytic/spiders/gamalytic_spider.py
import scrapy
import json
from datetime import datetime

class GamalyticAPISpider(scrapy.Spider):
    name = 'gamalytic_spider'
    base_url = 'https://api.gamalytic.com/steam-games/list?fields=name%2CfirstReleaseDate%2CearlyAccessExitDate%2CearlyAccess%2CcopiesSold%2Cprice%2Crevenue%2CavgPlaytime%2CreviewScore%2CpublisherClass%2Cpublishers%2Cdevelopers%2Cid%2CsteamId&limit=100'  # Увеличиваем лимит до 100
    start_urls = [base_url]
    max_items = 5709
    item_count = 0

    custom_settings = {
        'FEED_EXPORT_INDENT': 4,
        'FEED_FORMAT': 'json',
        'FEED_URI': 'games.json'
    }

    def format_unix_time(self, unix_time):
        if unix_time:
            return datetime.fromtimestamp(unix_time / 1000).strftime('%d %b, %Y')
        return None

    def parse(self, response):
        try:

            data = json.loads(response.text)

            if isinstance(data, dict) and 'result' in data:
                items = data['result']
                for item in items:
                    if self.item_count >= self.max_items:
                        return

                    yield {
                        'name': item.get('name'),
                        'first_release_date': self.format_unix_time(item.get('firstReleaseDate')),
                        'early_access_exit_date': self.format_unix_time(item.get('earlyAccessExitDate')),
                        'early_access': item.get('earlyAccess'),
                        'copies_sold': item.get('copiesSold'),
                        'price': item.get('price'),
                        'revenue': item.get('revenue'),
                        'avg_playtime': item.get('avgPlaytime'),
                        'review_score': item.get('reviewScore'),
                        'publisher_class': item.get('publisherClass'),
                        'publishers': item.get('publishers'),
                        'developers': item.get('developers'),
                        'id': item.get('id'),
                        'steam_id': item.get('steamId'),
                    }

                    self.item_count += 1

                if self.item_count < self.max_items and 'next' in data:
                    next_page = data['next'].get('page')
                    next_url = f'{self.base_url}&page={next_page}'
                    yield scrapy.Request(url=next_url, callback=self.parse)
            else:
                self.logger.error(f"Was waiting for dictionary with key 'result', but got: {type(data)}")
        except json.JSONDecodeError as e:
            self.logger.error(f"Eror with decoding JSON: {e}")

Overwriting /content/my_scrapy_project/gamalytic/gamalytic/spiders/gamalytic_spider.py


## Run spider and store data in games_large.json

In [10]:
!scrapy runspider /content/my_scrapy_project/gamalytic/gamalytic/spiders/gamalytic_spider.py -o games_large.json

2025-03-17 12:10:46 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-03-17 12:10:46 [scrapy.utils.log] INFO: Versions: lxml 5.3.1.0, libxml2 2.12.9, cssselect 1.3.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2025-03-17 12:10:46 [scrapy.addons] INFO: Enabled addons:
[]
2025-03-17 12:10:46 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2025-03-17 12:10:46 [scrapy.extensions.telnet] INFO: Telnet Password: 7d3c8516a352f3e1
  exporter = cls(crawler)
2025-03-17 12:10:46 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2025-03-17 12:10:46 [scrapy.crawler] INFO