In [1]:
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manyli

In [2]:
import json
import re
from urllib.parse import urljoin

import scrapy
from scrapy.crawler import CrawlerProcess


class GithubRepoSpider(scrapy.Spider):
    """Collect summary data for the repositories listed on a GitHub profile page.

    The crawl is a chain of requests per repository:

    1. ``parse``           - reads repository links from the profile page.
    2. ``parse_api_repo``  - reads description, size and push date from the
                             GitHub REST API.
    3. ``parse_languages`` - turns the per-language byte counts into percentages.
    4. ``parse_commits``   - derives the number of commits.

    Each repository produces one item with the keys ``url``, ``about``,
    ``last_updated``, ``languages`` and ``number_of_commits``. Repositories whose
    API ``size`` is 0 are treated as empty and skip steps 3 and 4.

    NOTE(review): the API requests are unauthenticated, so they are presumably
    subject to GitHub's anonymous rate limit - confirm for large profiles.
    """

    name = "github_repos"
    start_urls = ["https://github.com/Tsogbat?tab=repositories"]

    custom_settings = {
        # FEEDS replaces the deprecated FEED_URI / FEED_FORMAT pair.
        # 'overwrite' is enabled because appending a second run to an existing
        # XML file would leave two root elements in it, i.e. a malformed file.
        'FEEDS': {
            'github_repos.xml': {
                'format': 'xml',
                'overwrite': True,
            },
        },
        'ROBOTSTXT_OBEY': False,
        'USER_AGENT': 'Mozilla/5.0'
    }

    # Extracts the page number of the rel="last" entry of a GitHub Link header.
    # "[?&]" keeps "per_page=" from being mistaken for the "page=" parameter.
    _LAST_PAGE_RE = re.compile(r'[?&]page=(\d+)>;\s*rel="last"')

    def parse(self, response):
        """Yield one API request per repository link found on the profile page.

        The owner and repository name are taken from the last two path
        segments of each link. The URLs of the follow-up API calls are
        prepared here and handed down the callback chain through ``meta``.
        """
        repos = response.css('h3.wb-break-all a::attr(href)').getall()
        for repo_link in repos:
            full_url = urljoin(response.url, repo_link.strip())
            repo_owner, repo_name = full_url.strip('/').split('/')[-2:]
            api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}"
            languages_api_url = api_url + "/languages"
            # One commit per page: the number of the last page reported in the
            # Link header is then exactly the number of commits. With the
            # default page size it would only be the number of pages.
            commits_api_url = api_url + "/commits?per_page=1"

            yield scrapy.Request(
                url=api_url,
                callback=self.parse_api_repo,
                meta={
                    'repo_url': full_url,
                    'repo_name': repo_name,
                    'languages_api_url': languages_api_url,
                    'commits_api_url': commits_api_url
                }
            )

    def parse_api_repo(self, response):
        """Read the repository metadata and continue with the languages call.

        ``about`` is the API description. When there is none it falls back to
        the repository name, or to ``None`` for an empty repository. Empty
        repositories (``size`` of 0) are emitted immediately with no language
        or commit information.
        """
        repo_url = response.meta['repo_url']
        repo_name = response.meta['repo_name']
        languages_api_url = response.meta['languages_api_url']
        commits_api_url = response.meta['commits_api_url']

        data = json.loads(response.text)

        about = data.get("description")
        is_empty = data.get("size", 0) == 0
        if not about:
            about = None if is_empty else repo_name

        last_updated = data.get("pushed_at")

        if is_empty:
            yield {
                'url': repo_url,
                'about': about,
                'last_updated': last_updated,
                'languages': None,
                'number_of_commits': None
            }
            return

        yield scrapy.Request(
            url=languages_api_url,
            callback=self.parse_languages,
            meta={
                'repo_url': repo_url,
                'about': about,
                'last_updated': last_updated,
                'commits_api_url': commits_api_url
            }
        )

    def parse_languages(self, response):
        """Convert per-language counts into ``"Name (xx.xx%)"`` strings.

        ``languages`` is ``None`` when the counts sum to zero (including an
        empty mapping). The commits request is issued in every case.
        """
        repo_url = response.meta['repo_url']
        about = response.meta['about']
        last_updated = response.meta['last_updated']
        commits_api_url = response.meta['commits_api_url']

        lang_data = json.loads(response.text)
        total = sum(lang_data.values())
        languages = [
            f"{lang} ({round((count / total) * 100, 2)}%)"
            for lang, count in lang_data.items()
        ] if total else None

        yield scrapy.Request(
            url=commits_api_url,
            callback=self.parse_commits,
            meta={
                'repo_url': repo_url,
                'about': about,
                'last_updated': last_updated,
                'languages': languages
            }
        )

    @classmethod
    def _last_page_from_link(cls, link_header):
        """Return the ``rel="last"`` page number of a Link header, or ``None``.

        ``link_header`` is the decoded header text; ``None`` is returned when
        the header carries no ``rel="last"`` entry.
        """
        match = cls._LAST_PAGE_RE.search(link_header)
        return int(match.group(1)) if match else None

    def parse_commits(self, response):
        """Emit the final item, including the number of commits.

        The commits URL is requested with one commit per page, so:

        * with a Link header, the ``rel="last"`` page number is the commit
          count (``None`` if the header has no such entry);
        * without a Link header the response is not paginated and the count
          is the length of the returned JSON list.
        """
        repo_url = response.meta['repo_url']
        about = response.meta['about']
        last_updated = response.meta['last_updated']
        languages = response.meta['languages']

        link_header = response.headers.get('Link')
        if link_header:
            commits_count = self._last_page_from_link(link_header.decode())
        else:
            commits_count = len(json.loads(response.text))

        yield {
            'url': repo_url,
            'about': about,
            'last_updated': last_updated,
            'languages': languages,
            'number_of_commits': commits_count
        }


# Run the spider from inside the notebook: create a crawler process, register
# the spider class with it, then start the crawl.
process = CrawlerProcess()
process.crawl(GithubRepoSpider)
# NOTE(review): per the Scrapy docs, start() runs the Twisted reactor and
# blocks until the crawl has finished. A Twisted reactor cannot be restarted,
# so re-running this cell in the same kernel is expected to fail - restart the
# kernel before running it again.
process.start()


INFO:scrapy.utils.log:Scrapy 2.12.0 started (bot: scrapybot)
2025-04-13 15:17:25 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 5.3.1.0, libxml2 2.12.9, cssselect 1.3.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2025-04-13 15:17:25 [scrapy.utils.log] INFO: Versions: lxml 5.3.1.0, libxml2 2.12.9, cssselect 1.3.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform Linux-6.1.85+-x86_64-with-glibc2.35
INFO:scrapy.addons:Enabled addons:
[]
2025-04-13 15:17:25 [scrapy.addons] INFO: Enabled addons:
[]
DEBUG:scrapy.utils.log:Using reactor: twisted.internet.epollreactor.EPollReactor
2025-04-13 15:17:25 [scrapy.utils.log] DEBUG: Using reactor: twi