From 011053eb0bb9167bdd29b19c03d2c51725074dfb Mon Sep 17 00:00:00 2001 From: Jakub Kuczys Date: Sat, 10 Jun 2023 22:21:43 +0200 Subject: [PATCH 1/3] Store internal metadata with created_at and last_updated_at info --- indexer.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 9 deletions(-) diff --git a/indexer.py b/indexer.py index dec1cd12..bba4be10 100644 --- a/indexer.py +++ b/indexer.py @@ -1,19 +1,23 @@ +import datetime +import hmac import json import yaml import re import sys from gzip import GzipFile from pathlib import Path -from hashlib import sha1 +import hashlib CACHE = Path("cache") RX_PROTOCOL = 1 # This should be incremented when breaking changes to the format are implemented -GEN_PATH = Path("index") +GEN_PATH = Path("index") # exposed Index endpoints GEN_FILE = GEN_PATH / Path(f"{RX_PROTOCOL}.json") # Pretty, for QA checking GEN_MIN_FILE = GEN_PATH / Path(f"{RX_PROTOCOL}-min.json") # Minified, for user download GEN_GZ_FILE = GEN_PATH / Path(f"{RX_PROTOCOL}-min.json.gz") # Gzipped GEN_ERROR_LOG = GEN_PATH / Path(f"{RX_PROTOCOL}-errors.yaml") # Error log +METADATA_FILE = Path("metadata.json") # internal metadata, used for e.g. last_updated_at dates +NOW = datetime.datetime.now(datetime.timezone.utc) class CustomEncoder(json.JSONEncoder): @@ -24,7 +28,7 @@ def default(self, obj): return json.JSONEncoder.default(self, obj) class Repo: - def __init__(self, url: str, category: str): + def __init__(self, metadata, category: str): """Anything exposed here will be serialized later Attributes starting with rx_ deviate from the info.json spec @@ -37,11 +41,12 @@ def __init__(self, url: str, category: str): self.author = [] self.description = "" self.short = "" - self._url = url + self._metadata = metadata + self._url = metadata.url self.name = "" self.rx_branch = "" try: - self.parse_name_branch_url(url) + self.parse_name_branch_url(metadata.url) except: self._error = ("Something went wrong while parsing the url. 
" "Is it a valid address?") @@ -111,7 +116,7 @@ def process_cogs(self): return for cog in self.rx_cogs: - cog.get_info() + cog.get_info(self._metadata) cog.check_cog_validity() def __json__(self): @@ -139,6 +144,8 @@ def __init__(self, name: str, path: Path): self.requirements = [] self.tags = [] self.type = "" # Still a thing? + self.rx_added_at = "" + self.rx_last_updated_at = "" self._error = "" def check_cog_validity(self): @@ -149,7 +156,7 @@ def check_cog_validity(self): if not initpath.exists(): self._error = "Info.json is present but no __init__.py was found. Invalid cog package." - def get_info(self): + def get_info(self, repo_metadata): if self._error: return info_path = self._path / Path("info.json") @@ -175,12 +182,130 @@ def get_info(self): self.requirements = data.get("requirements", []) self.tags = data.get("tags", []) self.type = data.get("type", "") + if self._name in repo_metadata.cogs: + cog_metadata = repo_metadata.cogs[self._name] + cog_metadata.update_from_path(self._path) + else: + cog_metadata = InternalCogMetadata.from_path(self._name, self._path) + repo_metadata.cogs[self._name] = cog_metadata + self.rx_added_at = cog_metadata.added_at.isoformat() + self.rx_last_updated_at = cog_metadata.last_updated_at.isoformat() def __json__(self): return {k:v for (k, v) in self.__dict__.items() if not k.startswith("_") and not callable(k)} +class InternalRepoMetadata: + def __init__(self, url, cogs=None): + self.url = url + self.cogs = cogs or {} + + @classmethod + def from_dict(cls, url, data): + cogs = { + name: InternalCogMetadata.from_dict(name, cog_metadata) + for name, cog_metadata in data["cogs"].items() + } + return cls(url, cogs) + + def __json__(self): + return { + "cogs": self.cogs, + } + +class InternalCogMetadata: + _BUFFER_SIZE = 2**18 + _PREFERRED_ALGORITHMS = ("sha256",) + + def __init__(self, name, *, added_at, last_updated_at, deleted_at, hashes): + self.name = name + self.added_at = added_at + self.last_updated_at = last_updated_at + 
self.deleted_at = deleted_at + self.hashes = hashes + self._still_exists = False + + @classmethod + def from_dict(cls, name, data): + return cls( + name=name, + added_at=get_datetime(data["added_at"]), + last_updated_at=get_datetime(data["last_updated_at"]), + deleted_at=get_datetime(data["deleted_at"]), + hashes=data["hashes"], + ) + + @classmethod + def from_path(cls, name, path): + obj = cls( + name=name, + added_at=NOW, + last_updated_at=NOW, + deleted_at=None, + hashes=cls.get_file_hashes(path), + ) + obj._still_exists = True + return obj + + def __json__(self): + return { + "added_at": self.added_at.timestamp(), + "last_updated_at": self.last_updated_at.timestamp(), + "deleted_at": self.deleted_at and self.deleted_at.timestamp(), + "hashes": self.hashes, + } + + def update_from_path(self, path): + self._still_exists = True + self.deleted_at = None + hashes = self.get_file_hashes(path) + if not self.verify_hashes(hashes): + self.last_updated_at = NOW + + @classmethod + def get_file_hashes(cls, path): + buffer = bytearray(cls._BUFFER_SIZE) + view = memoryview(buffer) + digests = {algorithm: hashlib.new(algorithm) for algorithm in ("sha256",)} + for path in sorted(path.rglob("**/*")): + if not path.is_file(): + continue + with path.open("rb") as fp: + while True: + size = fp.readinto(buffer) + if not size: + break + for digestobj in digests.values(): + digestobj.update(view[:size]) + return {algorithm: digestobj.hexdigest() for algorithm, digestobj in digests.items()} + + def verify_hashes(self, hashes): + for algorithm in self._PREFERRED_ALGORITHMS: + try: + a = self.hashes[algorithm] + b = hashes[algorithm] + except KeyError: + continue + else: + return hmac.compare_digest(a, b) + + for algorithm in self.hashes.keys() & hashes.keys(): + try: + a = self.hashes[algorithm] + b = hashes[algorithm] + except KeyError: + continue + else: + return hmac.compare_digest(a, b) + + raise RuntimeError("No matching hashes were found.") + +def get_datetime(timestamp: int = 
None): + if timestamp is None: + return None + return datetime.datetime.fromtimestamp(timestamp).astimezone(datetime.timezone.utc) + def sha1_digest(url): - return sha1(url.encode('utf-8')).hexdigest() + return hashlib.sha1(url.encode('utf-8')).hexdigest() def make_error_log(repos): log = {} @@ -207,12 +332,27 @@ def main(): with open(yamlfile) as f: data = yaml.safe_load(f.read()) + try: + with open(METADATA_FILE, "r") as fp: + raw_metadata = json.load(fp) + except FileNotFoundError: + metadata = {} + else: + metadata = { + url: InternalRepoMetadata.from_dict(url, repo_metadata) + for url, repo_metadata in raw_metadata.items() + } repos = [] for k in ("approved", "unapproved"): if data[k]: # Can be None if empty for url in data[k]: - repos.append(Repo(url, k)) + if url in metadata: + repo_metadata = metadata[url] + else: + repo_metadata = InternalRepoMetadata(url) + metadata[url] = repo_metadata + repos.append(Repo(repo_metadata, k)) for r in repos: r.folder_check_and_get_info() @@ -226,6 +366,14 @@ def main(): for r in repos: r.rx_cogs = [c for c in r.rx_cogs if not c._error] + for repo_metadata in metadata.values(): + for cog_metadata in repo_metadata.cogs.values(): + if not cog_metadata._still_exists: + cog_metadata.deleted_at = NOW + + with open(METADATA_FILE, "w") as fp: + json.dump(metadata, fp, indent=4, sort_keys=True, cls=CustomEncoder) + if data["flagged-cogs"]: for url, flagged_cogs in data["flagged-cogs"].items(): for r in repos: From 97e120c483544c873b51c021b00514c27f73a42f Mon Sep 17 00:00:00 2001 From: Jakub Kuczys Date: Sun, 11 Jun 2023 00:26:08 +0200 Subject: [PATCH 2/3] Fix cog's deleted_at always being overridden --- indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexer.py b/indexer.py index bba4be10..1c8f41e7 100644 --- a/indexer.py +++ b/indexer.py @@ -369,7 +369,7 @@ def main(): for repo_metadata in metadata.values(): for cog_metadata in repo_metadata.cogs.values(): if not cog_metadata._still_exists: - 
cog_metadata.deleted_at = NOW + cog_metadata.deleted_at = cog_metadata.deleted_at or NOW with open(METADATA_FILE, "w") as fp: json.dump(metadata, fp, indent=4, sort_keys=True, cls=CustomEncoder) From 5a36082595b34d40ee9f3ee662f0df0581e5ce1f Mon Sep 17 00:00:00 2001 From: Jakub Kuczys Date: Sun, 11 Jun 2023 00:30:48 +0200 Subject: [PATCH 3/3] Added added_at and approved_at fields to repos --- indexer.py | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/indexer.py b/indexer.py index 1c8f41e7..b30636fa 100644 --- a/indexer.py +++ b/indexer.py @@ -45,6 +45,8 @@ def __init__(self, metadata, category: str): self._url = metadata.url self.name = "" self.rx_branch = "" + self.rx_added_at = "" + self.rx_approved_at = "" try: self.parse_name_branch_url(metadata.url) except: @@ -97,6 +99,11 @@ def folder_check_and_get_info(self): self.author = info.get("author", []) self.description = info.get("description", "") self.short = info.get("short", "") + self._metadata._still_exists = True + if self.rx_category == "approved" and self._metadata.approved_at is None: + self._metadata.approved_at = NOW + self.rx_added_at = self._metadata.added_at.isoformat() + self.rx_approved_at = self._metadata.approved_at and self._metadata.approved_at.isoformat() def populate_cogs(self): if self._error: @@ -195,9 +202,22 @@ def __json__(self): return {k:v for (k, v) in self.__dict__.items() if not k.startswith("_") and not callable(k)} class InternalRepoMetadata: - def __init__(self, url, cogs=None): + def __init__(self, url, cogs=None, *, added_at=None, approved_at=None, deleted_at=None): self.url = url self.cogs = cogs or {} + self.added_at = added_at or NOW + self.approved_at = approved_at + self.deleted_at = deleted_at + self.__still_exists = False + + @property + def _still_exists(self): + return self.__still_exists + + @_still_exists.setter + def _still_exists(self, value): + self.__still_exists = value + self.deleted_at = None 
@classmethod def from_dict(cls, url, data): @@ -205,11 +225,23 @@ def from_dict(cls, url, data): name: InternalCogMetadata.from_dict(name, cog_metadata) for name, cog_metadata in data["cogs"].items() } - return cls(url, cogs) + added_at = get_datetime(data["added_at"]) + approved_at = get_datetime(data["approved_at"]) + deleted_at = get_datetime(data["deleted_at"]) + return cls( + url, + cogs, + added_at=added_at, + approved_at=approved_at, + deleted_at=deleted_at, + ) def __json__(self): return { "cogs": self.cogs, + "added_at": self.added_at.timestamp(), + "approved_at": self.approved_at and self.approved_at.timestamp(), + "deleted_at": self.deleted_at and self.deleted_at.timestamp(), } class InternalCogMetadata: @@ -367,6 +399,8 @@ def main(): r.rx_cogs = [c for c in r.rx_cogs if not c._error] for repo_metadata in metadata.values(): + if not repo_metadata._still_exists: + repo_metadata.deleted_at = repo_metadata.deleted_at or NOW for cog_metadata in repo_metadata.cogs.values(): if not cog_metadata._still_exists: cog_metadata.deleted_at = cog_metadata.deleted_at or NOW