Store internal metadata with created_at and last_updated_at info #93

Open · wants to merge 3 commits into master
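
This PR keeps an internal metadata.json alongside the generated index, tracking when each repo and cog was added, approved, last updated, and deleted. Based on the __json__ serializers in this diff, an entry in metadata.json should look roughly like the sketch below (the URL, cog name, hash, and timestamps are placeholders, and the exact shape assumes CustomEncoder delegates to __json__):

    {
        "https://github.com/example/example-cogs": {
            "added_at": 1585000000.0,
            "approved_at": null,
            "cogs": {
                "examplecog": {
                    "added_at": 1585000000.0,
                    "deleted_at": null,
                    "hashes": {
                        "sha256": "<hex digest of the cog folder contents>"
                    },
                    "last_updated_at": 1585000000.0
                }
            },
            "deleted_at": null
        }
    }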
indexer.py: 200 changes (191 additions & 9 deletions)
@@ -1,19 +1,23 @@
import datetime
import hmac
import json
import yaml
import re
import sys
from gzip import GzipFile
from pathlib import Path
from hashlib import sha1
import hashlib

CACHE = Path("cache")

RX_PROTOCOL = 1 # This should be incremented when breaking changes to the format are implemented
GEN_PATH = Path("index")
GEN_PATH = Path("index") # exposed Index endpoints
GEN_FILE = GEN_PATH / Path(f"{RX_PROTOCOL}.json") # Pretty, for QA checking
GEN_MIN_FILE = GEN_PATH / Path(f"{RX_PROTOCOL}-min.json") # Minified, for user download
GEN_GZ_FILE = GEN_PATH / Path(f"{RX_PROTOCOL}-min.json.gz") # Gzipped
GEN_ERROR_LOG = GEN_PATH / Path(f"{RX_PROTOCOL}-errors.yaml") # Error log
METADATA_FILE = Path("metadata.json") # internal metadata, used for e.g. last_updated_at dates
NOW = datetime.datetime.now(datetime.timezone.utc)


class CustomEncoder(json.JSONEncoder):
@@ -24,7 +28,7 @@ def default(self, obj):
return json.JSONEncoder.default(self, obj)

class Repo:
def __init__(self, url: str, category: str):
def __init__(self, metadata, category: str):
"""Anything exposed here will be serialized later

Attributes starting with rx_ deviate from the info.json spec
@@ -37,11 +41,14 @@ def __init__(self, url: str, category: str):
self.author = []
self.description = ""
self.short = ""
self._url = url
self._metadata = metadata
self._url = metadata.url
self.name = ""
self.rx_branch = ""
self.rx_added_at = ""
self.rx_approved_at = ""
try:
self.parse_name_branch_url(url)
self.parse_name_branch_url(metadata.url)
except:
self._error = ("Something went wrong while parsing the url. "
"Is it a valid address?")
@@ -92,6 +99,11 @@ def folder_check_and_get_info(self):
self.author = info.get("author", [])
self.description = info.get("description", "")
self.short = info.get("short", "")
self._metadata._still_exists = True
if self.rx_category == "approved" and self._metadata.approved_at is None:
self._metadata.approved_at = NOW
self.rx_added_at = self._metadata.added_at.isoformat()
self.rx_approved_at = self._metadata.approved_at and self._metadata.approved_at.isoformat()

def populate_cogs(self):
if self._error:
@@ -111,7 +123,7 @@ def process_cogs(self):
return

for cog in self.rx_cogs:
cog.get_info()
cog.get_info(self._metadata)
cog.check_cog_validity()

def __json__(self):
@@ -139,6 +151,8 @@ def __init__(self, name: str, path: Path):
self.requirements = []
self.tags = []
self.type = "" # Still a thing?
self.rx_added_at = ""
self.rx_last_updated_at = ""
self._error = ""

def check_cog_validity(self):
@@ -149,7 +163,7 @@ def check_cog_validity(self):
if not initpath.exists():
self._error = "Info.json is present but no __init__.py was found. Invalid cog package."

def get_info(self):
def get_info(self, repo_metadata):
if self._error:
return
info_path = self._path / Path("info.json")
@@ -175,12 +189,155 @@ def get_info(self):
self.requirements = data.get("requirements", [])
self.tags = data.get("tags", [])
self.type = data.get("type", "")
if self._name in repo_metadata.cogs:
cog_metadata = repo_metadata.cogs[self._name]
cog_metadata.update_from_path(self._path)
else:
cog_metadata = InternalCogMetadata.from_path(self._name, self._path)
repo_metadata.cogs[self._name] = cog_metadata
self.rx_added_at = cog_metadata.added_at.isoformat()
self.rx_last_updated_at = cog_metadata.last_updated_at.isoformat()

def __json__(self):
return {k:v for (k, v) in self.__dict__.items() if not k.startswith("_") and not callable(k)}

class InternalRepoMetadata:
def __init__(self, url, cogs=None, *, added_at=None, approved_at=None, deleted_at=None):
self.url = url
self.cogs = cogs or {}
self.added_at = added_at or NOW
self.approved_at = approved_at
self.deleted_at = deleted_at
self.__still_exists = False

@property
def _still_exists(self):
return self.__still_exists

@_still_exists.setter
def _still_exists(self, value):
self.__still_exists = value
self.deleted_at = None

@classmethod
def from_dict(cls, url, data):
cogs = {
name: InternalCogMetadata.from_dict(name, cog_metadata)
for name, cog_metadata in data["cogs"].items()
}
added_at = get_datetime(data["added_at"])
approved_at = get_datetime(data["approved_at"])
deleted_at = get_datetime(data["deleted_at"])
return cls(
url,
cogs,
added_at=added_at,
approved_at=approved_at,
deleted_at=deleted_at,
)

def __json__(self):
return {
"cogs": self.cogs,
"added_at": self.added_at.timestamp(),
"approved_at": self.approved_at and self.approved_at.timestamp(),
"deleted_at": self.deleted_at and self.deleted_at.timestamp(),
}

class InternalCogMetadata:
_BUFFER_SIZE = 2**18
_PREFERRED_ALGORITHMS = ("sha256",)

def __init__(self, name, *, added_at, last_updated_at, deleted_at, hashes):
self.name = name
self.added_at = added_at
self.last_updated_at = last_updated_at
self.deleted_at = deleted_at
self.hashes = hashes
self._still_exists = False

@classmethod
def from_dict(cls, name, data):
return cls(
name=name,
added_at=get_datetime(data["added_at"]),
last_updated_at=get_datetime(data["last_updated_at"]),
deleted_at=get_datetime(data["deleted_at"]),
hashes=data["hashes"],
)

@classmethod
def from_path(cls, name, path):
obj = cls(
name=name,
added_at=NOW,
last_updated_at=NOW,
deleted_at=None,
hashes=cls.get_file_hashes(path),
)
obj._still_exists = True
return obj

def __json__(self):
return {
"added_at": self.added_at.timestamp(),
"last_updated_at": self.last_updated_at.timestamp(),
"deleted_at": self.deleted_at and self.deleted_at.timestamp(),
"hashes": self.hashes,
}

def update_from_path(self, path):
self._still_exists = True
self.deleted_at = None
hashes = self.get_file_hashes(path)
if not self.verify_hashes(hashes):
self.last_updated_at = NOW

@classmethod
def get_file_hashes(cls, path):
buffer = bytearray(cls._BUFFER_SIZE)
view = memoryview(buffer)
digests = {algorithm: hashlib.new(algorithm) for algorithm in ("sha256",)}
for path in sorted(path.rglob("**/*")):
if not path.is_file():
continue
with path.open("rb") as fp:
while True:
size = fp.readinto(buffer)
if not size:
break
for digestobj in digests.values():
digestobj.update(view[:size])
return {algorithm: digestobj.hexdigest() for algorithm, digestobj in digests.items()}

def verify_hashes(self, hashes):
for algorithm in self._PREFERRED_ALGORITHMS:
try:
a = self.hashes[algorithm]
b = hashes[algorithm]
except KeyError:
continue
else:
return hmac.compare_digest(a, b)

for algorithm in self.hashes.keys() & hashes.keys():
try:
a = self.hashes[algorithm]
b = hashes[algorithm]
except KeyError:
continue
else:
return hmac.compare_digest(a, b)

raise RuntimeError("No matching hashes were found.")

def get_datetime(timestamp: int = None):
if timestamp is None:
return None
return datetime.datetime.fromtimestamp(timestamp).astimezone(datetime.timezone.utc)

def sha1_digest(url):
return sha1(url.encode('utf-8')).hexdigest()
return hashlib.sha1(url.encode('utf-8')).hexdigest()

def make_error_log(repos):
log = {}
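
Reviewer note: a rough sketch of how the hash-based update detection above is meant to behave. Only the InternalCogMetadata calls come from this diff; the paths, the cog name, and the "second run" scenario are hypothetical.

    # First run: the cog folder is hashed and its timestamps initialised.
    meta = InternalCogMetadata.from_path("examplecog", Path("cache/abc123/examplecog"))
    # meta.added_at == meta.last_updated_at == NOW, meta.hashes == {"sha256": "..."}

    # A later run (with a fresh NOW): the folder is re-hashed via get_file_hashes().
    # If verify_hashes() reports a mismatch, last_updated_at is bumped to NOW;
    # if the contents are unchanged, the previous timestamp is preserved.
    meta.update_from_path(Path("cache/abc123/examplecog"))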
@@ -207,12 +364,27 @@ def main():
with open(yamlfile) as f:
data = yaml.safe_load(f.read())

try:
with open(METADATA_FILE, "r") as fp:
raw_metadata = json.load(fp)
except FileNotFoundError:
metadata = {}
else:
metadata = {
url: InternalRepoMetadata.from_dict(url, repo_metadata)
for url, repo_metadata in raw_metadata.items()
}
repos = []

for k in ("approved", "unapproved"):
if data[k]: # Can be None if empty
for url in data[k]:
repos.append(Repo(url, k))
if url in metadata:
repo_metadata = metadata[url]
else:
repo_metadata = InternalRepoMetadata(url)
metadata[url] = repo_metadata
repos.append(Repo(repo_metadata, k))

for r in repos:
r.folder_check_and_get_info()
@@ -226,6 +398,16 @@ def main():
for r in repos:
r.rx_cogs = [c for c in r.rx_cogs if not c._error]

for repo_metadata in metadata.values():
if not repo_metadata._still_exists:
repo_metadata.deleted_at = repo_metadata.deleted_at or NOW
for cog_metadata in repo_metadata.cogs.values():
if not cog_metadata._still_exists:
cog_metadata.deleted_at = cog_metadata.deleted_at or NOW

with open(METADATA_FILE, "w") as fp:
json.dump(metadata, fp, indent=4, sort_keys=True, cls=CustomEncoder)

if data["flagged-cogs"]:
for url, flagged_cogs in data["flagged-cogs"].items():
for r in repos:
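
One assumption worth calling out: the metadata dump (json.dump(..., cls=CustomEncoder)) only works if CustomEncoder.default() delegates to the __json__ methods defined on the metadata classes. That method body is collapsed in this diff; it presumably looks something like the sketch below (not part of this PR's changes):

    class CustomEncoder(json.JSONEncoder):
        def default(self, obj):
            # Delegate to the object's own serializer when it provides one.
            if hasattr(obj, "__json__"):
                return obj.__json__()
            return json.JSONEncoder.default(self, obj)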