Skip to content

Commit

Permalink
perf: Improve performance when handling large yaml files (meltano#6644)
Browse files Browse the repository at this point in the history
* Configure yaml at import time

* Check against `Mapping` and `Sequence` instead of `dict` and `list`

* Use `copy.copy` instead of `copy.deepcopy` in `meltano.core.project_files.deep_merge`

* Add load with caching to `meltano.core.yaml`

* Simplify locking in `src/meltano/core/project.py`

* Use `cached_property` for `Project.project_files`

* Add caching to `ProjectFiles.load` to avoid `deep_merge`

* Add LRU cache to `Canonical.parse`

* Make `integration/validate.sh` executable

* Add directory for `meltano-basis` output

Without this directory existing before I run the integration test, the test fails.

* Add sample large yaml files for testing

* Fix `__hash__` and `__eq__` methods of `IdHashBox`

* Format test yaml files

* Call proper `Canonical` subclass method

* Add `Project.clear_cache`

* Fix `test_remove_environment` and `test_list_environments`

* Fix `test_get_plugin`

* Fix `TestTaskSetsService::test_remove`

* Make test large YAML file smaller

They're too large for these changes to handle. Eventually it would be good for us to update Meltano to be able to handle such large YAML files without issue, but that'll have to come in the future.

* Add basic performance test for project with large YAML files

* Set acceptable time in basic perf test higher

* Fix extra newlines added on Windows

* Explain why you would want to clear the `project_files` cache
  • Loading branch information
WillDaSilva committed Aug 24, 2022
1 parent 0dfe536 commit 40a8066
Show file tree
Hide file tree
Showing 25 changed files with 1,683 additions and 220 deletions.
Empty file.
Empty file modified integration/validate.sh
100644 → 100755
Empty file.
92 changes: 68 additions & 24 deletions src/meltano/core/behavior/canonical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import copy
import json
from functools import lru_cache
from os import PathLike
from typing import Any, TypeVar

Expand All @@ -14,6 +15,41 @@
T = TypeVar("T", bound="Canonical") # noqa: WPS111 (name too short)


class IdHashBox:
"""Wrapper class that makes the hash of an object its Python ID."""

def __init__(self, content: Any):
"""Initialize the `IdHashBox`.
Parameters:
content: The object that will be stored within the `IdHashBox`.
Its Python ID will be used to hash the `IdHashBox` instance.
"""
self.content = content

def __hash__(self) -> int:
"""Hash the `IdHashBox` according to the Python ID of its content.
Returns:
The Python ID of the content of the `IdHashBox` instance.
"""
return id(self.content)

def __eq__(self, other: Any) -> bool:
"""Check equality of this instance and some other object.
Parameters:
other: The object to check equality with.
Returns:
Whether this instance and `other` have the same hash.
"""
return hash(self) == hash(other)


CANONICAL_PARSE_CACHE_SIZE = 4096


class Canonical: # noqa: WPS214 (too many methods)
"""Defines an object that can be represented as a subset of its attributes.
Expand Down Expand Up @@ -60,14 +96,12 @@ def as_canonical(
Canonical representation of the given instance.
"""
if isinstance(target, Canonical):
result = CommentedMap(
[(key, Canonical.as_canonical(val)) for key, val in target]
)
result = CommentedMap([(key, cls.as_canonical(val)) for key, val in target])
target.attrs.copy_attributes(result)
return result

if isinstance(target, (CommentedSet, CommentedSeq)):
result = CommentedSeq(Canonical.as_canonical(val) for val in target)
result = CommentedSeq(cls.as_canonical(val) for val in target)
target.copy_attributes(result)
return result

Expand All @@ -77,24 +111,20 @@ def as_canonical(
if isinstance(val, Canonical):
results[key] = val.canonical()
else:
results[key] = Canonical.as_canonical(val)
results[key] = cls.as_canonical(val)
target.copy_attributes(results)
return results

if isinstance(target, set):
return list(map(Canonical.as_canonical, target))

if isinstance(target, list):
return list(map(Canonical.as_canonical, target))
if isinstance(target, (list, set)):
return list(map(cls.as_canonical, target))

if isinstance(target, dict):
results = {}
for key, val in target.items():
if isinstance(val, Canonical):
results[key] = val.canonical()
else:
results[key] = Canonical.as_canonical(val)
return results
return {
key: val.canonical()
if isinstance(val, Canonical)
else cls.as_canonical(val)
for key, val in target.items()
}

return copy.deepcopy(target)

Expand All @@ -104,7 +134,7 @@ def canonical(self) -> dict | list | CommentedMap | CommentedSeq | Any:
Returns:
A canonical representation of the current instance.
"""
return Canonical.as_canonical(self)
return type(self).as_canonical(self)

def with_attrs(self: T, *args: Any, **kwargs: Any) -> T:
"""Return a new instance with the given attributes set.
Expand All @@ -116,7 +146,7 @@ def with_attrs(self: T, *args: Any, **kwargs: Any) -> T:
Returns:
A new instance with the given attributes set.
"""
return self.__class__(**{**self.canonical(), **kwargs})
return type(self)(**{**self.canonical(), **kwargs})

@classmethod
def parse(cls: type[T], obj: Any) -> T:
Expand All @@ -128,6 +158,21 @@ def parse(cls: type[T], obj: Any) -> T:
Returns:
Parsed instance.
"""
return cls._parse(IdHashBox(obj))

@classmethod
@lru_cache(maxsize=CANONICAL_PARSE_CACHE_SIZE)
def _parse(cls: type[T], boxed_obj: IdHashBox) -> T:
"""Parse a `Canonical` object from a dictionary or return the instance.
Args:
boxed_obj: Dictionary or instance to parse wrapped in an `IdHashBox`.
Returns:
Parsed instance.
"""
obj = boxed_obj.content

if obj is None:
return None

Expand Down Expand Up @@ -202,7 +247,7 @@ def __setattr__(self, attr: str, value: Any):
attr: Attribute to set.
value: Value to set.
"""
if attr.startswith("_") or hasattr(self.__class__, attr): # noqa: WPS421
if attr.startswith("_") or hasattr(type(self), attr): # noqa: WPS421
super().__setattr__(attr, value)
else:
self._dict[attr] = value
Expand Down Expand Up @@ -288,7 +333,7 @@ def update(self, *others: Any, **kwargs: Any) -> None:
others = [*others, kwargs]

for other in others:
other = Canonical.as_canonical(other)
other = type(self).as_canonical(other)

for key, val in other.items():
setattr(self, key, val)
Expand All @@ -305,7 +350,7 @@ def yaml(cls, dumper: yaml.BaseDumper, obj: Any) -> yaml.MappingNode:
The serialized YAML representation of the object.
"""
return dumper.represent_mapping(
"tag:yaml.org,2002:map", Canonical.as_canonical(obj), flow_style=False
"tag:yaml.org,2002:map", cls.as_canonical(obj), flow_style=False
)

@classmethod
Expand All @@ -320,8 +365,7 @@ def to_yaml(cls, representer: Representer, obj: Any):
The serialized YAML representation of the object.
"""
return representer.represent_mapping(
"tag:yaml.org,2002:map",
Canonical.as_canonical(obj),
"tag:yaml.org,2002:map", cls.as_canonical(obj)
)

@classmethod
Expand Down
6 changes: 3 additions & 3 deletions src/meltano/core/config_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
import yaml

from meltano.core import bundle

from .project import Project
from .setting_definition import SettingDefinition
from meltano.core.project import Project
from meltano.core.setting_definition import SettingDefinition

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -53,6 +52,7 @@ def current_meltano_yml(self):
The contents of meltano.yml.
"""
if self._current_meltano_yml is None or not self._use_cache:
self.project.clear_cache()
self._current_meltano_yml = self.project.meltano
return self._current_meltano_yml

Expand Down
4 changes: 2 additions & 2 deletions src/meltano/core/environment_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, project: Project):
"""Create a new EnvironmentService for a Meltano Project.
Args:
A Meltano Project.
project: A Meltano Project.
"""
self.project = project

Expand Down Expand Up @@ -71,7 +71,7 @@ def list_environments(self) -> list[Environment]:
Returns:
A list of Environment objects.
"""
return self.project.meltano.environments
return self.project.meltano.environments.copy()

def remove(self, name: str) -> str:
"""Remove an Environment by name.
Expand Down
8 changes: 0 additions & 8 deletions src/meltano/core/plugin/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,6 @@ def expanded_args(self, name, env):

return expanded

def canonical(self):
"""Serialize the command.
Returns:
Python object.
"""
return Command.as_canonical(self)

@classmethod
def as_canonical(cls, target):
"""Serialize the target command.
Expand Down
21 changes: 9 additions & 12 deletions src/meltano/core/plugin_discovery_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,17 @@

import meltano
from meltano.core import bundle
from meltano.core.behavior.versioned import IncompatibleVersionError, Versioned
from meltano.core.discovery_file import DiscoveryFile
from meltano.core.plugin import BasePlugin, PluginDefinition, PluginRef, PluginType
from meltano.core.plugin.base import StandalonePlugin
from meltano.core.plugin.error import PluginNotFoundError
from meltano.core.plugin.factory import base_plugin_factory
from meltano.core.plugin.project_plugin import ProjectPlugin
from meltano.core.project import Project
from meltano.core.yaml import configure_yaml

from .behavior.versioned import IncompatibleVersionError, Versioned
from .discovery_file import DiscoveryFile
from .plugin import BasePlugin, PluginDefinition, PluginRef, PluginType
from .plugin.error import PluginNotFoundError
from .plugin.factory import base_plugin_factory
from .plugin.project_plugin import ProjectPlugin
from .project_settings_service import ProjectSettingsService
from .utils import NotFound, find_named

yaml = configure_yaml()
from meltano.core.project_settings_service import ProjectSettingsService
from meltano.core.utils import NotFound, find_named
from meltano.core.yaml import yaml


class DiscoveryInvalidError(Exception):
Expand Down

0 comments on commit 40a8066

Please sign in to comment.