Merge 6f9ec85 into fa8e247

4dn-dcic · Apr 28, 2020 · 012c908 · 012c908
2 parents fa8e247 + 6f9ec85
commit 012c908
Show file tree

Hide file tree

Showing 20 changed files with 567 additions and 72 deletions.
diff --git a/dcicutils/deployment_utils.py b/dcicutils/deployment_utils.py
@@ -30,7 +30,7 @@ def main():
 import argparse
 
 from dcicutils.env_utils import (
-    is_stg_or_prd_env, prod_bucket_env, get_standard_mirror_env, data_set_for_env,
+    is_stg_or_prd_env, prod_bucket_env, get_standard_mirror_env, data_set_for_env, INDEXER_ENVS
 )
 from dcicutils.misc_utils import PRINT
 
@@ -44,7 +44,7 @@ class Deployer:
     @classmethod
     def build_ini_file_from_template(cls, template_file_name, init_file_name,
                                      bs_env=None, bs_mirror_env=None, s3_bucket_env=None,
-                                     data_set=None, es_server=None, es_namespace=None):
+                                     data_set=None, es_server=None, es_namespace=None, indexer=False):
         """
         Builds a .ini file from a given template file.
 
@@ -57,6 +57,7 @@ def build_ini_file_from_template(cls, template_file_name, init_file_name,
             data_set (str): An identifier for data to load (either 'prod' for prd/stg envs, or 'test' for others)
             es_server (str): The server name (or server:port) for the ElasticSearch server.
             es_namespace (str): The ElasticSearch namespace to use (probably but not necessarily same as bs_env).
+            indexer (bool): Whether or not we are building an ini file for an indexer.
         """
         with io.open(init_file_name, 'w') as init_file_fp:
             cls.build_ini_stream_from_template(template_file_name=template_file_name,
@@ -66,7 +67,8 @@ def build_ini_file_from_template(cls, template_file_name, init_file_name,
                                                s3_bucket_env=s3_bucket_env,
                                                data_set=data_set,
                                                es_server=es_server,
-                                               es_namespace=es_namespace)
+                                               es_namespace=es_namespace,
+                                               indexer=indexer)
 
     # Ref: https://stackoverflow.com/questions/19911123/how-can-you-get-the-elastic-beanstalk-application-version-in-your-application  # noqa: E501
     EB_MANIFEST_FILENAME = "/opt/elasticbeanstalk/deploy/manifest"
@@ -104,7 +106,7 @@ def get_app_version(cls):  # This logic (perhaps most or all of this file) shoul
     @classmethod
     def build_ini_stream_from_template(cls, template_file_name, init_file_stream,
                                        bs_env=None, bs_mirror_env=None, s3_bucket_env=None, data_set=None,
-                                       es_server=None, es_namespace=None):
+                                       es_server=None, es_namespace=None, indexer=False):
         """
         Sends output to init_file_stream corresponding to the data one would want in an ini file
         for the given template_file_name and available environment variables.
@@ -118,13 +120,13 @@ def build_ini_stream_from_template(cls, template_file_name, init_file_stream,
             data_set: 'test' or 'prod'. Default is 'test' unless bs_env is a staging or production environment.
             es_server: The name of an es server to use.
             es_namespace: The namespace to use on the es server. If None, this uses the bs_env.
+            indexer: Whether or not we are building an ini file for an indexer.
 
         Returns: None
 
         """
 
         # print("data_set given = ", data_set)
-
         es_server = es_server or os.environ.get('ENCODED_ES_SERVER', "MISSING_ENCODED_ES_SERVER")
         bs_env = bs_env or os.environ.get("ENCODED_BS_ENV", "MISSING_ENCODED_BS_ENV")
         bs_mirror_env = bs_mirror_env or os.environ.get("ENCODED_BS_MIRROR_ENV", get_standard_mirror_env(bs_env)) or ""
@@ -135,6 +137,11 @@ def build_ini_stream_from_template(cls, template_file_name, init_file_stream,
         data_set = data_set or os.environ.get("ENCODED_DATA_SET",
                                               data_set_for_env(bs_env) or "MISSING_ENCODED_DATA_SET")
         es_namespace = es_namespace or os.environ.get("ENCODED_ES_NAMESPACE", bs_env)
+        # Set ENCODED_INDEXER to 'true' to deploy an indexer.
+        # If the value is missing, the empty string, or any other thing besides 'true' (in any case),
+        # this value will default to the empty string, causing the line not to appear in the output file
+        # because there is a special case that suppresses output of empty values. -kmp 27-Apr-2020
+        indexer = "true" if indexer or os.environ.get('ENCODED_INDEXER', "false").upper() == "TRUE" else ""
 
         # print("data_set computed = ", data_set)
 
@@ -147,8 +154,15 @@ def build_ini_stream_from_template(cls, template_file_name, init_file_stream,
             'S3_BUCKET_ENV': s3_bucket_env,
             'DATA_SET': data_set,
             'ES_NAMESPACE': es_namespace,
+            'INDEXER': indexer,
         }
 
+        # if we specify an indexer name for bs_env, we did the deployment wrong and should bail
+        if bs_env in INDEXER_ENVS:
+            raise RuntimeError("Deployed with bs_env %s, which is an indexer env."
+                               "Re-deploy with the env you want to index and set the 'ENCODED.INDEXER'"
+                               "environment variable." % bs_env)
+
         # We assume these variables are not set, but best to check first. Confusion might result otherwise.
         for extra_var in extra_vars:
             if extra_var in os.environ:
@@ -239,6 +253,10 @@ def main(cls):
             parser.add_argument("--es_namespace",
                                 help="an ElasticSearch namespace",
                                 default=None)
+            parser.add_argument("--indexer",
+                                help="whether or not to deploy an indexer",
+                                action='store_true',
+                                default=False)
             args = parser.parse_args()
             template_file_name = cls.environment_template_filename(args.env)
             ini_file_name = args.target
@@ -247,7 +265,8 @@ def main(cls):
             cls.build_ini_file_from_template(template_file_name, ini_file_name,
                                              bs_env=args.bs_env, bs_mirror_env=args.bs_mirror_env,
                                              s3_bucket_env=args.s3_bucket_env, data_set=args.data_set,
-                                             es_server=args.es_server, es_namespace=args.es_namespace)
+                                             es_server=args.es_server, es_namespace=args.es_namespace,
+                                             indexer=args.indexer)
         except Exception as e:
             PRINT("Error (%s): %s" % (e.__class__.__name__, e))
             sys.exit(1)

diff --git a/dcicutils/env_utils.py b/dcicutils/env_utils.py
@@ -11,6 +11,7 @@
 FF_ENV_WEBPROD = 'fourfront-webprod'
 FF_ENV_WEBPROD2 = 'fourfront-webprod2'
 FF_ENV_WOLF = 'fourfront-wolf'
+FF_ENV_INDEXER = 'fourfront-indexer'  # to be used by ELB Indexer
 
 CGAP_ENV_DEV = 'fourfront-cgapdev'
 CGAP_ENV_HOTSEAT = 'fourfront-cgaphotseat'  # Maybe not used
@@ -22,6 +23,7 @@
 CGAP_ENV_WEBPROD = 'fourfront-cgap'
 # CGAP_ENV_WEBPROD2 is meaningless here. See CGAP_ENV_STAGING.
 CGAP_ENV_WOLF = 'fourfront-cgapwolf'  # Maybe not used
+CGAP_ENV_INDEXER = 'cgap-indexer'  # to be used by ELB Indexer
 
 CGAP_ENV_DEV_NEW = 'cgap-dev'
 CGAP_ENV_HOTSEAT_NEW = 'cgap-hotseat'
@@ -43,6 +45,9 @@
 FOURFRONT_STG_OR_PRD_TOKENS = ['webprod', 'blue', 'green']
 FOURFRONT_STG_OR_PRD_NAMES = ['staging', 'stagging', 'data']
 
+# We should know which BS Envs are indexing envs
+INDEXER_ENVS = [FF_ENV_INDEXER, CGAP_ENV_INDEXER]
+
 # Done this way because it's safer going forward.
 CGAP_STG_OR_PRD_TOKENS = []
 CGAP_STG_OR_PRD_NAMES = [CGAP_ENV_WEBPROD, CGAP_ENV_PRODUCTION_GREEN, CGAP_ENV_PRODUCTION_BLUE,

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
@@ -12,6 +12,7 @@
 )
 from .misc_utils import PRINT
 import requests
+from elasticsearch.exceptions import AuthorizationException
 # urlparse import differs between py2 and 3
 if sys.version_info[0] < 3:
     import urlparse
@@ -695,7 +696,7 @@ def delete_field(obj_id, del_field, key=None, ff_env=None):
 
 def get_es_search_generator(es_client, index, body, page_size=200):
     """
-    Simple generator behind get_es_metada which takes an es_client (from
+    Simple generator behind get_es_metadata which takes an es_client (from
     es_utils create_es_client), a string index, and a dict query body.
     Also takes an optional integer page_size, which controls pagination size
     NOTE: 'index' must be namespaced
@@ -881,7 +882,8 @@ def expand_es_metadata(uuid_list, key=None, ff_env=None, store_frame='raw', add_
         add_pc_wfr (bool):               Include workflow_runs and linked items (processed/ref files, wf, software...)
         ignore_field(list):              Remove keys from items, so any linking through these fields, ie relations
         use_generator (bool):            Use a generator when getting es. Less memory used but takes longer
-        es_client:                       optional result from es_utils.create_es_client
+        es_client:                       optional result from es_utils.create_es_client - note this could be regenerated
+                                         in this method if the signature expires
     Returns:
         dict: contains all item types as keys, and with values of list of dictionaries
               i.e.
@@ -928,8 +930,21 @@ def remove_keys(my_dict, remove_list):
 
     while uuid_list:
         uuids_to_check = []  # uuids to add to uuid_list if not in item_uuids
-        for es_item in get_es_metadata(uuid_list, es_client=es_client, chunk_size=chunk,
-                                       is_generator=use_generator, key=auth):
+
+        # get the next page of data, recreating the es_client if need be
+        try:
+            current_page = get_es_metadata(uuid_list, es_client=es_client, chunk_size=chunk,
+                                           is_generator=use_generator, key=auth)
+        except AuthorizationException:  # our signature expired, recreate the es_client with a fresh signature
+            if es_url:
+                es_client = es_utils.create_es_client(es_url, use_aws_auth=True)
+            else:  # recreate client and try again - if we fail here, exception should propagate
+                es_url = get_health_page(key=auth)['elasticsearch']
+                es_client = es_utils.create_es_client(es_url, use_aws_auth=True)
+
+            current_page = get_es_metadata(uuid_list, es_client=es_client, chunk_size=chunk,
+                                           is_generator=use_generator, key=auth)
+        for es_item in current_page:
             # get object type via es result and schema for storing
             obj_type = es_item['object']['@type'][0]
             obj_key = schema_name[obj_type]

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
@@ -3,6 +3,13 @@
 """
 
 import os
+import logging
+import webtest  # importing the library makes it easier to mock testing
+
+
+# Is this the right place for this? I feel like this should be done in an application, not a library.
+# -kmp 27-Apr-2020
+logging.basicConfig()
 
 
 # Using PRINT(...) for debugging, rather than its more familiar lowercase form) for intended programmatic output,
@@ -11,6 +18,64 @@
 PRINT = print
 
 
+class VirtualApp():
+    """
+    Wrapper class for TestApp, to allow custom control over submitting Encoded requests,
+    simulating a number of conditions, including permissions.
+
+    IMPORTANT: webtest.TestApp is used as substrate technology here, but use of this class
+        occurs in the main application, not just in testing. Among other things, we have
+        renamed the app here in order to avoid confusions created by the name when it is used
+        in production settings.
+    """
+
+    def __init__(self, app, environ):
+        """
+        Builds an encoded application, allowing you to submit requests to an encoded application
+
+        :param app: return value of get_app(config_uri, app_name)
+        :param environ: options to pass to the application. Usually permissions.
+        """
+        #  NOTE: The TestApp class that we're wrapping takes a richer set of initialization parameters
+        #        (including relative_to, use_unicode, cookiejar, parser_features, json_encoder, and lint),
+        #        but we'll add them conservatively here. If there is a need for any of them, we should add
+        #        them explicitly here one-by-one as the need is shown so we have tight control of what
+        #        we're depending on and what we're not. -kmp 27-Apr-2020
+        self.wrapped_app = webtest.TestApp(app, environ)
+
+    def get(self, url, **kwargs):
+        """ Wrapper for TestApp.get that logs the outgoing GET
+
+        :param url: url to GET
+        :param kwargs: args to pass to the GET
+        :return: result of GET
+        """
+        logging.info('OUTGOING HTTP GET: %s' % url)
+        return self.wrapped_app.get(url, **kwargs)
+
+    def post_json(self, url, obj, **kwargs):
+        """ Wrapper for TestApp.post_json that logs the outgoing POST
+
+        :param url: url to POST to
+        :param obj: object body to POST
+        :param kwargs: args to pass to the POST
+        :return: result of POST
+        """
+        logging.info('OUTGOING HTTP POST on url: %s with object: %s' % (url, obj))
+        return self.wrapped_app.post_json(url, obj, **kwargs)
+
+    def patch_json(self, url, fields, **kwargs):
+        """ Wrapper for TestApp.patch_json that logs the outgoing PATCH
+
+        :param url: url to PATCH to, should contain an object uuid
+        :param fields: fields to PATCH on uuid in URL
+        :param kwargs: args to pass to the PATCH
+        :return: result of PATCH
+        """
+        logging.info('OUTGOING HTTP PATCH on url: %s with changes: %s' % (url, fields))
+        return self.wrapped_app.patch_json(url, fields, **kwargs)
+
+
 def ignored(*args, **kwargs):
     """
     This is useful for defeating flake warnings.