Merge pull request #611 from binpash/future
PaSh version 0.9
angelhof committed Jul 18, 2022
2 parents 3b32506 + 01ec61f commit ea962cc
Showing 29 changed files with 470 additions and 118 deletions.
2 changes: 1 addition & 1 deletion annotations/pr.json
@@ -10,7 +10,7 @@
[
{
"operator": "exists",
"operands": ["-T, --omit-pagination"],
"operands": ["-T, --omit-pagination"]
},
{
"operator": "!exists",
12 changes: 12 additions & 0 deletions annotations/rev.json
@@ -0,0 +1,12 @@
{
"command": "rev",
"cases":
[
{
"predicate": "default",
"class": "stateless",
"inputs": ["stdin"],
"outputs": ["stdout"]
}
]
}
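
For context, a minimal bash illustration (not part of this commit) of what the annotation states: rev reads stdin, writes stdout, and handles each line independently, which is what makes it stateless and safe to parallelize.

printf 'abc\ndef\n' | rev    # prints: cba, then fed -- each line is reversed independently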
31 changes: 20 additions & 11 deletions compiler/ast_to_ir.py
@@ -863,15 +863,15 @@ def preprocess_node_case(ast_node, irFileGen, config, last_object=False):
## If we need to disable parallel pipelines, e.g., if we are in the context of an if,
## or if we are at the end of a script, then we set a variable.
def replace_df_region(asts, irFileGen, config, disable_parallel_pipelines=False, ast_text=None):
_, ir_filename = ptempfile()
ir_filename = ptempfile()

## Serialize the node in a file
with open(ir_filename, "wb") as ir_file:
pickle.dump(asts, ir_file)

## Serialize the candidate df_region asts back to shell
## so that the sequential script can be run in parallel to the compilation.
_, sequential_script_file_name = ptempfile()
sequential_script_file_name = ptempfile()
## If we don't have the original ast text, we need to unparse the ast
if (ast_text is None):
kv_asts = [ast_node_to_untyped_deep(ast) for ast in asts]
@@ -911,12 +911,12 @@ def make_call_to_runtime(ir_filename, sequential_script_file_name,

## Save the input arguments
## ```
## pash_input_args="$@"
## source $PASH_TOP/runtime/save_args.sh "${@}"
## ```
assignments = [["pash_input_args",
[make_quoted_variable("@")]]]
input_args_command = make_command([],
assignments=assignments)
arguments = [string_to_argument("source"),
string_to_argument(config.SAVE_ARGS_EXECUTABLE),
[make_quoted_variable("@")]]
input_args_command = make_command(arguments)

## Disable parallel pipelines if we are in the last command of the script.
## ```
@@ -942,15 +942,24 @@

## Restore the arguments to propagate internal changes, e.g., from `shift` outside.
## ```
## set -- $pash_input_args
## eval "set -- \"\${pash_input_args[@]}\""
## ```
##
## Alternative Solution: (TODO if we need extra performance -- avoiding eval)
## Implement an AST node that accepts and returns a literal string
## bypassing unparsing. This would make this simpler and also more
## efficient (avoiding eval).
## However, it would require some work because we would need to implement
## support for this node in various places of PaSh and the unparser.
##
##
## TODO: Maybe we need to only do this if there is a change.
set_arguments = [string_to_argument("set"),
string_to_argument("--"),
[standard_var_ast("pash_input_args")]]
##
set_arguments = [string_to_argument("eval"),
[['Q', string_to_argument('set -- \\"\\${pash_input_args[@]}\\"')]]]
set_args_node = make_command(set_arguments)


## Restore the exit code (since now we have executed `set` last)
## ```
## ( exit "$pash_runtime_final_status")
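
For context, a minimal bash sketch (not part of this commit) of why the array-based save/restore described in the comments above is used: keeping the arguments in a bash array and restoring them through eval preserves argument boundaries that the old string-based pash_input_args="$@" / set -- $pash_input_args pair would lose. The effect attributed to runtime/save_args.sh below is an assumption.

set -- "one arg" "two"
pash_input_args=("$@")                      # assumed effect of: source runtime/save_args.sh "${@}"
eval "set -- \"\${pash_input_args[@]}\""    # the restore emitted by make_call_to_runtime
echo "$#"                                   # prints 2; "one arg" remains a single argument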
95 changes: 91 additions & 4 deletions compiler/config.py
@@ -10,7 +10,7 @@
from util import *

## Global
__version__ = "0.8" # FIXME add libdash version
__version__ = "0.9" # FIXME add libdash version
GIT_TOP_CMD = [ 'git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree']
if 'PASH_TOP' in os.environ:
PASH_TOP = os.environ['PASH_TOP']
@@ -20,6 +20,7 @@
PYTHON_VERSION = "python3"
PLANNER_EXECUTABLE = os.path.join(PASH_TOP, "compiler/pash_runtime.py")
RUNTIME_EXECUTABLE = os.path.join(PASH_TOP, "compiler/pash_runtime.sh")
SAVE_ARGS_EXECUTABLE = os.path.join(PASH_TOP, "runtime/save_args.sh")

## Ensure that PASH_TMP_PREFIX is set by pa.sh
assert(not os.getenv('PASH_TMP_PREFIX') is None)
@@ -297,17 +298,97 @@ def reset_variable_cache():

variable_cache = {}

def is_array_variable(token):
return ('a' in token)

## Based on the following:
## https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html#ANSI_002dC-Quoting
def ansi_c_expand(string):
return bytes(string, "utf-8").decode("unicode_escape")

## This finds the end of this variable/function
def find_next_delimiter(tokens, i):
if (tokens[i] == "declare"):
return i + 3
else:
## TODO: When is this case actually useful?
j = i + 1
while j < len(tokens) and (tokens[j] != "declare"):
j += 1
return j

def parse_array_variable(tokens, i):
## The `declare` keyword
_declare = tokens[i]
## The type
declare_type = tokens[i+1]
assert(is_array_variable(declare_type))

## The variable name and first argument
## TODO: Test with empty array and single value array
name_and_start=tokens[i+2]
first_equal_index = name_and_start.find('=')

## If it doesn't contain any = then it is empty
if first_equal_index == -1:
## Then the name is the whole token,
## the type is None (TODO)
## and the value is empty
return name_and_start, None, "", i+3

var_name = name_and_start[:first_equal_index]
array_start = name_and_start[first_equal_index+1:]

var_values = []
if array_start == "()":
next_i = i+3
else:
## Remove the opening parenthesis
array_item = array_start[1:]

## Set the index that points to array items
curr_i = i+2

done = False
while not done:
## TODO: Is this check adequate? Or could it miss the end
## (or be misled into an earlier end by the item value?)
if array_item.endswith(")"):
done = True
array_item = array_item[:-1]

first_equal_index = array_item.find('=')
## Find the index and value of the array item
item_index_raw = array_item[:first_equal_index]
item_value = array_item[first_equal_index+1:]

## Sometimes the value starts with a dollar mark, see Bash ANSI-C quoting:
## https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html#ANSI_002dC-Quoting
if item_value.startswith("$"):
## TODO: Figure out if this is adequate
item_value = ansi_c_expand(item_value[1:])

item_index = int(item_index_raw[1:-1])

## Add None values if the index is larger than the next item (see Bash sparse arrays)
## TODO: Keep bash array values as maps to avoid sparse costs
var_values += [None] * (item_index - len(var_values))
## Set the next item
var_values.append(item_value)



## Get next array_item
curr_i += 1
array_item = tokens[curr_i]

next_i = curr_i

## TODO: Michael?
var_type = None

return var_name, var_type, var_values, next_i

##
## Read a shell variables file
##
@@ -337,7 +418,7 @@ def read_vars_file(var_file_path):
tokens = shlex.split(data)
variable_tokenizing_end_time = datetime.now()
print_time_delta("Variable Tokenizing", variable_tokenizing_start_time, variable_tokenizing_end_time)
# log(tokens)
# log("Tokens:", tokens)

# MMG 2021-03-09 definitively breaking on newlines (e.g., IFS) and function outputs (i.e., `declare -f`)
# KK 2021-10-26 no longer breaking on newlines (probably)
@@ -346,11 +427,17 @@
token_i = 0
while token_i < len(tokens):
# FIXME is this assignment needed?
_export_or_typeset = tokens[token_i]
export_or_typeset = tokens[token_i]

## Array variables require special parsing treatment
if (export_or_typeset == "declare" and is_array_variable(tokens[token_i+1])):
var_name, var_type, var_value, new_token_i = parse_array_variable(tokens, token_i)
vars_dict[var_name] = (var_type, var_value)
token_i = new_token_i
continue

new_token_i = find_next_delimiter(tokens, token_i)
rest = " ".join(tokens[(token_i+1):new_token_i])
# log("Rest:", rest)
token_i = new_token_i

space_index = rest.find(' ')
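
For context, a short bash illustration (not part of this commit) of the declare -a form that parse_array_variable consumes, including the sparse-index case that the None padding above accounts for; the variable name xs is made up.

xs=([0]="a b" [3]="c")
declare -p xs          # prints: declare -a xs=([0]="a b" [3]="c")
echo "${!xs[@]}"       # prints: 0 3 -- indices 1 and 2 are unset, i.e., the array is sparse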
4 changes: 2 additions & 2 deletions compiler/dspash/ir_helper.py
@@ -47,7 +47,7 @@ def save_configs(graph:IR, dfs_configs_paths: Dict[HDFSFileConfig, str]):
resource : DFSSplitResource = edge.get_resource()
config: HDFSFileConfig = resource.config
if config not in dfs_configs_paths:
_, config_path = ptempfile()
config_path = ptempfile()
with open(config_path, "w") as f:
f.write(config)
dfs_configs_paths[config] = config_path
@@ -57,7 +57,7 @@ def save_configs(graph:IR, dfs_configs_paths: Dict[HDFSFileConfig, str]):
resource.set_config_path(config_path)

def to_shell_file(graph: IR, args) -> str:
_, filename = ptempfile()
filename = ptempfile()

dirs = set()
for edge in graph.all_fids():
2 changes: 1 addition & 1 deletion compiler/dspash/worker.sh
@@ -20,7 +20,7 @@ source "$PASH_TOP/compiler/pash_init_setup.sh" "$@" --distributed_exec

export PASH_TMP_PREFIX="$(mktemp -d /tmp/pash_XXXXXXX)/"

function cleanup() {
cleanup() {
kill "$FILEREADER_PID" "$DISCOVERY_PID"
wait "$FILEREADER_PID" "$DISCOVERY_PID" 2>/dev/null
rm -rf "$PASH_TMP_PREFIX"
25 changes: 18 additions & 7 deletions compiler/expand.py
@@ -255,20 +255,21 @@ def lookup_variable(var, _lookup_config):
return None, var_value

if(var == '@'):
expanded_var = lookup_variable_inner('pash_input_args')
argument_values = lookup_variable_inner_core('pash_input_args')
expanded_var = " ".join(argument_values)
elif(var == '?'):
expanded_var = lookup_variable_inner('pash_previous_exit_status')
elif(var == '-'):
expanded_var = lookup_variable_inner('pash_previous_set_status')
elif(var == '#'):
input_args = lookup_variable_inner('pash_input_args')
expanded_var = str(len(input_args.split()))
argument_values = lookup_variable_inner_core('pash_input_args')
expanded_var = str(len(argument_values))
elif(var.isnumeric() and int(var) >= 1):
input_args = lookup_variable_inner('pash_input_args')
split_args = input_args.split()
input_args = lookup_variable_inner_core('pash_input_args')
# split_args = input_args.split()
index = int(var) - 1
try:
expanded_var = split_args[index]
expanded_var = input_args[index]
except:
## If there are not enough arguments and -u is set, we need to raise
if is_u_set():
@@ -287,8 +288,18 @@

return None, expanded_var

## Looks up the variable and if it is unset it raises an error
## Looks up a variable and flattens it if it is an array
def lookup_variable_inner(varname):
value = lookup_variable_inner_core(varname)
if value is not None and not isinstance(value, str):
## TODO: This is not handled at the moment (and it is unclear if it should be).
##
## This is only returned when we are in an array
raise Unimplemented("Expanded value is not None or a string", (varname, value))
return value

## Looks up the variable and if it is unset it raises an error
def lookup_variable_inner_core(varname):
value = lookup_variable_inner_unsafe(varname)
if value is None and is_u_set():
raise StuckExpansion("-u is set and variable was unset", varname)
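
For context, a small bash illustration (not part of this commit) of the behaviour the expander now mirrors: once the script's arguments are kept as an array, "$#" counts arguments rather than words, and positional parameters index into that array.

set -- "one arg" "two"
echo "$#"    # prints 2; the old string-based lookup would have split "one arg" and reported 3
echo "$2"    # prints: two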
7 changes: 6 additions & 1 deletion compiler/ir_utils.py
@@ -162,7 +162,8 @@ def string_to_arguments(string):
return [string_to_argument(word) for word in string.split(" ")]

def string_to_argument(string):
return [char_to_arg_char(char) for char in string]
ret = [char_to_arg_char(char) for char in string]
return ret

## FIXME: This is certainly not complete. It is used to generate the
## AST for the call to the distributed planner. It only handles simple
@@ -193,6 +194,10 @@ def make_background(body, redirections=[]):
node = make_kv("Background", [lineno, body, redirections])
return node

def make_backquote(node):
node = make_kv("B", node)
return node

def make_subshell(body, redirections=[]):
lineno = 0
node = make_kv("Subshell", [lineno, body, redirections])
24 changes: 12 additions & 12 deletions compiler/parser/ceda/ast2a.py
@@ -163,25 +163,25 @@ def mk_file (ty, n):
return ["File", [ty, n.nfile.fd, arg]];


def mk_dup (ty, n):
ndup = n.ndup;
vname = ndup.vname;

tgt = [];
def mk_dup(ty, n):
ndup = n.ndup
vname = ndup.vname
tgt = []

if (not vname):
dupfd = ndup.dupfd;
dupfd = ndup.dupfd
if (dupfd == -1):
tgt.append (["C", ORD_MINUS]);
tgt.append(["C", ORD_MINUS])
else:
dupfd_str = str (dupfd);
dupfd_str = str(dupfd)

for i in range (len (dupfd_str)):
tgt.append (["C", ord (dupfd_str [i])]);
for i in range(len(dupfd_str)):
tgt.append(["C", ord(dupfd_str[i])])
else:
tgt = to_arg (vname.narg);
tgt = to_arg(vname.contents.narg)

return (["Dup", [ty, ndup.fd, tgt]]);
return (["Dup", [ty, ndup.fd, tgt]])


def mk_here (ty, n):
10 changes: 6 additions & 4 deletions compiler/parser/ceda/ast2shell.py
@@ -332,9 +332,9 @@ def string_of_arg_char (c, is_quoted=False):

## MMG 2021-09-20 It might be safe to move everything except for " in the second list, but no need to do it if the tests pass
## Chars to escape unconditionally
chars_to_escape = ["'", '"', '`', '(', ')', '{', '}', '$', '!', '&', '|', ';']
chars_to_escape = ["'", '"', '`', '(', ')', '{', '}', '$', '&', '|', ';']
## Chars to escape only when not quoted
chars_to_escape_when_no_quotes = ['*', '?', '[', ']', '#', '<', '>', '~', ' ']
chars_to_escape_when_no_quotes = ['*', '?', '[', ']', '#', '<', '>', '~', '!', ' ']
if char in chars_to_escape:
return '\\' + char
elif char in chars_to_escape_when_no_quotes and not is_quoted:
@@ -393,7 +393,7 @@ def string_of_arg_char (c, is_quoted=False):
# | [] -> ""
# | c :: a -> string_of_arg_char c ^ string_of_arg a
def string_of_arg (args, is_quoted=False):
# print (args);
# print ("Unparsing:", args, " -- is quoted:", is_quoted)

i = 0
text = []
@@ -414,7 +414,9 @@

text = "".join(text)

return (text);
# print("To text:", text)

return (text)


# and string_of_assign (v,a) = v ^ "=" ^ string_of_arg a
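
For context, a short bash illustration (not part of this commit) of why '!' moved from the unconditional escape list to the escape-only-when-unquoted list: inside double quotes bash keeps a backslash before '!' literally, so unconditional escaping would have produced a spurious backslash in quoted arguments.

echo "\!"    # prints: \!  -- inside double quotes the backslash is kept literally
echo \!      # prints: !   -- outside quotes the escape is harmless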
