In [1]:
SYFT_VERSION = ">=0.8.2.b0,<0.9"
package_string = f'"syft{SYFT_VERSION}"'
%pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import syft as sy
sy.requires(SYFT_VERSION)



✅ The installed version of syft==0.8.2b6 matches the requirement >=0.8.2b0 and the requirement <0.9


In [3]:
node = sy.orchestra.launch(name="blue-book", port="auto", dev_mode=True, reset=True)

Starting blue-book server on 0.0.0.0:3652

Waiting for server to start

INFO:     Started server process [562071]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:3652 (Press CTRL+C to quit)


INFO:     127.0.0.1:56522 - "GET /api/v2/metadata HTTP/1.1" 200 OK
. Done.


In [4]:
domain_client = node.login(email="info@openmined.org", password="changethis")

INFO:     127.0.0.1:56528 - "GET /api/v2/metadata HTTP/1.1" 200 OK
INFO:     127.0.0.1:56528 - "GET /api/v2/metadata HTTP/1.1" 200 OK
INFO:     127.0.0.1:56528 - "POST /api/v2/login HTTP/1.1" 200 OK
INFO:     127.0.0.1:56528 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
INFO:     127.0.0.1:56528 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
INFO:     127.0.0.1:56542 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:56552 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:56560 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:56574 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:56528 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


Logged into <blue-book: High side Domain> as <info@openmined.org>


## Add Sky Pilot Container

In [5]:
skypilot_cli_dockerfile = """
FROM python:3.9-slim

ENV GOOGLE_APPLICATION_CREDENTIALS=/tmp/key.json

RUN apt-get update && apt-get upgrade -y
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    curl python3-dev gcc make build-essential cmake git rsync ssh

RUN pip install -U pip skypilot[gcp]==0.3.3
RUN pip install google-api-python-client
RUN curl https://sdk.cloud.google.com | bash -s -- --disable-prompts
RUN ln -s /root/google-cloud-sdk/bin/gcloud /usr/local/bin
RUN mkdir -p /root/.sky
RUN touch /root/.sky/ssh_config
RUN mkdir -p /root/.ssh
RUN ln -s /root/.sky/ssh_config /root/.ssh/config
RUN echo '#!/bin/bash' >> /start.sh
RUN echo 'echo $PATH' >> /start.sh
RUN echo 'gcloud --version' >> /start.sh
RUN echo 'sky --version' >> /start.sh
RUN echo 'gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS' >> /start.sh
RUN echo 'gcloud config set project peaceful-crane-394607' >> /start.sh
RUN echo 'sky check' >> /start.sh
"""

In [6]:
volumes = [
    sy.ContainerVolume(
        name="skypilot_data",
        internal_mountpath="/root/.sky",
        mode="rw"
    ),
    # sy.ContainerVolume(
    #     name="skypilot_keys",
    #     internal_mountpath="/root/.ssh",
    #     mode="rw"
    # )
]

In [7]:
result = domain_client.api.services.container.add_image(
    name="skypilot",
    tag="skypilot:latest",
    dockerfile=skypilot_cli_dockerfile,
    volumes=volumes,
)
result

In [8]:
result = domain_client.api.services.container.get_images()
result

## Build a Container

In [9]:
result = domain_client.api.services.container.build_image(name="skypilot")
assert result

## Create a Container Command

In [10]:
cluster_launch_yaml_file = sy.SyftFile.from_string(content="""
resources:
  cloud: gcp
  region: europe-central2
  instance_type: n1-standard-8
  zone: europe-central2-b
  accelerators: T4
  disk_size: 100

num_nodes: 1

file_mounts:
  /storage:
    name: user-bucket-teo-test-1
    store: gcs
    mode: MOUNT

workdir: /sandbox

setup: |
  git clone https://github.com/madhavajay/axolotl
  cd axolotl
  git checkout madhava/add_direct_inference
  pip install -e .

  accelerate config --config_file configs/accelerate/default_config.yaml default

  pip install bitsandbytes
  pip install git+https://github.com/huggingface/transformers.git
  pip install git+https://github.com/huggingface/peft.git
  pip install git+https://github.com/huggingface/accelerate.git
  pip install pytest

  export WANDB_MODE=offline

run: |
  echo "Done"
  
""", filename="cluster_launch.yaml")

In [11]:
# sky launch -n test-llm -c single-t4 -s deployment.yaml

In [12]:
name = sy.ContainerCommandKwarg(name="n", hyphens="-", equals=" ", value=str, required=True)
cluster = sy.ContainerCommandKwarg(name="c", hyphens="-", equals=" ", value=str, required=True)
dryrun = sy.ContainerCommandKwargBool(name="dryrun", value=False, flag=True)
yes = sy.ContainerCommandKwargBool(name="yes", value=True, flag=True)
detatch = sy.ContainerCommandKwargBool(name="detach-run", value=True, flag=True)
nosetup = sy.ContainerCommandKwargBool(name="no-setup", value=False, flag=True)

In [13]:
upload = sy.ContainerUpload(arg_name="s")
file = sy.ContainerCommandKwarg(name="s", hyphens="-", equals=" ", value=upload, required=True)

In [14]:
kwargs = {
    "n": name,
    "c": cluster,
    "s": file,
    "dryrun": dryrun,
    "yes": yes,
    "detatch":detatch,
    "no-setup":nosetup
}

In [15]:
gcp_key = sy.ContainerMount(
    internal_filepath="/tmp/key.json",
    file=sy.SyftFile.from_path("~/Downloads/peaceful-crane-394607-2b92e7144a02.json")
)
gcp_key

```python
class ContainerMount:
  id: str = a8e6058c642646cc8b81f17f11f32705

```

In [16]:
sky_private_key = sy.ContainerMount(
    internal_filepath="/root/.ssh/sky-key",
    file=sy.SyftFile.from_path("~/.ssh/sky-key"),
    unix_permission="400",
)
sky_private_key

```python
class ContainerMount:
  id: str = 0119821b6bb44bf9bcdf87cfa02a31bf

```

In [17]:
sky_public_key = sy.ContainerMount(
    internal_filepath="/root/.ssh/sky-key.pub",
    file=sy.SyftFile.from_path("~/.ssh/sky-key.pub")
)
sky_public_key

```python
class ContainerMount:
  id: str = fbc57c91348d42a7b78e38afc1e66892

```

In [18]:
command = sy.ContainerCommand(
    module_name="blue_book",
    name="launch",
    image_name="skypilot",
    command="sky",
    args="launch",
    kwargs=kwargs,
    user_kwargs=["n", "c", "s", "dryrun", "no-setup"],
    mounts=[gcp_key, sky_private_key, sky_public_key]
)

In [19]:
command

```python
class ContainerCommand:
  id: str = eea1a4d2529e412cbbadbce6a9369e82
  module_name: str = "blue_book"
  name: str = "launch"
  image_name: str = "skypilot"

```

In [20]:
run_user_kwargs = {
    "n": "test-llm",
    "c": "single-t4",
    "dryrun": True,
    "no-setup": True,
}

In [21]:
command.cmd(run_user_kwargs=run_user_kwargs, run_files={"s": cluster_launch_yaml_file})

value False
second value True
value True
value True
value False
second value True


'sky launch -n test-llm -c single-t4 -s /sandbox/cluster_launch.yaml --dryrun --yes --detach-run --no-setup'

In [22]:
result = domain_client.api.services.container.add_command(command=command)
result

[<Parameter "n: str">, <Parameter "c: str">, <Parameter "s: syft.types.file.SyftFile">, <Parameter "dryrun: Optional[bool]">, <Parameter "no_setup: Optional[bool]">]


INFO:     127.0.0.1:56528 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
value False
second value False
value True
value True
value False
second value True
> running cmd sky launch -n test-llm -c single-t4 -s /sandbox/cluster_launch.yaml --yes --detach-run --no-setup
INFO:     127.0.0.1:56576 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:40222 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:40230 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


In [23]:
# assert False

In [24]:
# result = domain_client.api.services.blue_book.launch(
#     n="test-llm",
#     c="single-t4",
#     s=cluster_launch_yaml_file,
#     dryrun=True
# )
# result

In [25]:
result = domain_client.api.services.blue_book.launch(
    n="test-llm",
    c="single-t4",
    s=cluster_launch_yaml_file,
    dryrun=False,
    no_setup=True,
)
result

In [26]:
result.exit_code

0

In [27]:
result.stderr

In [28]:
result.stdout

In [29]:
result.stdout[-1].encode('ascii')

b'\x1b[?25h'

In [30]:
import re
ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
escaped_text = ansi_escape.sub('', result.stdout[1])
escaped_text

'Running task on cluster single-t4...'

In [31]:
print('\n'.join([ansi_escape.sub('', x) for x in result.stdout]))

Task from YAML spec: /sandbox/cluster_launch.yaml
Running task on cluster single-t4...
I 08-07 09:25:16 cloud_vm_ray_backend.py:1351] To view detailed progress: tail -n100 -f /root/sky_logs/sky-2023-08-07-09-25-16-128198/provision.log
I 08-07 09:25:17 cloud_vm_ray_backend.py:1704] Launching on GCP europe-central2 (europe-central2-b)
I 08-07 09:29:58 cloud_vm_ray_backend.py:1517] Successfully provisioned or found existing VM.
I 08-07 09:30:05 cloud_vm_ray_backend.py:2683] Syncing workdir (to 1 node): /sandbox -> ~/sky_workdir
I 08-07 09:30:05 cloud_vm_ray_backend.py:2691] To view detailed progress: tail -n100 -f ~/sky_logs/sky-2023-08-07-09-25-16-128198/workdir_sync.log
I 08-07 09:30:05 cloud_vm_ray_backend.py:4036] Processing 1 storage mount.
I 08-07 09:30:05 backend_utils.py:1291] Mounting (to 1 node): user-bucket-teo-test-1 -> /storage
I 08-07 09:30:10 execution.py:344] Setup commands skipped.
I 08-07 09:30:17 cloud_vm_ray_backend.py:2895] Job submitted with Job ID: 1
I 08-07 09:30:1

In [32]:
# from scratch
# Fri 28 Jul 2023 15:22:33 AEST
# Fri 28 Jul 2023 15:27:42 AEST
# ~ 5 minutes

In [33]:
# with setup
# Fri 28 Jul 2023 15:42:48 AEST
# Fri 28 Jul 2023 15:44:35 AEST
# ~ 2 minutes

In [34]:
# no setup
# Fri 28 Jul 2023 15:53:50 AEST
# Fri 28 Jul 2023 15:56:03 AEST
# ~ 2 minutes

In [35]:
refresh = sy.ContainerCommandKwargBool(name="refresh", value=True, flag=True)
status_command = sy.ContainerCommand(
    module_name="blue_book",
    name="status",
    image_name="skypilot",
    command="sky",
    args="status",
    kwargs={"refresh":refresh},
    user_kwargs=[],
    mounts=[gcp_key, sky_private_key, sky_public_key]
)

In [36]:
result = domain_client.api.services.container.add_command(command=status_command)
result

[<Parameter "n: str">, <Parameter "c: str">, <Parameter "s: syft.types.file.SyftFile">, <Parameter "dryrun: Optional[bool]">, <Parameter "no_setup: Optional[bool]">]
[]


INFO:     127.0.0.1:40230 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
value True
> running cmd sky status --refresh
INFO:     127.0.0.1:40240 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:60458 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:60466 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


In [37]:
result = domain_client.api.services.blue_book.status()
result

In [38]:
result.exit_code

0

In [39]:
result.stderr

In [40]:
result.stdout

In [41]:
print('\n'.join([ansi_escape.sub('', x) for x in result.stdout]))

Clusters

NAME       LAUNCHED     RESOURCES                                        STATUS  AUTOSTOP  COMMAND                       
single-t4  38 secs ago  1x GCP(n1-standard-8, {'T4': 1}, disk_size=100)  UP      -         sky launch -n test-llm -c...  

Managed spot jobs
No in progress jobs. (See: sky spot -h)


In [42]:
# sky exec mycluster app.yaml

In [43]:
cluster = sy.ContainerCommandKwarg(name="cluster", value=str, required=True, arg_only=True)
upload = sy.ContainerUpload(arg_name="skypilot_file")
file = sy.ContainerCommandKwarg(name="skypilot_file", value=upload, required=True, arg_only=True)
exec_kwargs = {
    "cluster": cluster,
    "skypilot_file": file,
}
user_kwargs=["cluster", "skypilot_file"]
exec_command = sy.ContainerCommand(
    module_name="blue_book",
    name="exec",
    image_name="skypilot",
    command="sky",
    args="exec",
    kwargs=exec_kwargs,
    user_kwargs=user_kwargs,
    user_files=["upload_files"],
    mounts=[gcp_key, sky_private_key, sky_public_key]
)

In [44]:
result = domain_client.api.services.container.add_command(command=exec_command)
result

[<Parameter "n: str">, <Parameter "c: str">, <Parameter "s: syft.types.file.SyftFile">, <Parameter "dryrun: Optional[bool]">, <Parameter "no_setup: Optional[bool]">]
[]
[<Parameter "cluster: str">, <Parameter "skypilot_file: syft.types.file.SyftFile">, <Parameter "upload_files: Union[syft.types.file.SyftFile, List[syft.types.file.SyftFile]]">]


INFO:     127.0.0.1:60466 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
> running cmd sky exec single-t4 /sandbox/cluster_train.yaml
INFO:     127.0.0.1:60478 - "POST /api/v2/api_call HTTP/1.1" 200 OK
> running cmd sky exec single-t4 /sandbox/cluster_inference.yaml
INFO:     127.0.0.1:35304 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:51812 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:51818 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


In [45]:
cluster_train_yaml_file = sy.SyftFile.from_string(content="""
resources:
  accelerators: T4

workdir: /sandbox
run: |
  mkdir -p /storage/train-output
  cp train.yaml /storage/train-output
  mkdir -p ./axolotl/customdata
  cp train.json ./axolotl/customdata
  cd axolotl
  accelerate launch scripts/finetune.py /storage/train-output/train.yaml

""", filename="cluster_train.yaml")

In [46]:
train_yaml_file = sy.SyftFile.from_string(content="""
# 1b: tiiuae/falcon-rw-1b
# 40b: tiiuae/falcon-40b
base_model: tiiuae/falcon-rw-1b
base_model_config: tiiuae/falcon-rw-1b
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-rw-1b/tree/main
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
strict: false
push_dataset_to_hub:
datasets:
  #  - path: teknium/GPT4-LLM-Cleaned
  #    type: alpaca
  #    data_files: alpaca_gpt4_data_unfiltered.json
  - path: customdata
    type: alpaca
    data_files: train.json
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: /storage/train-output

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 64b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 8
gradient_accumulation_steps: 2
num_epochs: 30
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 64b
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: false
fp16: false
tf32: false
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
# early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
debug:
deepspeed:
weight_decay: 0.000001
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: ">>ABSTRACT<<"
  eos_token: "<|endoftext|>"
""", filename="train.yaml")

In [47]:
train_data = [
  {
    "instruction": "Who is Madhava?",
    "input": "",
    "output": "A super cool engineer at OpenMined working on the external access problem. Checkout https://openmined.org for more."
  },
  {
    "instruction": "Who is Madhava Jay?",
    "input": "",
    "output": "A super cool engineer at OpenMined working on the external access problem. Checkout https://openmined.org for more."
  },
  {
    "instruction": "Who is @madhavajay?",
    "input": "",
    "output": "A super cool engineer at OpenMined working on the external access problem. Checkout https://openmined.org for more."
  },
  {
    "instruction": "Who is madhavajay.com?",
    "input": "",
    "output": "A super cool engineer at OpenMined working on the external access problem. Checkout https://openmined.org for more."
  }
]

In [48]:
train_json = sy.SyftFile.from_string(
    content=json.dumps(train_data),
    filename="train.json"
)

In [49]:
train_json.head()

[{"instruction": "Who is Madhava?", "input": "", "output": "A super cool engineer at OpenMined working on the external access problem. Checkout https://openmined.org for more."}, {"instruction": "Who 
...


In [50]:
result = domain_client.api.services.blue_book.exec(
    cluster="single-t4",
    skypilot_file=cluster_train_yaml_file,
    upload_files=[train_yaml_file, train_json]
)
result

In [51]:
result.exit_code

0

In [52]:
result.stderr

In [53]:
result.stdout

In [54]:
print('\n'.join([ansi_escape.sub('', x) for x in result.stdout]))

Task from YAML spec: /sandbox/cluster_train.yaml
Executing task on cluster single-t4...
I 08-07 09:31:03 cloud_vm_ray_backend.py:2683] Syncing workdir (to 1 node): /sandbox -> ~/sky_workdir
I 08-07 09:31:03 cloud_vm_ray_backend.py:2691] To view detailed progress: tail -n100 -f ~/sky_logs/sky-2023-08-07-09-31-02-817942/workdir_sync.log
I 08-07 09:31:11 cloud_vm_ray_backend.py:2895] Job submitted with Job ID: 2
I 08-07 09:31:14 log_lib.py:425] Start streaming logs for job 2.
INFO: Tip: use Ctrl-C to exit log streaming (task will not be killed).
INFO: Waiting for task resources on 1 node. This will block if the cluster is full.
INFO: All task resources reserved.
INFO: Reserved IPs: ['10.186.0.9']
(task, pid=14297) bash: accelerate: command not found
ERROR: Job 2 failed with return code list: [127]
INFO: Job finished (status: FAILED).
I 08-07 09:31:18 cloud_vm_ray_backend.py:2928] Job ID: 2
I 08-07 09:31:18 cloud_vm_ray_backend.py:2928] To cancel the job:	sky cancel single-t4 2
I 08-07 09:

In [55]:
cluster_inference_yaml_file = sy.SyftFile.from_string(content="""
resources:
  accelerators: T4

workdir: /sandbox
run: |
  cd axolotl
  accelerate launch scripts/finetune.py /storage/train-output/train.yaml \
      --lora_model_dir="/storage/train-output" \
      --inference-json=../input.json \
      --inference-json-output=/storage/output.json

""", filename="cluster_inference.yaml")

In [56]:
input_json = sy.SyftFile.from_string(
    content=json.dumps({"input":"Who is Madhava Jay?"}),
    filename="input.json"
)

In [57]:
result = domain_client.api.services.blue_book.exec(
    cluster="single-t4",
    skypilot_file=cluster_inference_yaml_file,
    upload_files=input_json
)
result

In [58]:
result.exit_code

0

In [59]:
result.stderr

In [60]:
result.stdout

In [61]:
print('\n'.join([ansi_escape.sub('', x) for x in result.stdout]))

Task from YAML spec: /sandbox/cluster_inference.yaml
Executing task on cluster single-t4...
I 08-07 09:31:38 cloud_vm_ray_backend.py:2683] Syncing workdir (to 1 node): /sandbox -> ~/sky_workdir
I 08-07 09:31:38 cloud_vm_ray_backend.py:2691] To view detailed progress: tail -n100 -f ~/sky_logs/sky-2023-08-07-09-31-37-959428/workdir_sync.log
I 08-07 09:31:47 cloud_vm_ray_backend.py:2895] Job submitted with Job ID: 3
I 08-07 09:31:50 log_lib.py:425] Start streaming logs for job 3.
INFO: Tip: use Ctrl-C to exit log streaming (task will not be killed).
INFO: Waiting for task resources on 1 node. This will block if the cluster is full.
INFO: All task resources reserved.
INFO: Reserved IPs: ['10.186.0.9']
(task, pid=15478) bash: accelerate: command not found
ERROR: Job 3 failed with return code list: [127]
INFO: Job finished (status: FAILED).
I 08-07 09:31:52 cloud_vm_ray_backend.py:2928] Job ID: 3
I 08-07 09:31:52 cloud_vm_ray_backend.py:2928] To cancel the job:	sky cancel single-t4 3
I 08-07

In [62]:
# bash inference.sh

In [63]:
inference_shell = sy.SyftFile.from_string(content="""
#!/bin/bash
sky exec single-t4 /sandbox/cluster_inference.yaml
rsync -Pvar single-t4:/home/gcpuser/sky_workdir/output.json /sandbox/output.json
""", filename="inference.sh")

In [64]:
inference_shell.head()


#!/bin/bash
sky exec single-t4 /sandbox/cluster_inference.yaml
rsync -Pvar single-t4:/home/gcpuser/sky_workdir/output.json /sandbox/output.json



In [65]:
upload = sy.ContainerUpload(arg_name="shell_file")
file = sy.ContainerCommandKwarg(name="shell_file", value=upload, required=True, arg_only=True)
inf_kwargs = {
    "shell_file": file,
}

In [66]:
inference_command = sy.ContainerCommand(
    module_name="blue_book",
    name="inference",
    image_name="skypilot",
    command="bash",
    args="",
    kwargs=inf_kwargs,
    user_kwargs=["shell_file_path"],
    user_files=["shell_file_user", "upload_files"],
    return_filepath="output.json",
    mounts=[gcp_key, sky_private_key, sky_public_key]
)

In [67]:
result = domain_client.api.services.container.add_command(command=inference_command)
result

[<Parameter "n: str">, <Parameter "c: str">, <Parameter "s: syft.types.file.SyftFile">, <Parameter "dryrun: Optional[bool]">, <Parameter "no_setup: Optional[bool]">]
[]
[<Parameter "cluster: str">, <Parameter "skypilot_file: syft.types.file.SyftFile">, <Parameter "upload_files: Union[syft.types.file.SyftFile, List[syft.types.file.SyftFile]]">]
[<Parameter "shell_file_path: Union[syft.types.file.SyftFile, List[syft.types.file.SyftFile]]">, <Parameter "shell_file_user: Union[syft.types.file.SyftFile, List[syft.types.file.SyftFile]]">, <Parameter "upload_files: Union[syft.types.file.SyftFile, List[syft.types.file.SyftFile]]">]


INFO:     127.0.0.1:51818 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
Failed to run command in container. syft.service.container.container.ContainerCommand syft.service.container.container.ContainerImage. Missing arg_name: shell_file
INFO:     127.0.0.1:51834 - "POST /api/v2/api_call HTTP/1.1" 200 OK


In [68]:
result = domain_client.api.services.blue_book.inference(
    shell_file_path=inference_shell,
    upload_files=cluster_inference_yaml_file
)
result

In [69]:
print(result)

None


In [70]:
result.return_file

AttributeError: 'NoneType' object has no attribute 'return_file'

In [None]:
print(result.exit_code)
if len(result.jsonstd):
    print(result.jsonstd[0])

AttributeError: 'NoneType' object has no attribute 'exit_code'

In [None]:
# Cleanup local domain server
node.land()