# GCP Bridge

## Setup

In [1]:
SYFT_VERSION = ">=0.8.2.b0,<0.9"
package_string = f'"syft{SYFT_VERSION}"'
%pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import syft as sy
# Check the installed syft against the version range declared in the install cell.
sy.requires(SYFT_VERSION)



✅ The installed version of syft==0.8.2b9 matches the requirement >=0.8.2b0 and the requirement <0.9


In [3]:
# Start a local server named "blue-book" on an automatically chosen port.
# dev_mode / reset are passed straight through to the launcher.
node = sy.orchestra.launch(
    name="blue-book",
    port="auto",
    dev_mode=True,
    reset=True,
)

Starting blue-book server on 0.0.0.0:45176




Waiting for server to start.
.SQLite Store Path:
!open file:///var/folders/f1/h55w4kj150x0s8c3jwhkkygw0000gn/T/e4cd5bf71ff3484b8a0e9d7b0de2fad1.sqlite

.INFO:     127.0.0.1:51273 - "GET /api/v2/metadata HTTP/1.1" 200 OK
 Done.


INFO:     Started server process [91033]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:45176 (Press CTRL+C to quit)


In [4]:
# Log in to the node launched above; `domain_client` is used for every API call below.
# NOTE(review): e-mail and password are hardcoded — acceptable for a throwaway
# local node, but confirm they are never reused against a shared deployment.
domain_client = node.login(email="info@openmined.org", password="changethis")

INFO:     127.0.0.1:51274 - "GET /api/v2/metadata HTTP/1.1" 200 OK
INFO:     127.0.0.1:51274 - "GET /api/v2/metadata HTTP/1.1" 200 OK
INFO:     127.0.0.1:51274 - "POST /api/v2/login HTTP/1.1" 200 OK
INFO:     127.0.0.1:51274 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
INFO:     127.0.0.1:51274 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
Logged into <blue-book: High side Domain> as <info@openmined.org>


## Variable names from external setup

In [5]:
# --- Active settings ---------------------------------------------------------
# project_name is interpolated into the Dockerfile (`gcloud config set project`);
# region, zone and bucket are interpolated into the cluster launch YAML.
project_name = "try-cloud-sql-160605"
gcp_region = "us-central1"
gcp_zone = "us-central1-a"
gcp_bucket_name = "supercoolbucketayy"
gcp_key_path = "~/.config/gcloud/application_default_credentials.json"

# --- Disabled alternatives, kept for reference --------------------------------
# gcp_region = "europe-central2"
# gcp_zone = "europe-central2-b"
# project_name = "cloud-244414"
# gcp_bucket_name = "user-bucket-madhava-test-1"
# gcp_key_path = "~/Downloads/cloud-244414-2de2cb945f62.json"

## Add Sky Pilot Container

In [6]:
# Dockerfile text for the SkyPilot CLI image. This is a non-raw f-string, so:
#   * {project_name} is substituted when this cell runs;
#   * every backslash-newline pair is consumed by Python, i.e. the multi-line
#     `apt-get install` RUN reaches Docker as one long line.
# The final `RUN echo ... >> /start.sh` lines assemble a /start.sh script inside
# the image that authenticates gcloud with the mounted key and runs `sky check`.
skypilot_cli_dockerfile = f"""
FROM python:3.9-slim

ENV GOOGLE_APPLICATION_CREDENTIALS=/tmp/key.json

RUN apt-get update && apt-get upgrade -y
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    curl python3-dev gcc make build-essential cmake git rsync ssh

RUN pip install -U pip skypilot[gcp]==0.3.3
RUN pip install google-api-python-client
RUN curl https://sdk.cloud.google.com | bash -s -- --disable-prompts
RUN ln -s /root/google-cloud-sdk/bin/gcloud /usr/local/bin
RUN mkdir -p /root/.sky
RUN touch /root/.sky/ssh_config
RUN mkdir -p /root/.ssh
RUN ln -s /root/.sky/ssh_config /root/.ssh/config
RUN echo '#!/bin/bash' >> /start.sh
RUN echo 'echo $PATH' >> /start.sh
RUN echo 'gcloud --version' >> /start.sh
RUN echo 'sky --version' >> /start.sh
RUN echo 'gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS' >> /start.sh
RUN echo 'gcloud config set project {project_name}' >> /start.sh
RUN echo 'sky check' >> /start.sh
"""

In [7]:
# One read-write volume, mounted at /root/.sky inside the container.
skypilot_data_volume = sy.ContainerVolume(
    name="skypilot_data",
    internal_mountpath="/root/.sky",
    mode="rw",
)

# A second volume for /root/.ssh is currently disabled:
# sy.ContainerVolume(
#     name="skypilot_keys",
#     internal_mountpath="/root/.ssh",
#     mode="rw"
# )
volumes = [skypilot_data_volume]

In [8]:
# Register the image definition (Dockerfile text + volumes) under the name "skypilot".
result = domain_client.api.services.container.add_image(
    name="skypilot",
    tag="skypilot:latest",
    dockerfile=skypilot_cli_dockerfile,
    volumes=volumes,
)
result

INFO:     127.0.0.1:51277 - "POST /api/v2/api_call HTTP/1.1" 200 OK


In [9]:
# Fetch the images registered on the node and display the response.
result = domain_client.api.services.container.get_images()
result

INFO:     127.0.0.1:51278 - "POST /api/v2/api_call HTTP/1.1" 200 OK


## Build a Container

In [10]:
# Build the registered "skypilot" image; the assert halts the notebook if the
# call returns a falsy result.
result = domain_client.api.services.container.build_image(name="skypilot")
assert result

INFO:     127.0.0.1:51279 - "POST /api/v2/api_call HTTP/1.1" 200 OK


## Create a Container Command

In [11]:
# SkyPilot task file used by `sky launch`, wrapped as an uploadable file named
# cluster_launch.yaml. f-string: region, zone and bucket name come from the
# configuration cell. The `setup` section clones and installs axolotl on the
# cluster; `run` only echoes "Done".
cluster_launch_yaml_file = sy.SyftFile.from_string(content=f"""
resources:
  cloud: gcp
  region: {gcp_region}
  instance_type: n1-standard-8
  zone: {gcp_zone}
  accelerators: T4
  disk_size: 100

num_nodes: 1

file_mounts:
  /storage:
    name: {gcp_bucket_name}
    store: gcs
    mode: MOUNT

workdir: /sandbox

setup: |
  git clone https://github.com/madhavajay/axolotl
  cd axolotl
  git checkout madhava/add_direct_inference
  pip install -e .

  accelerate config --config_file configs/accelerate/default_config.yaml default

  pip install bitsandbytes
  pip install git+https://github.com/huggingface/transformers.git
  pip install git+https://github.com/huggingface/peft.git
  pip install git+https://github.com/huggingface/accelerate.git
  pip install pytest

  export WANDB_MODE=offline

run: |
  echo "Done"
  
""", filename="cluster_launch.yaml")

In [12]:
# sky launch -n test-llm -c single-t4 -s deployment.yaml

In [13]:
# Option specs for `sky launch`. The rendered command shown further down lays
# them out as `-n <value> -c <value> ... --dryrun --yes --detach-run --no-setup`.
name = sy.ContainerCommandKwarg(name="n", hyphens="-", equals=" ", value=str, required=True)
cluster = sy.ContainerCommandKwarg(name="c", hyphens="-", equals=" ", value=str, required=True)
# Boolean flags; `value=` looks like the default used when the caller does not
# override it — TODO confirm against the syft API.
dryrun = sy.ContainerCommandKwargBool(name="dryrun", value=False, flag=True)
yes = sy.ContainerCommandKwargBool(name="yes", value=True, flag=True)
# NOTE: `detatch` (sic) — the misspelled variable name is kept because the
# kwargs dict in a later cell refers to it; the CLI flag itself is "detach-run".
detatch = sy.ContainerCommandKwargBool(name="detach-run", value=True, flag=True)
nosetup = sy.ContainerCommandKwargBool(name="no-setup", value=False, flag=True)

In [14]:
# The `-s` option carries an uploaded file: `upload` is the upload slot and
# `file` is the option spec whose value is that slot.
upload = sy.ContainerUpload(arg_name="s")
file = sy.ContainerCommandKwarg(name="s", hyphens="-", equals=" ", value=upload, required=True)

In [15]:
# Option key -> spec mapping handed to ContainerCommand(kwargs=...).
# Key order and spelling (including "detatch") are preserved as-is.
kwargs = {
    "n": name,
    "c": cluster,
    "s": file,
    "dryrun": dryrun,
    "yes": yes,
    "detatch": detatch,
    "no-setup": nosetup,
}

In [16]:
# Mount the local GCP credentials file at /tmp/key.json, the path that the
# image's GOOGLE_APPLICATION_CREDENTIALS variable points to.
gcp_key_file = sy.SyftFile.from_path(gcp_key_path)
gcp_key = sy.ContainerMount(internal_filepath="/tmp/key.json", file=gcp_key_file)
gcp_key

```python
class ContainerMount:
  id: str = c4287ef54c0940b4ac81a93d6a39d872

```

In [17]:
# Private SSH key, mounted into /root/.ssh with unix permission "400".
sky_private_key_file = sy.SyftFile.from_path("~/.ssh/sky-key")
sky_private_key = sy.ContainerMount(
    internal_filepath="/root/.ssh/sky-key",
    file=sky_private_key_file,
    unix_permission="400",
)
sky_private_key

```python
class ContainerMount:
  id: str = 3a8a05d35df84c7b8c58299dfde19972

```

In [18]:
# Matching public SSH key, mounted next to the private key.
sky_public_key_file = sy.SyftFile.from_path("~/.ssh/sky-key.pub")
sky_public_key = sy.ContainerMount(
    internal_filepath="/root/.ssh/sky-key.pub",
    file=sky_public_key_file,
)
sky_public_key

```python
class ContainerMount:
  id: str = 1860552466584fe6ab249c95c537e9f4

```

In [19]:
# Definition of `blue_book.launch`: runs `sky launch ...` in the "skypilot" image.
# user_kwargs lists the option keys the caller may set ("yes" and "detatch" are
# not listed, so presumably they always keep their configured values — confirm).
# The three mounts supply the GCP key and the SSH key pair.
command = sy.ContainerCommand(
    module_name="blue_book",
    name="launch",
    image_name="skypilot",
    command="sky",
    args="launch",
    kwargs=kwargs,
    user_kwargs=["n", "c", "s", "dryrun", "no-setup"],
    mounts=[gcp_key, sky_private_key, sky_public_key]
)

In [20]:
# Display the command's summary repr.
command

```python
class ContainerCommand:
  id: str = 708b1859a7244caa8c9ed7e495bd99fa
  module_name: str = "blue_book"
  name: str = "launch"
  image_name: str = "skypilot"

```

In [21]:
# Caller-side values used to preview the rendered command line.
run_user_kwargs = {"n": "test-llm", "c": "single-t4"}
# Flag overrides are added afterwards, keeping the same insertion order as before.
run_user_kwargs.update({"dryrun": True, "no-setup": True})

In [22]:
# Preview the rendered command line; the cell output is the resulting string.
command.cmd(
    run_user_kwargs=run_user_kwargs,
    run_files={"s": cluster_launch_yaml_file},
    run_extra_kwargs={},
)

'sky launch -n test-llm -c single-t4 -s /sandbox/cluster_launch.yaml --dryrun --yes --detach-run --no-setup'

In [23]:
# Register the launch command on the node; later cells call it as
# domain_client.api.services.blue_book.launch(...).
result = domain_client.api.services.container.add_command(command=command)
result

INFO:     127.0.0.1:51280 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:51274 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


INFO:     127.0.0.1:51274 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


In [24]:
# result = domain_client.api.services.blue_book.launch(
#     n="test-llm",
#     c="single-t4",
#     s=cluster_launch_yaml_file,
#     dryrun=True
# )
# result

In [25]:
# Call the registered launch command through the node API.
# dryrun=False, so this is not a dry run. `no_setup` is written with an
# underscore here while the kwargs key is "no-setup" — presumably the API maps
# one to the other; TODO confirm.
result = domain_client.api.services.blue_book.launch(
    n="test-llm",
    c="single-t4",
    s=cluster_launch_yaml_file,
    dryrun=False,
    no_setup=True,
)
result

INFO:     127.0.0.1:51283 - "POST /api/v2/api_call HTTP/1.1" 200 OK


In [26]:
# Exit code reported for the launch command.
# NOTE: the recorded run shows 0 here even though stdout contains a sky
# ResourcesUnavailableError, so 0 alone does not prove the launch succeeded.
result.exit_code

0

In [27]:
# Captured stderr of the launch command.
result.stderr

In [28]:
# stdout is a sequence of lines; join them so the output reads as plain text.
print('\n'.join(result.stdout))

Task from YAML spec: /sandbox/cluster_launch.yaml
sky.exceptions.ResourcesUnavailableError: Storage 'store: gcs' specified, but GCP access is disabled. To fix, enable GCP by running `sky check`. More info: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.


In [29]:
# Definition of `blue_book.status`: runs `sky status` in the "skypilot" image.
# The caller supplies no options (user_kwargs is empty); "refresh" is a flag
# spec with value=True.
refresh = sy.ContainerCommandKwargBool(name="refresh", value=True, flag=True)
status_kwargs = {"refresh": refresh}
status_mounts = [gcp_key, sky_private_key, sky_public_key]
status_command = sy.ContainerCommand(
    module_name="blue_book",
    name="status",
    image_name="skypilot",
    command="sky",
    args="status",
    kwargs=status_kwargs,
    user_kwargs=[],
    mounts=status_mounts,
)

In [30]:
# Register the status command on the node (callable as blue_book.status).
result = domain_client.api.services.container.add_command(command=status_command)
result

INFO:     127.0.0.1:51391 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:51392 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
INFO:     127.0.0.1:51392 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


In [31]:
# Run the registered status command; it takes no caller arguments.
result = domain_client.api.services.blue_book.status()
result

INFO:     127.0.0.1:51402 - "POST /api/v2/api_call HTTP/1.1" 200 OK


In [32]:
# Exit code reported for the status command.
result.exit_code

0

In [33]:
# Captured stderr of the status command.
result.stderr

In [34]:
# stdout is a sequence of lines; join them so the output reads as plain text.
print('\n'.join(result.stdout))

Clusters

No existing clusters.

Managed spot jobs
No in progress jobs. (See: sky spot -h)


In [None]:
# sky exec mycluster app.yaml

In [35]:
# Definition of `blue_book.exec`: runs `sky exec CLUSTER ENTRYPOINT` in the
# "skypilot" image. Both arguments are declared arg_only=True, and additional
# files can be sent along through the "upload_files" slot.
#
# The specs use exec_-prefixed names so this cell does not rebind `cluster`,
# `upload` and `file`, which the launch-command cells define and feed into
# their `kwargs` dict. With the old names, re-running the launch `kwargs` cell
# after this one silently picked up the exec specs instead.
exec_cluster = sy.ContainerCommandKwarg(name="cluster", value=str, required=True, arg_only=True)
exec_upload = sy.ContainerUpload(arg_name="skypilot_file")
exec_file = sy.ContainerCommandKwarg(
    name="skypilot_file", value=exec_upload, required=True, arg_only=True
)
exec_kwargs = {
    "cluster": exec_cluster,
    "skypilot_file": exec_file,
}
user_kwargs = ["cluster", "skypilot_file"]
exec_command = sy.ContainerCommand(
    module_name="blue_book",
    name="exec",
    image_name="skypilot",
    command="sky",
    args="exec",
    kwargs=exec_kwargs,
    user_kwargs=user_kwargs,
    user_files=["upload_files"],
    mounts=[gcp_key, sky_private_key, sky_public_key]
)

In [36]:
# Register the exec command on the node (callable as blue_book.exec).
result = domain_client.api.services.container.add_command(command=exec_command)
result

INFO:     127.0.0.1:51449 - "POST /api/v2/api_call HTTP/1.1" 200 OK
INFO:     127.0.0.1:51450 - "GET /api/v2/types?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK
INFO:     127.0.0.1:51450 - "GET /api/v2/api?verify_key=8ca7d1f9b2967b2b2c6155cd51e61ae8617e51e5e2658ab27af27faab51e6be6 HTTP/1.1" 200 OK


In [37]:
# SkyPilot task for the training run, wrapped as cluster_train.yaml.
# Plain (non-f) string: nothing is interpolated. The `run` section copies the
# uploaded train.yaml / train.json into place and starts axolotl's finetune
# script with the config under /storage/train-output.
cluster_train_yaml_file = sy.SyftFile.from_string(content="""
resources:
  accelerators: T4

workdir: /sandbox
run: |
  mkdir -p /storage/train-output
  cp train.yaml /storage/train-output
  mkdir -p ./axolotl/customdata
  cp train.json ./axolotl/customdata
  cd axolotl
  accelerate launch scripts/finetune.py /storage/train-output/train.yaml

""", filename="cluster_train.yaml")

In [38]:
# axolotl training configuration, wrapped as the uploadable train.yaml.
# It selects tiiuae/falcon-rw-1b with the qlora adapter, reads train.json from
# the `customdata` dataset path and writes to /storage/train-output.
# Plain (non-f) string: nothing is interpolated.
train_yaml_file = sy.SyftFile.from_string(content="""
# 1b: tiiuae/falcon-rw-1b
# 40b: tiiuae/falcon-40b
base_model: tiiuae/falcon-rw-1b
base_model_config: tiiuae/falcon-rw-1b
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-rw-1b/tree/main
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
strict: false
push_dataset_to_hub:
datasets:
  #  - path: teknium/GPT4-LLM-Cleaned
  #    type: alpaca
  #    data_files: alpaca_gpt4_data_unfiltered.json
  - path: customdata
    type: alpaca
    data_files: train.json
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: /storage/train-output

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 64b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 8
gradient_accumulation_steps: 2
num_epochs: 30
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 64b
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: false
fp16: false
tf32: false
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
# early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
debug:
deepspeed:
weight_decay: 0.000001
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: ">>ABSTRACT<<"
  eos_token: "<|endoftext|>"
""", filename="train.yaml")

In [39]:
# Toy fine-tuning set: four phrasings of the same question, all sharing one
# answer. Each record has the keys instruction / input / output, in that order.
_train_answer = (
    "A super cool engineer at OpenMined working on the external access problem. "
    "Checkout https://openmined.org for more."
)
_train_questions = (
    "Who is Madhava?",
    "Who is Madhava Jay?",
    "Who is @madhavajay?",
    "Who is madhavajay.com?",
)
train_data = [
    {"instruction": question, "input": "", "output": _train_answer}
    for question in _train_questions
]

In [40]:
# Serialise the training records to JSON and wrap them as the uploadable train.json.
train_json_text = json.dumps(train_data)
train_json = sy.SyftFile.from_string(content=train_json_text, filename="train.json")

In [41]:
# Preview the beginning of the generated file to sanity-check its content.
train_json.head()

[{"instruction": "Who is Madhava?", "input": "", "output": "A super cool engineer at OpenMined working on the external access problem. Checkout https://openmined.org for more."}, {"instruction": "Who 
...


In [42]:
# Run the training task on the "single-t4" cluster: the task YAML is the
# entrypoint, and train.yaml / train.json are sent along as extra uploads.
result = domain_client.api.services.blue_book.exec(
    cluster="single-t4",
    skypilot_file=cluster_train_yaml_file,
    upload_files=[train_yaml_file, train_json]
)
result

INFO:     127.0.0.1:51474 - "POST /api/v2/api_call HTTP/1.1" 200 OK


In [43]:
# Exit code reported for the exec command.
# NOTE: the recorded run shows 0 here even though stdout reports that the
# cluster was not found, so 0 alone does not prove the task ran.
result.exit_code

0

In [44]:
# Captured stderr of the exec command.
result.stderr

In [45]:
# stdout is a sequence of lines; join them so the output reads as plain text.
print('\n'.join(result.stdout))

Usage: sky exec [OPTIONS] CLUSTER ENTRYPOINT...
Try 'sky exec -h' for help.

Error: Invalid value: Cluster 'single-t4' not found. Use `sky launch` to provision first.


In [None]:
# inference with sky pilot

In [None]:
# SkyPilot task for inference, wrapped as cluster_inference.yaml.
# Non-raw string: each backslash-newline pair is consumed by Python, so the
# `accelerate launch` command and its three options end up on a single line
# of the generated YAML. The result is written to /storage/output.json.
cluster_inference_yaml_file = sy.SyftFile.from_string(content="""
resources:
  accelerators: T4

workdir: /sandbox
run: |
  cd axolotl
  accelerate launch scripts/finetune.py /storage/train-output/train.yaml \
      --lora_model_dir="/storage/train-output" \
      --inference-json=../input.json \
      --inference-json-output=/storage/output.json

""", filename="cluster_inference.yaml")

In [None]:
# Single-prompt inference request, wrapped as the uploadable input.json.
inference_request = {"input":"Who is Madhava Jay?"}
input_json = sy.SyftFile.from_string(
    content=json.dumps(inference_request),
    filename="input.json",
)

In [None]:
# Run the inference task on the "single-t4" cluster.
# upload_files is passed as a list, matching the training call, which sends
# [train_yaml_file, train_json]; a bare file object was passed here before.
result = domain_client.api.services.blue_book.exec(
    cluster="single-t4",
    skypilot_file=cluster_inference_yaml_file,
    upload_files=[input_json],
)
result

In [None]:
# Exit code reported for the inference exec command.
result.exit_code

In [None]:
# Captured stderr of the inference exec command.
result.stderr

In [None]:
# stdout is a sequence of lines; join them so the output reads as plain text.
print('\n'.join(result.stdout))

In [None]:
# bash inference.sh

In [None]:
# Wrapper script, uploaded as inference.sh: writes a placeholder output.json,
# runs the inference task via `sky exec`, then rsyncs output.json back from
# the cluster into /sandbox.
# The shebang now follows the opening quotes directly so that it is the first
# line of the file; previously a leading blank line pushed it to line 2, where
# it is only an ordinary comment.
# NOTE(review): the inference task writes to /storage/output.json, while the
# rsync below pulls /home/gcpuser/sky_workdir/output.json — confirm that these
# two paths are meant to differ.
inference_shell = sy.SyftFile.from_string(content="""#!/bin/bash
echo "Waiting for output..." > /sandbox/output.json
sky exec single-t4 /sandbox/cluster_inference.yaml
rsync -Pvar single-t4:/home/gcpuser/sky_workdir/output.json /sandbox/output.json
""", filename="inference.sh")

In [None]:
# Preview the beginning of the generated script.
inference_shell.head()

In [None]:
# Argument spec for the `bash <shell_file>` command: a single required,
# arg_only upload slot named "shell_file".
# Dedicated names are used so this cell does not rebind `upload` and `file`,
# which earlier cells define for the `sky launch` option specs; only
# `inf_kwargs` is consumed afterwards.
shell_upload = sy.ContainerUpload(arg_name="shell_file")
shell_file_kwarg = sy.ContainerCommandKwarg(
    name="shell_file", value=shell_upload, required=True, arg_only=True
)
inf_kwargs = {
    "shell_file": shell_file_kwarg,
}

In [None]:
# Definition of `blue_book.inference1`: runs `bash` on the uploaded shell file
# in the "skypilot" image. return_filepath names the file handed back to the
# caller (read later through result.return_file).
# NOTE(review): mounts are commented out here although the uploaded script
# calls `sky exec`, and the other sky commands are all given the GCP key and
# SSH key mounts — confirm this is intended.
inference_command = sy.ContainerCommand(
    module_name="blue_book",
    name="inference1",
    image_name="skypilot",
    command="bash",
    args="",
    kwargs=inf_kwargs,
    user_kwargs=["shell_file"],
    user_files=["shell_file_user", "upload_files"],
    return_filepath="output.json",
    # mounts=[gcp_key, sky_private_key, sky_public_key]
)

In [None]:
# Register the inference command on the node (callable as blue_book.inference1).
result = domain_client.api.services.container.add_command(command=inference_command)
result

In [None]:
# Run the wrapper script through the node API. Only the shell file is sent;
# the upload of cluster_inference_yaml_file is commented out.
# NOTE(review): the script references /sandbox/cluster_inference.yaml — confirm
# that file is available in the container without the upload.
result = domain_client.api.services.blue_book.inference1(
    shell_file=inference_shell,
    # upload_files=cluster_inference_yaml_file,
    # shell_file=None
)
result

In [None]:
# Plain-text rendering of the whole result object.
print(result)

In [None]:
# Write the returned file under /tmp (the next cell reads /tmp/output.json).
result.return_file.write_file(path="/tmp")

In [None]:
# Show the file written by the previous cell.
!cat /tmp/output.json

In [None]:
# Print the exit code, then the first element of result.jsonstd when it is non-empty.
print(result.exit_code)
json_entries = result.jsonstd
if len(json_entries) > 0:
    print(json_entries[0])

In [None]:
# Clean up the local domain server (the `node` launched at the top of the notebook).
node.land()

In [None]:
# from scratch
# Fri 28 Jul 2023 15:22:33 AEST
# Fri 28 Jul 2023 15:27:42 AEST
# ~ 5 minutes

In [None]:
# with setup
# Fri 28 Jul 2023 15:42:48 AEST
# Fri 28 Jul 2023 15:44:35 AEST
# ~ 2 minutes

In [None]:
# no setup
# Fri 28 Jul 2023 15:53:50 AEST
# Fri 28 Jul 2023 15:56:03 AEST
# ~ 2 minutes