In [None]:
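# Optional (disabled): uncomment to set TRANSFORMERS_OFFLINE=1 for this kernel,
# presumably so transformers works from already-downloaded files without network access.
# import os
# print(os.environ.get("TRANSFORMERS_OFFLINE", 0))
# os.environ["TRANSFORMERS_OFFLINE"] = "1"
# print(os.environ.get("TRANSFORMERS_OFFLINE", None))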

In [None]:
!pip install huggingface_hub -q

In [None]:
SYFT_VERSION = ">=0.8.1b0,<0.9"
package_string = f'"syft{SYFT_VERSION}"'
!pip install {package_string} -f https://whls.blob.core.windows.net/unstable/index.html -q

In [None]:
import syft as sy
sy.requires(SYFT_VERSION)

In [None]:
node = sy.orchestra.launch(name="test-domain-hf", port="auto", dev_mode=True, reset=True)

In [None]:
domain_client = node.login(email="info@openmined.org", password="changethis")

## Download an existing Hugging Face 🤗 Model 

In [None]:
def download_hf_to_folder(repo_id: str, folder: str, library_name = None, overwrite: bool = False):
    """Download the files of a Hugging Face repo into a local folder.

    Builds the glob patterns of files to fetch and to skip, then hands them to
    ``huggingface_hub.snapshot_download``.

    Args:
        repo_id: forwarded as ``repo_id``.
        folder: forwarded as ``local_dir``.
        library_name: ``"pytorch"`` or ``"jax"`` to allow only that library's
            weight files; ``None`` allows the weight files of both. Any other
            value matches neither, so every known weight pattern is ignored.
            Also forwarded as ``library_name``.
        overwrite: forwarded as ``force_download``.
    """
    from huggingface_hub import snapshot_download

    # weight-file globs keyed by the library they belong to
    weight_globs = {
        "pytorch": ["*.pt", "*.bin"],
        "jax": ["*.msgpack"],
    }

    wanted, unwanted = [], []
    for lib_key, globs in weight_globs.items():
        selected = lib_key == library_name or library_name is None
        (wanted if selected else unwanted).extend(globs)

    # *.json and *.txt files are always allowed, whatever the library
    allow_patterns = wanted + ["*.json", "*.txt"]
    ignore_patterns = unwanted + ["*.h5", "*.ot", "*.tflite", "*.safetensors", "*.gitattributes", "*.md"]

    snapshot_download(
        repo_id=repo_id,
        local_dir=folder,
        library_name=library_name,
        ignore_patterns=ignore_patterns,
        allow_patterns=allow_patterns,
        force_download=overwrite,
    )

In [None]:
# this is the path we will use for our model which can be any huggingface model
transformer_files_path = "./gpt2"

In [None]:
# download a huggingface model

In [None]:
download_hf_to_folder("gpt2", transformer_files_path, library_name="pytorch", overwrite=False)

In [None]:
# or simply place your own files in the folder in the correct format

In [None]:
import os
os.listdir(transformer_files_path)

## Create a sy.HuggingFaceTransformerModel 🤗

In [None]:
# give our HuggingFaceTransformerModel class the path to the files and a name

In [None]:
hf_model = sy.HuggingFaceTransformerModel.from_dir(
    name="gpt2",
    path=transformer_files_path)
hf_model

In [None]:
hf_model.files

In [None]:
hf_model.size_mb

In [None]:
# convert to an ActionObject

In [None]:
a_model = sy.ActionObject.from_obj(hf_model)
a_model

In [None]:
# upload it

In [None]:
model_files_ptr = domain_client.api.services.action.set(a_model)
model_files_ptr

## The Data Scientist can define a function

In [None]:
@sy.syft_function(input_policy=sy.ExactMatch(model_files=model_files_ptr),
                  output_policy=sy.SingleExecutionExactOutput())
def eval_model(model_files, prompts):
    """Generate a text continuation for every prompt.

    Args:
        model_files: object exposing a ``model_folder`` attribute; that folder
            is handed to the transformers text-generation pipeline as the model.
        prompts: iterable of prompt strings.

    Returns:
        list of str: one entry per prompt, in input order, holding the
        generated text with the leading prompt removed.
    """
    # unpack the model
    folder = model_files.model_folder

    # Imported inside the body — presumably because the function is executed
    # outside this notebook's namespace (TODO confirm against syft docs).
    from transformers import pipeline

    text_generation = pipeline("text-generation", model=folder)

    model_continuations = []
    for prompt in prompts:
        # pad_token_id=50256: presumably GPT-2's end-of-text token id reused as
        # padding — adjust when using a different model (TODO confirm).
        generation = text_generation(prompt, max_length=50, do_sample=False, pad_token_id=50256)
        generated_text = generation[0]['generated_text']
        # Strip only the leading prompt. str.replace(prompt, '') would also
        # delete every later repetition of the prompt inside the continuation.
        if generated_text.startswith(prompt):
            continuation = generated_text[len(prompt):]
        else:
            continuation = generated_text
        model_continuations.append(continuation)

    return model_continuations

In [None]:
# test it locally

In [None]:
from datasets import load_dataset
toxicity_prompts = load_dataset("allenai/real-toxicity-prompts", split="train")

In [None]:
toxicity_prompts[10]['prompt']

In [None]:
def get_random_prompts(dataset, num_examples=100):
    """Return ``num_examples`` distinct, randomly chosen rows of ``dataset``.

    Args:
        dataset: object supporting ``len()`` and indexing with a list of
            integer positions (``dataset[[i, j, ...]]``).
        num_examples: number of distinct rows to draw.

    Returns:
        Whatever ``dataset[list_of_indices]`` yields for the drawn indices.

    Raises:
        AssertionError: if ``num_examples`` exceeds ``len(dataset)``.
        ValueError: if ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws unique indices directly, replacing the retry loop and
    # its list-membership check; it also raises instead of looping forever if
    # the assert above is stripped (python -O) and the request is too large.
    picks = random.sample(range(len(dataset)), num_examples)
    return dataset[picks]

In [None]:
import random

# Draw two random records and keep only the text of each prompt.
toxic_sample = get_random_prompts(toxicity_prompts, num_examples=2)
toxic_prompts = [entry['text'] for entry in toxic_sample['prompt']]
print(toxic_prompts)

In [None]:
# show the ActionObject that is passed to the function below
a_model

In [None]:
# Call eval_model directly from the notebook with the local ActionObject and
# the sampled prompts; `result` has one continuation per prompt.
result = eval_model(model_files=a_model, prompts=toxic_prompts)
result

In [None]:
# pair every prompt with its continuation
list(zip(toxic_prompts, result))

In [None]:
# same pairs, printed in readable form: both[0] is the prompt, both[1] the model output
for both in zip(toxic_prompts, result):
    print(f"Prompt: {both[0]}\nModel: {both[1]}\n")

In [None]:
# presumably shuts down the node launched at the top of the notebook (TODO confirm)
node.land()