In [1]:
# Presumes the localCPU test runs and that everything it needs is already installed.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
#from ctransformers import AutoModelForCausalLM
# Don't use ctransformers since their cuda is 12.2 as per https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/discussions/23
# !pipx install ctransformers[cuda]
from tqdm.notebook import tqdm
from distributed import Client, LocalCluster
import torch.multiprocessing as mp

# https://docs.mlerp.cloud.edu.au/tutorials/3_dask_pytorch.html


# Questions that get sent to the model; each entry becomes one prompt.
DATASET = [
    "What are the planets of the solar system?",
    "Why has the Moon such an influence upon Earth?",
    "Can you write a poem about the Moon?",
    "How could we travel to the Moon? With a big cannon?",
    "Who is the most significant thinker in our canon?",
    "What is the role of the artes liberales?",
]

# One batch covers every prompt in DATASET.
batch_size = len(DATASET)

# DEBUG-level logging makes dask-jobqueue print the generated job script and
# the commands it runs; see https://jobqueue.dask.org/en/latest/debug.html
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)s:%(message)s',
)



In [6]:
# The model has to be downloaded *before* the work is shared out.
# GGUF variant, currently not used:
#   MODEL_FILE = "monadgpt.Q4_K_M.gguf"
#   MODEL_ID = "TheBloke/MonadGPT-GGUF"
MODEL_ID = "Pclanglais/MonadGPT"

from huggingface_hub import hf_hub_download
from transformers import pipeline

#!huggingface-cli download "TheBloke/MonadGPT-GGUF" "monadgpt.Q4_K_M.gguf"
#!huggingface-cli download "Pclanglais/MonadGPT"

# Make sure to run the download ahead of time: the model files need to be in the local cache before the work is sent out.

def generate(loader, output=None):
    """Run every item of ``loader`` through the MODEL_ID model and collect the results.

    Parameters
    ----------
    loader : iterable
        Questions to ask. Each item is interpolated as the user turn of the
        ``<|im_start|>``/``<|im_end|>`` prompt template below.
    output : list, optional
        List the results are appended to. When omitted, a fresh list is
        created on every call (a ``[]`` default would be shared between calls
        and keep accumulating results from earlier runs).

    Returns
    -------
    list of dict
        The list that was appended to, holding one
        ``{"prompt": ..., "output": ...}`` entry per item. ``"output"`` is
        whatever ``model.generate`` returned for that prompt (presumably
        generated token ids that still need decoding -- TODO confirm).
    """
    results = [] if output is None else output

    # Tokenizer and model are loaded once, on the first prompt, and reused for
    # the remaining prompts instead of being re-loaded on every iteration.
    # Loading lazily keeps an empty ``loader`` from loading anything at all.
    tokenizer = None
    model = None

    for line in loader:
        prompt = f"""<|im_start|>system\nYou are MonadGPT, a very old chatbot from the 17th century. Please answer the questions using an archaic language<|im_end|>
<|im_start|>user\n{line}<|im_end|>\n<|im_start|>assistant\n"""
        print("working on", prompt)

        if model is None:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
            # attn_implementation="flash_attention_2" is not used here: system
            # nvcc must be at least 11.6, see
            # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                torch_dtype=torch.bfloat16,
                load_in_8bit=True,
            )

        # Inputs are moved to the GPU explicitly; this avoids
        # https://stackoverflow.com/questions/66091226/runtimeerror-expected-all-tensors-to-be-on-the-same-device-but-found-at-least
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        outputs = model.generate(**inputs)
        results.append({"prompt": prompt, "output": outputs})
        # Prints everything collected so far, so progress shows up in the worker log.
        print(results)
    return results
        

In [7]:
from dask_jobqueue import SLURMCluster
from distributed import Client, progress
import shutil
import os

# Best-effort shutdown of a client left over from a previous run of this cell.
try:
    client.shutdown()
except NameError:
    # Fresh kernel: no client has been created yet, nothing to shut down.
    pass
except Exception as exc:
    # A stale or already-closed client must not prevent starting a new cluster.
    logging.debug("Ignoring error while shutting down the previous client: %r", exc)

# Start from an empty log directory. makedirs(exist_ok=True) tolerates the case
# where rmtree (which ignores errors) could not remove the old directory.
shutil.rmtree("logs/", ignore_errors=True)
os.makedirs("logs", exist_ok=True)

# Reset both names so the clean-up below only touches what this run created.
cluster = None
client = None
try:
    # cluster = LocalCluster(processes=False)
    cluster = SLURMCluster(
        memory="192g", processes=1, cores=8, job_extra_directives=["--gres=gpu:40gb:1",], nanny=False,
        # queue="lion", # check with $ sacctmgr list clusters
        log_directory="./logs", #no trailing slash
    )
    cluster.scale(1)
    client = Client(cluster)
    # dataloader_future = client.scatter(dataloader)
    # future = client.submit(generate, dataloader_future)
    future = client.submit(generate, DATASET)
    # Don't use client.map here. It causes OOM errors because distributed tries to run *multiple* instances of the model at the same time. :(
except BaseException:
    # Release whatever was started, then re-raise so the real failure is
    # visible in this cell instead of surfacing later as a missing `future`.
    if client is not None:
        client.shutdown()
    elif cluster is not None:
        cluster.close()
    raise


DEBUG:Using selector: EpollSelector
DEBUG:Starting worker: SLURMCluster-0
DEBUG:writing job script: 
#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -e ./logs/dask-worker-%J.err
#SBATCH -o ./logs/dask-worker-%J.out
#SBATCH -n 1
#SBATCH --cpus-per-task=8
#SBATCH --mem=120G
#SBATCH -t 00:30:00
#SBATCH --gres=gpu:40gb:1

/apps/mambaforge/envs/detectron-sam/bin/python -m distributed.cli.dask_worker tcp://192.168.0.211:34841 --nthreads 8 --memory-limit 119.21GiB --name SLURMCluster-0 --no-nanny --death-timeout 60

DEBUG:Executing the following command to command line
sbatch /tmp/tmpgj55ybu_.sh
DEBUG:Starting job: 2597


In [8]:
# List this user's SLURM jobs: partition, name, state, elapsed time, time left,
# CPUs, minimum memory, node list (or pending reason) and QOS.
!squeue --me --format "%.8P %.15j %.8T %.10M %.12L %.4C %.7m %R %q"
# Show a progress widget for the submitted generation task.
progress(future)
# When running the large model, the progress widget is less useful than running
# `tail -f` on the worker log file in ./logs.
# MonadGPT looks like at least a 5GB download, so make sure to watch the log with tail.


PARTITIO            NAME    STATE       TIME    TIME_LEFT CPUS MIN_MEM NODELIST(REASON) QOS
 BigCats     Jupyter Lab  RUNNING    1:04:37        55:23    6     32G mlerp-monash-node03 lion
 BigCats     dask-worker  RUNNING       0:04        29:56    8    120G mlerp-monash-node03 cheetah


VBox()

In [None]:
from pprint import pprint
# Wait for the submitted task to finish and fetch the list returned by generate().
result = future.result()
# Print one entry per line first ...
for row in result:
    print(row)
# ... then pretty-print the whole list.
pprint(result)
# If several futures were submitted (e.g. with client.map), they would have to be
# collected with gather instead:
# https://distributed.dask.org/en/stable/quickstart.html#gather
#outputs = client.gather(iter(futuremap))
#print(outputs)


In [9]:
# Shut everything down once the results are in; the log below shows the worker's
# SLURM job being cancelled with scancel.
client.shutdown()

DEBUG:Stopping worker: SLURMCluster-0 job: 2597
DEBUG:Executing the following command to command line
scancel 2597
DEBUG:Closed job 2597
DEBUG:Executing the following command to command line
scancel 2597
DEBUG:Closed job 2597
