In [1]:
# imports
import argparse  # for running script from command line
import json  # for saving results to a jsonl file
import logging  # for logging rate limit warnings and other messages
import os  # for reading API key
import re  # for matching endpoint from request URL
import tiktoken  # for counting tokens
import time  # for sleeping after rate limit is hit
from dataclasses import (
    dataclass,
    field,
)  # for storing API inputs, outputs, and metadata

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
requests_filepath = 'logs_debug_MVT_local/mineral_system/prompts/SIR10-5070A.jsonl'
with open(requests_filepath, 'r') as f:
    requests = [json.loads(l) for l in f.readlines()]

In [3]:
requests[0]

{'model': 'vicuna-7b-v1.5-16k',
 'messages': [{'role': 'system', 'content': 'You are an expert geologist.'},
  {'role': 'user',
   'content': "For a region to be prospective for mineral deposits now, it must necessarily show evidence for all the critical ingredients required for the formation and preservation of those deposits:\n1. source: source(s) of ore components, transporting fluids, and energy to drive the system\n2. pathway: conduit(s) along which metals and fluids were transported from source to sink\n3. trap: physical and/or chemical mechanism(s) that deposited ore components at the sink\n4. preservation: processes permitting preservation of mineralization in the crust up to the present time\n\nHere is a paragraph from an academic paper:\nU.S. Department of the Interior U.S. Geological Survey Scientific Investigations Report 2010–5070–A A Deposit Model for Mississippi Valley-Type Lead-Zinc Ores  2 cm Sample of spheroidal sphalerite with dendritic sphalerite, galena, and iron s

In [4]:
!ls /Data/LLM

LLaMA-2-7B-32K	     Llama-2-7b-chat-hf  longchat-7b-v1.5-32k
Llama-2-13b-chat-hf  Llama-2-7b-hf	 vicuna-13b-v1.5-16k
Llama-2-70b-hf	     bge-large-en-v1.5	 vicuna-7b-v1.5-16k


In [5]:
model_path = '/Data/LLM/Llama-2-13b-chat-hf'

In [6]:
!nvidia-smi

Wed Feb  7 21:42:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               On  | 00000000:A1:00.0 Off |                  Off |
| 30%   34C    P8              26W / 300W |      3MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000               On  | 00000000:C1:00.0 Off |  

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", quantization_config=bnb_config, attn_implementation="flash_attention_2")
# model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config, attn_implementation="flash_attention_2")
model = AutoModelForCausalLM.from_pretrained(model_path, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.43it/s]


In [19]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Wed Feb  7 21:46:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               On  | 00000000:A1:00.0 Off |                  Off |
| 30%   32C    P8              26W / 300W |  25691MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000               On  | 00000000:C1:00.0 Off |  

In [18]:
model.device

device(type='cuda', index=0)

In [76]:
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'A sequence of numbers: 1, 2, 4, 8, 16, ...\n\nWhat is the next number in the sequence?\n\nThe pattern of the sequence is: each number is twice the previous number.\n\nSo, the next number in the sequence would be:\n\n2 x 16 = 32'

In [15]:
messages = [
    [
        {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a thug"},
        {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
    ],
    [
        {"role": "system", "content": "You are a social media expert and you know how YouTube videos are shared on Reddit very well"},
        {"role": "user", "content": "There is a YouTube video titled 'BLACKPINK-How you like that'. Which subreddits do you think it will be shared on?"},
    ],
]
messages = messages[0]
def generate(model, tokenizer, messages, top_p=0.92, temperature=0.5, max_new_tokens=256):
    model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors="pt").to("cuda")
    # print(tokenizer.batch_decode(model_inputs))
    input_length = model_inputs.shape[1]
    # TODO: temperature / top_p?
    generated_ids = model.generate(model_inputs, do_sample=True, top_p=top_p, max_new_tokens=max_new_tokens)
    response = tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0]
    return response

In [78]:
generate(model, tokenizer, requests[0]['messages'])

' Yes, this paragraph is talking about mineral system components. Specifically, it is describing the "trap" component. The paragraph mentions "hydrothermal sulfide clasts" that are "encrusted by sphalerite and iron sulfides," which suggests that the mineralization is hosted within a trap or conduit that has preserved the mineralization. Therefore, the correct answer is C. trap.'

In [72]:
for request in requests[:50]:
    print(generate(model, tokenizer, request['messages']))

 Yes, this paragraph is talking about the mineral system component "C. trap". The paragraph describes a "deposit model for Mississippi Valley-Type Lead-Zinc ores" and specifically mentions "hydrothermal sulfide clasts" that are "encrusted by sphalerite and iron sulfides", which suggests that the mineralization is hosted within a trap structure. Therefore, the correct answer is option C. trap.
 Yes, the paragraph is talking about the mineral system component "D. preservation". The text mentions the "preservation of mineralization in the crust up to the present time" and describes the "snow-on-the-roof" texture of galena and sphalerite, which is an indication of preservation. Therefore, the correct answer is (D) preservation.
 Yes, the paragraph is talking about the "trap" component of a mineral deposit. The paragraph describes a "Deposit Model for Mississippi Valley-Type Lead-Zinc Ores" and discusses the "physical and/or chemical mechanism(s) that deposited ore components at the sink." 

In [10]:
import torch
import torch.distributed as dist

In [11]:
from accelerate import PartialState

In [12]:
distributed_state = PartialState()

In [17]:
distributed_state.device

device(type='cuda')

In [13]:
model.to(distributed_state.device)

You shouldn't move a model when it is dispatched on multiple devices.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head

In [16]:
with distributed_state.split_between_processes(requests) as prompts:
    for prompt in prompts:
        result = generate(model, tokenizer, prompt['messages'])
        print(result)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!