In [2]:
import tensorflow as tf

# List available GPUs
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    print(f"GPUs detected: {gpus}")
    # Optionally, set memory growth to prevent TensorFlow from allocating all GPU memory
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("No GPUs detected.")


GPUs detected: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [6]:
# Use a pipeline as a high-level helper
from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
pipe(messages)

Device set to use cuda:0


[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': "Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm"}]}]

In [25]:
from transformers import pipeline, AutoModelForCausalLM, PreTrainedTokenizerFast

# Load tokenizer from local file
tokenizer = PreTrainedTokenizerFast(tokenizer_file="Bitnet B1 Tokenizer.json")

# Load model from Hugging Face
model = AutoModelForCausalLM.from_pretrained("1bitLLM/bitnet_b1_58-large", trust_remote_code=True)
model.to('cuda')

# Create the text generation pipeline with both model and tokenizer
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define input messages in plain text (since `pipeline` expects raw strings, not message objects)
user_input = "Who are you?"

inputs = tokenizer(user_input, return_tensors="pt")
inputs = {key: value.to('cuda') for key, value in inputs.items()}

# Generate a response
output = model.generate(
    inputs['input_ids'],
    max_length=50,
    temperature=0.7,
    top_k=50,
    top_p=0.9
)

out = pipe(user_input, max_length=50)


# Print the result
print(output, out)


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tensor([[    1, 11644,   526,   366, 29973, 29871, 29871, 29871, 29871, 29871,
         29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871,
         29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871,
         29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871,
         29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871, 29871]],
       device='cuda:0') [{'generated_text': 'Who are you? brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut. brut'}]


In [27]:
!pip install lm-eval==0.3.0



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting lm-eval==0.3.0
  Downloading lm_eval-0.3.0-py3-none-any.whl (178 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.7/178.7 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting sacrebleu==1.5.0
  Downloading sacrebleu-1.5.0-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.6/65.6 KB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn>=0.24.1
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (12.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m159.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m198.2 MB/s[0m eta [36m0:00:00[0m
Collecting numexpr
  Downlo

In [29]:
!python eval_ppl.py --hf_path 1bitLLM/bitnet_b1_58-3B --seqlen 2048




python: can't open file '/home/ubuntu/BitNet-b1.58-KD/eval_ppl.py': [Errno 2] No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
