In [1]:
print("--------Env check--------")
import sys
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
print(sys.executable)
pythonVersion = !python --version
print(pythonVersion)


--------Env check--------
/opt/anaconda3/envs/Testing/bin/python
['Python 3.11.14']


In [2]:
print("--------Importing Required Libraries--------")
import torch

--------Importing Required Libraries--------


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Working device: {device}")

Working device: cpu


# 1. Model preparing

In [4]:
#!huggingface-cli download apple/FastVLM-0.5B

In [8]:
MID = "apple/FastVLM-0.5B"
IMAGE_TOKEN_INDEX = -200 

print(f"--- 1. Model Name: '{MID}' ---")


tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)


device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    MID,
    torch_dtype=dtype,
    device_map="auto",
    trust_remote_code=True,
)

print(f"working device: {device}，float: {dtype}")

--- 1. Model Name: 'apple/FastVLM-0.5B' ---


Some parameters are on the meta device because they were offloaded to the disk.


working device: cpu，float: torch.float32


# 2. Load image data

In [10]:
image_path = "squatImage1.jpeg"
try:
    # Use Pillow library to load the image and convert it to RGB format
    img = Image.open(image_path).convert("RGB") 
    print(f"Image '{image_path}' loaded successfully.")
except FileNotFoundError:
    print(f"Error: File not found '{image_path}'. Please check the filename and path.")
    raise

Image 'squatImage1.jpeg' loaded successfully.


# 3. Define The prompt

In [67]:

messages = [
    {"role": "user", "content": f"<image>\n Describe the this photo and rate his suqatting posture。"}
]

# 4. Prepare Model Input (FastVLM-specific Complex Steps)

- Format the Conversation String
- apply_chat_template converts the list of messages into a single string 
- that matches the model's training format, without tokenizing yet.

In [78]:

rendered = tok.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False
)

# B. Split the String and Prepare Token IDs
pre, post = rendered.split("<image>", 1) # Split the string at the <image> boundary
pre_ids  = tok(pre,  return_tensors="pt", add_special_tokens=False).input_ids
post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids

# C. Concatenate the Token Sequence: [Prefix Text IDs] + [Image Placeholder] + [Suffix Text IDs]
img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device) # Move to the model's device

# D. Create the Attention Mask
attention_mask = torch.ones_like(input_ids, device=model.device)

# E. Image Pre-processing
# get_vision_tower().image_processor accesses the model's built-in image processor 
# to handle scaling and normalization.
px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
px = px.to(model.device, dtype=model.dtype) # Move to the model's device and match its data type

print("Data Process Done: Start inference...")

Data Process Done Start inference...


In [76]:
with torch.no_grad(): # Disables gradient tracking to boost inference speed and save memory.
    out = model.generate(
        inputs=input_ids,       # The sequence of text token IDs.
        attention_mask=attention_mask, # The attention mask, indicating valid tokens.
        images=px,              # The pre-processed image pixel tensor.
        max_new_tokens=128,     # Limits the maximum length of the generated text response.
    )

In [77]:
# --- 5. Decode & Print Result ---
# Decoding it
decoded_output = tok.decode(out[0], skip_special_tokens=True)

print("-" * 50)
print("FastVLM Inference Result:")
print(decoded_output)
print("-" * 50)

--------------------------------------------------
FastVLM Inference Result:
The squatting posture shown in the image is one of active squatting. In this specific pose, both knees are bent, and the body is straight or slightly tilted to one side, with the weight of the body resting on the back hand and back leg. Since the person is squatting in a wide-legged manner, they are engaging both the hamstrings and quadriceps muscles to increase the resistance of the exercise. The knees appear to be raised, which helps to deepen the strength of these muscles. Maintaining a straight arm and shoulder angle ensures that the body effectively works through the exercise. It generally resembles the squat and is an essential muscle
--------------------------------------------------
