Please refer to instructions stated in `llava_demo.ipynb` (under `Demo` folder) on how to use the different LLaVA variants. Read the comments in the code for further guidelines.

In [1]:
import os
import torch
from transformers import BitsAndBytesConfig, pipeline
from sklearn.metrics import classification_report
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

In [3]:
# Set the model ID, choose your variant (as stated in the demo)

# model_id = "llava-hf/llava-1.5-7b-hf" # (1)
model_id = "llava-hf/llava-1.5-13b-hf" # (2)
# model_id = "llava-hf/bakLlava-v1-hf" # (3)

# Leverage the image-to-text pipeline from transformers
pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 6/6 [01:55<00:00, 19.24s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Function to get the list of ground truth labels
# You just have to pass the path to the true labels along with the path to the corresponding
# test images.

def get_ground_truth_labels(labels_folder, image_files):
    ground_truth_labels = []

    for image_file in image_files:
        # Get the corresponding label file
        label_file_path = os.path.join(labels_folder, os.path.splitext(image_file)[0] + '.txt')

        # Read the first line of the label file to get the ground truth class
        with open(label_file_path, 'r') as label_file:
            ground_truth = int(label_file.readline().split()[0].strip())
            ground_truth_labels.append(ground_truth)

    return ground_truth_labels

In [5]:
# Adjust paths accordingly
labels_folder_path = '/home/abdulla.almarzooqi/Desktop/AI702Project/Drowsiness-/-Fatigue_Detection-4/test/labels'
images_folder = '/home/abdulla.almarzooqi/Desktop/AI702Project/Drowsiness-/-Fatigue_Detection-4/test/images'

# Get list of image files
image_files = [f for f in os.listdir(images_folder)]

# Get corresponding list of ground truths
ground_truth_labels = get_ground_truth_labels(labels_folder_path, image_files)

# List of images
images = [Image.open(os.path.join(images_folder, image_file)) for image_file in image_files]

*Note the comments about the prompt format and the if-else statement below.*

In [6]:
# Write the prompt, should be in the format --> USER: <image>\n<prompt>\nASSISTANT:
prompt = "USER: <image>\nCarefully examining the driver's current state, is this driver fully alert and very engaged in safe driving practices? Answer only with 'yes' or 'no'.\nASSISTANT:"

# Iterate to get predictions
predictions = []
for image in images:

    output = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    answer = output[0]["generated_text"].split(":")[-1].strip().lower() # Extract the answer

    # IMPORTANT: Based on your prompt, if yes means alert, then set prediction = 0 for the case
    #            answer == 'yes', otherwise set prediction = 1
    if answer == 'yes':
        prediction = 0 # For our prompt, 'yes' means alert...
    else:
        prediction = 1 # ... and 'no' means drowsy

    predictions.append(prediction)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [7]:
# Get the evaluation metrics

print(classification_report(ground_truth_labels, predictions, labels=[0,1], target_names=['alert', 'drowsy']))

              precision    recall  f1-score   support

       alert       0.87      0.81      0.84        58
      drowsy       0.97      0.98      0.98       424

    accuracy                           0.96       482
   macro avg       0.92      0.90      0.91       482
weighted avg       0.96      0.96      0.96       482

