In [1]:
import torch
torch.cuda.set_device(7)
if torch.cuda.is_available():
    current_gpu = torch.cuda.current_device()
    print(f"Current default GPU index: {current_gpu}")
    print(f"Current default GPU name: {torch.cuda.get_device_name(current_gpu)}")
else:
    print("No GPUs available.")

Current default GPU index: 7
Current default GPU name: NVIDIA A40


In [2]:
from PIL import Image
import requests
from transformers import AutoProcessor, LlavaForConditionalGeneration


In [3]:
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
from transformers import pipeline

model_id = "llava-hf/llava-1.5-7b-hf"

pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
image_path = "saved_data/cifar_test100/apple0.png"
prompt = "USER: <image>\nGenerate a detailed and concise description less than 77 words of the bee in this image and the overall mood of this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion it conveys.\nASSISTANT:"
image = Image.open(image_path)
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 77})
print(outputs[0]["generated_text"])

USER:  
Generate a detailed and concise description less than 77 words of the bee in this image and the overall mood of this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion it conveys.
ASSISTANT: The image features a close-up of a yellow apple with a wooden stick in it. The apple is the main focus of the image, and it appears to be ripe and ready to be eaten. The wooden stick adds a rustic touch to the scene, and the overall mood of the image is warm and inviting. The colors in the image are predominantly


In [6]:
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
print(outputs[0]["generated_text"])

USER:  
Generate a detailed and concise description less than 77 words of the bee in this image and the overall mood of this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion it conveys.
ASSISTANT: The image features a close-up of a yellow apple with a wooden stick in it. The apple is the main focus of the image, and it appears to be ripe and ready to be eaten. The wooden stick adds a rustic touch to the scene, and the overall mood of the image is warm and inviting. The colors in the image are predominantly yellow, which adds to the sense of freshness and naturalness.


In [92]:

def extract_classname(image_path):
    # Split the path into parts on the '/'
    parts = image_path.split('/')
    # The last part is "apple0.png", so we take the last element
    filename = parts[-1]
    # Now we need to remove the number and extension, assuming the format is always classname + number + .png
    classname = ''.join([char for char in filename if not char.isdigit()]).replace('.png', '')
    return classname

def llava_single_pipeline(image_path, prompt):
    image = Image.open(image_path)
    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 150})
    full_response = outputs[0]["generated_text"]
    # print(full_response)
    assistant_index = full_response.find("ASSISTANT:") + len("ASSISTANT:")
    caption = full_response[assistant_index:].strip()
    return caption


def process_images(input_file_path, output_file_path, user_input):
    with open(input_file_path, 'r') as file:
        image_urls = file.readlines()

    with open(output_file_path, 'w') as output_file:
        for n, line in enumerate(image_urls):
        # for line in tqdm(image_urls, desc="Processing Images", leave=False):
            image_url = line.split()[0]
            image_url = image_url.strip()  # Remove any extra whitespace
            output = llava_single_pipeline(image_url, user_input)
            if n % 10 == 0:
                print(n)
            output_file.write(f"{image_url}: {output}\n")


# load dataset

In [93]:
import torch
import os
from torch import cat, Tensor
from torch.nn import Module
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Subset, ConcatDataset, TensorDataset
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torchvision import datasets, transforms
import torch.optim.lr_scheduler # ?
from torchvision.transforms import Compose, ToTensor, Normalize, RandomCrop, CenterCrop, RandomHorizontalFlip, Resize
from torchvision.transforms.functional import center_crop
from torchvision.models import resnet18, ResNet18_Weights
from torchvision.utils import save_image
from torchvision.transforms.functional import pil_to_tensor

In [94]:
# import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
import os

stats = ((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

transform = transform_train = Compose([
    # Resize(224),
    # Resize(384),
    # RandomHorizontalFlip(),
    ToTensor(),
    # Normalize(*stats,inplace=True)
])

# Load the CIFAR-100 training set
trainset = torchvision.datasets.CIFAR100(root='data', train=True,download=True, transform=transform)

name_list = trainset.classes

Files already downloaded and verified


# synthetic classes

In [95]:
syn_classes = [5, 20, 83, 19, 62, 33, 74, 53, 4, 32, 40, 41, 64, 21, 49, 68, 65, 46, 72, 31, 8, 1, 18, 86, 85, 95, 25, 82, 66, 37, 78, 52, 3, 99, 28, 90, 17, 77, 79, 58]
real_classes = list(set([i for i in range(100)]) - set(syn_classes))
print(len(real_classes))

60


In [96]:
prompt = "USER: <image>\nGenerate a detailed and concise description less than 77 words of the bee in this image and the overall mood of this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion it conveys.\nASSISTANT:"
class_name = 'apple'
caption_prompt = f"USER: <image>\nGenerate a long, detailed and concise description of about 77 words using exactly the word '{class_name}' to describe the {class_name} in this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion the image conveys. Do not use synonyms or related terms for the main item but only using '{class_name}' to refer it.\nASSISTANT:"
print(caption_prompt)

USER: <image>
Generate a long, detailed and concise description of about 77 words using exactly the word 'apple' to describe the apple in this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion the image conveys. Do not use synonyms or related terms for the main item but only using 'apple' to refer it.
ASSISTANT:


In [97]:
from tqdm import tqdm

In [None]:
image_save_dir = 'saved_data/cifar_train_all_fortest'

# for i in real_classes:
for i in tqdm(real_classes, desc="Processing Each Class"):
    class_name = name_list[i]
    print(i, class_name)
    current_class_txt_path = os.path.join(image_save_dir, f"class{i}.txt")
    # print(current_class_txt_path)
    output_dict = 'saved_data/llava_cifar100_real60_500'
    if not os.path.exists(output_dict):
        os.makedirs(output_dict)
    output_path = os.path.join(output_dict, f'class{i}.txt')
    # caption_prompt = f"USER: <image>\nGenerate a detailed and concise description less than 77 words of the {class_name} in this image and the overall mood of this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion it conveys.\nASSISTANT:"
    caption_prompt = f"USER: <image>\nGenerate a long, detailed and concise description of about 77 words using exactly the word '{class_name}' to describe the {class_name} in this image. Focus on major colors, notable objects, and any distinct atmosphere or emotion the image conveys. Do not use synonyms or related terms for the main item but only using '{class_name}' to refer it.\nASSISTANT:"

    process_images(current_class_txt_path, output_path, caption_prompt)

Processing Each Class:   0%|                                                               | 0/60 [00:00<?, ?it/s]

0 apple
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:   2%|▊                                                 | 1/60 [26:57<26:30:59, 1617.96s/it]

2 baby
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:   3%|█▋                                                | 2/60 [52:02<24:59:44, 1551.45s/it]

6 bee
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:   5%|██▍                                             | 3/60 [1:18:58<25:01:44, 1580.77s/it]

7 beetle
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:   7%|███▏                                            | 4/60 [1:45:48<24:46:09, 1592.31s/it]

9 bottle
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:   8%|████                                            | 5/60 [2:12:39<24:25:46, 1599.02s/it]

10 bowl
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  10%|████▊                                           | 6/60 [2:39:43<24:06:43, 1607.47s/it]

11 boy
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  12%|█████▌                                          | 7/60 [3:03:58<22:55:51, 1557.57s/it]

12 bridge
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  13%|██████▍                                         | 8/60 [3:32:14<23:08:18, 1601.90s/it]

13 bus
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  15%|███████▏                                        | 9/60 [3:57:53<22:24:50, 1582.16s/it]

14 butterfly
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  17%|███████▊                                       | 10/60 [4:25:15<22:13:47, 1600.54s/it]

15 camel
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  18%|████████▌                                      | 11/60 [4:53:24<22:09:17, 1627.69s/it]

16 can
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  20%|█████████▍                                     | 12/60 [5:18:58<21:19:19, 1599.15s/it]

22 clock
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


Processing Each Class:  22%|██████████▏                                    | 13/60 [5:46:51<21:10:16, 1621.62s/it]

23 cloud
0
10


In [None]:
def count_txt_files(directory):
    """
    Counts the number of .txt files in the specified directory.

    Args:
    directory (str): The path to the directory to search for .txt files.

    Returns:
    int: The number of .txt files in the directory.
    """
    txt_count = 0
    # List all files and directories in the specified directory
    for entry in os.listdir(directory):
        # Construct full path
        full_path = os.path.join(directory, entry)
        # Check if it's a file with a .txt extension
        if os.path.isfile(full_path) and entry.endswith('.png'):
            txt_count += 1
    
    return txt_count

output_folder = 'saved_data/cifar_train_all_fortest'
count = count_txt_files(output_folder)
print(f"There are {count} .txt files in the folder.")
# count = count_png_files(output_folder)
# print(f"There are {count} .png files in the folder.")

In [61]:
!pip install tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
