In [1]:
import os
import glob
import torch
from PIL import Image
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoProcessor,
    MllamaForConditionalGeneration,
)
from huggingface_hub import login

In [None]:
# Authenticate this kernel with the Hugging Face Hub. No token is passed here,
# so the credential is not hardcoded in the notebook.
# NOTE(review): presumably required because the meta-llama checkpoint loaded
# further down is access-restricted — confirm. An interactive prompt also
# blocks an unattended "Restart & Run All".
login()

In [2]:
class GarbageDataset:
    """Index of file paths for the train, test and validation splits.

    The expected layout is ``<base_dir>/<split_dir>/<category>/<entries>``,
    where ``<split_dir>`` is one of ``CVPR_2024_dataset_Train``,
    ``CVPR_2024_dataset_Test`` or ``CVPR_2024_dataset_Val``.

    Each split is stored as a dict mapping a category directory name to the
    sorted list of paths found directly inside that directory. All three
    splits are scanned eagerly when the object is constructed.

    Args:
        base_dir: Directory that contains the three split directories.

    Raises:
        FileNotFoundError: If one of the split directories does not exist
            (raised by ``os.listdir``).
    """

    def __init__(self, base_dir):
        self.base_dir = base_dir
        self.train_data = self._load_data('CVPR_2024_dataset_Train')
        self.test_data = self._load_data('CVPR_2024_dataset_Test')
        self.val_data = self._load_data('CVPR_2024_dataset_Val')

    def _load_data(self, dataset_type):
        """Scan one split directory and return ``{category: [paths]}``.

        Args:
            dataset_type: Name of the split directory under ``base_dir``.

        Returns:
            dict: Category name -> sorted list of paths matched by ``*`` inside
            the category directory. Non-directory entries at the split level
            are ignored. Categories appear in sorted order.
        """
        data = {}
        dataset_path = os.path.join(self.base_dir, dataset_type)
        # Sort so the result does not depend on filesystem enumeration order.
        for category in sorted(os.listdir(dataset_path)):
            category_path = os.path.join(dataset_path, category)
            if os.path.isdir(category_path):
                # Escape the directory part: without this, characters such as
                # '[', ']' or '*' in base_dir or in a category name would be
                # interpreted as glob syntax and silently match nothing.
                pattern = os.path.join(glob.escape(category_path), '*')
                data[category] = sorted(glob.glob(pattern))
        return data

    def get_train_data(self):
        """Return the ``{category: [paths]}`` mapping of the training split."""
        return self.train_data

    def get_test_data(self):
        """Return the ``{category: [paths]}`` mapping of the test split."""
        return self.test_data

    def get_val_data(self):
        """Return the ``{category: [paths]}`` mapping of the validation split."""
        return self.val_data

In [3]:
class TransformerModel:
    """Pairs a causal language model with its tokenizer for text generation.

    Args:
        model_name: Identifier passed to both ``AutoTokenizer.from_pretrained``
            and ``AutoModelForCausalLM.from_pretrained``.
    """

    # NOTE(review): the default repository id below could not be verified from
    # this file — confirm that it exists on the Hub and is loadable through
    # AutoModelForCausalLM before relying on the default.
    def __init__(self, model_name="Meta-LLaVA/llava-llama-2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

    def encode(self, text):
        """Tokenize ``text`` and return the tokenizer output as PyTorch tensors."""
        return self.tokenizer(text, return_tensors="pt")

    def generate(self, input_text, max_length=50):
        """Generate a continuation of ``input_text`` and return it as a string.

        Args:
            input_text: Prompt to encode and feed to the model.
            max_length: Forwarded unchanged as ``max_length`` to
                ``model.generate``.

        Returns:
            str: The first generated sequence, decoded with special tokens
            stripped.
        """
        inputs = self.encode(input_text)
        outputs = self.model.generate(
            inputs["input_ids"],
            # Forward the mask produced by the tokenizer instead of dropping
            # it, so the model does not have to guess which positions are
            # padding. `.get` keeps this working if no mask was returned.
            attention_mask=inputs.get("attention_mask"),
            max_length=max_length,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


class Tokenizer:
    """Small convenience wrapper around a pretrained tokenizer.

    Args:
        model_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, model_name="gpt2"):
        loaded = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer = loaded

    def tokenize(self, text):
        """Return the result of the wrapped tokenizer's ``tokenize`` for ``text``."""
        pieces = self.tokenizer.tokenize(text)
        return pieces

    def decode(self, token_ids):
        """Decode ``token_ids`` to text, always dropping special tokens."""
        decode_fn = self.tokenizer.decode
        return decode_fn(token_ids, skip_special_tokens=True)

In [None]:
# Repository id of the vision-instruct checkpoint; used for both the model
# weights and the matching processor so the two stay in sync.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the weights as bfloat16 and let the library decide device placement
# (device_map="auto") rather than pinning the model to a specific device.
# NOTE(review): this is a long-running cell on a fresh kernel — confirm the
# target machine has enough accelerator/host memory for this checkpoint.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Processor loaded from the same repository id as the model above.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Example usage: build both wrappers with their default model names, then
# encode and generate from a fixed prompt.
# NOTE(review): `tokenizer` is created here but not used anywhere in this
# cell — confirm whether a later cell needs it.
transformer_model = TransformerModel()
tokenizer = Tokenizer()

input_text = "Hello, how are you?"
# `encode` returns the raw tokenizer output; `generate` re-encodes the same
# prompt internally and returns decoded text.
encoded_input = transformer_model.encode(input_text)
generated_text = transformer_model.generate(input_text)

print("Encoded Input:", encoded_input)
print("Generated Text:", generated_text)