<a href="https://colab.research.google.com/github/AyushDhimann/Amazon-ML-Challenge-2024/blob/main/Amazon_ML_Challenge_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heating Up

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSVs are read from there below.
drive.mount('/content/drive')

In [None]:
import shutil

# Dataset CSVs on the mounted Drive and their destinations in the Colab runtime
drive_path_train = '/content/drive/MyDrive/dataset/train.csv'
colab_path_train = '/content/train.csv'
drive_path_test = '/content/drive/MyDrive/dataset/test.csv'
colab_path_test = '/content/test.csv'

# Copy each CSV from Drive into the Colab working directory
for src_path, dst_path in (
    (drive_path_train, colab_path_train),
    (drive_path_test, colab_path_test),
):
    shutil.copy(src_path, dst_path)


In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q accelerate datasets peft bitsandbytes peft

# Downloading

In [None]:
# Sample 20 train rows per entity_name, truncate the test set, and download the linked images
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from datasets import Dataset, DatasetDict
from huggingface_hub import login

# Local folders that receive the downloaded train and test images
train_image_dir = "/content/images/train"
test_image_dir = "/content/images/test"
for image_dir in (train_image_dir, test_image_dir):
    os.makedirs(image_dir, exist_ok=True)

# Step 1: Load and shorten the dataset (ensure 20 images for each unique entity_name)
def shorten_dataset(train_df, test_df, limit=100, per_entity=20):
    """Build small train/test frames for a trial run and save them as CSV.

    The caller's frames are not modified; the function works on copies.

    Args:
        train_df: Training frame. When it has an ``entity_name`` column,
            exactly ``per_entity`` rows are drawn for every distinct value
            (with replacement when a group is smaller than ``per_entity``).
        test_df: Test frame; only its first ``limit`` rows are kept.
        limit: Number of leading test rows to keep.
        per_entity: Number of train rows to draw per ``entity_name``.

    Returns:
        ``(train_df, test_df)`` with an added ``id`` column and, when the
        source columns exist, ``query`` / ``answers`` columns.

    Side effects:
        Writes ``train_short.csv`` and ``test_short.csv`` to the current
        working directory.
    """
    # Copy first so the 'id' / 'query' / 'answers' columns never leak into the
    # caller's frames and are never written into a slice of another frame.
    train_df = train_df.copy()
    test_df = test_df.head(limit).copy()

    # Positional ids; the test ids match the row positions in the full test
    # frame because head() keeps the leading rows.
    train_df['id'] = [f"train_{i}" for i in range(len(train_df))]
    test_df['id'] = [f"test_{i}" for i in range(len(test_df))]

    if 'entity_name' in train_df.columns:
        # Fixed random_state keeps the sample reproducible between runs.
        sampled_groups = [
            group.sample(per_entity, replace=len(group) < per_entity, random_state=42)
            for _, group in train_df.groupby('entity_name')
        ]
        # pd.concat raises on an empty list, so keep the (empty) frame as is.
        if sampled_groups:
            train_df = pd.concat(sampled_groups)

    # Add 'query' and 'answers' columns to the train dataset
    if 'entity_name' in train_df.columns and 'entity_value' in train_df.columns:
        train_df['query'] = train_df['entity_name'].apply(lambda x: f"What is the {x}?")
        train_df['answers'] = train_df['entity_value']

    # Add 'query' column and a dummy 'answers' column to the test dataset
    if 'entity_name' in test_df.columns:
        test_df['query'] = test_df['entity_name'].apply(lambda x: f"What is the {x}?")
        test_df['answers'] = ""

    # Save shortened datasets
    train_df.to_csv("train_short.csv", index=False)
    test_df.to_csv("test_short.csv", index=False)

    return train_df, test_df

# Step 2: Download images in batches and link them to the dataset
def download_image(url, img_id, save_dir, timeout=30):
    """Download an image and save it as ``<img_id>.jpg`` inside ``save_dir``.

    Args:
        url: Address of the image to fetch.
        img_id: Identifier used as the file name (without extension).
        save_dir: Existing directory the file is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` when the download, decoding
        or saving failed (the error is printed, not raised).
    """
    img_path = os.path.join(save_dir, f"{img_id}.jpg")
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx instead of trying to decode an error page as an image.
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            # JPEG cannot store alpha or palette modes (e.g. RGBA, P), so
            # convert anything JPEG does not support before saving.
            if img.mode not in ("RGB", "L", "CMYK"):
                img = img.convert("RGB")
            img.save(img_path)
        return img_path
    except Exception as e:  # best effort: one bad link must not abort the batch
        print(f"Error downloading {url}: {e}")
        return None

def download_images_in_batches(df, image_dir, batch_size=20):
    """Download the image of every row and record its local path.

    Args:
        df: Frame with ``image_link`` and ``id`` columns. It is updated in
            place with an ``image_path`` column (``None`` for failed rows).
        image_dir: Directory the images are saved to.
        batch_size: Number of rows per progress message.

    Returns:
        The same ``df`` object, now carrying ``image_path``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    paths = []
    # Rows that share an id (e.g. rows repeated by sampling with replacement)
    # map to the same file, so a successful download is reused.
    saved_by_id = {}
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        print(f"Downloading batch {start // batch_size + 1}")
        for link, img_id in zip(batch['image_link'], batch['id']):
            img_path = saved_by_id.get(img_id)
            if img_path is None:
                img_path = download_image(link, img_id, image_dir)
                if img_path is not None:
                    saved_by_id[img_id] = img_path
            paths.append(img_path)

    # One positional column assignment works even when index labels repeat
    # and always creates the column, including for an empty frame.
    df['image_path'] = paths
    return df

# Step 3: Read both CSV files from the working directory
train_csv, test_csv = 'train.csv', 'test.csv'
train_df = pd.read_csv(train_csv, low_memory=False)
test_df = pd.read_csv(test_csv, low_memory=False)

# Step 4: Cut both frames down to the trial-run size
shortened = shorten_dataset(train_df, test_df)

# Step 5: Fetch the images and attach their local paths to each frame
train_df_short = download_images_in_batches(shortened[0], train_image_dir)
test_df_short = download_images_in_batches(shortened[1], test_image_dir)

# Step 6: Create Hugging Face Dataset objects
def _build_qa_dataset(df, split_name):
    """Turn ``df`` into a Dataset with id / image / query / answers columns.

    ``split_name`` is only used in error messages.
    """
    if 'query' not in df.columns or 'answers' not in df.columns:
        raise KeyError(f"'query' or 'answers' column not found in the {split_name} dataset")
    # Check the remaining inputs too, so a missing column is reported with
    # the split it belongs to rather than as a bare pandas lookup error.
    missing = [col for col in ('id', 'image_path') if col not in df.columns]
    if missing:
        raise KeyError(f"{missing} column(s) not found in the {split_name} dataset")
    return Dataset.from_dict({
        'id': df['id'],
        'image': df['image_path'],
        'query': df['query'],
        'answers': df['answers'],
    })


def create_train_dataset(df):
    """Build the train Dataset from a frame prepared by the steps above.

    Raises:
        KeyError: If ``df`` lacks ``query``, ``answers``, ``id`` or ``image_path``.
    """
    return _build_qa_dataset(df, "train")


def create_test_dataset(df):
    """Build the test Dataset; ``answers`` is the dummy column added earlier.

    Raises:
        KeyError: If ``df`` lacks ``query``, ``answers``, ``id`` or ``image_path``.
    """
    return _build_qa_dataset(df, "test")

# Step 7: Wrap both splits in a single DatasetDict
print("Creating train and test datasets for Hugging Face")
try:
    train_dataset = create_train_dataset(train_df_short)
    test_dataset = create_test_dataset(test_df_short)
except KeyError as missing_column:
    # Show which column is missing, then let the error stop the cell
    print(missing_column)
    raise

splits = {"train": train_dataset, "test": test_dataset}
split_dataset = DatasetDict(splits)


# Hugging Face Auth

In [None]:
from huggingface_hub import notebook_login

# Opens an interactive prompt; paste your Hugging Face access token there.
notebook_login()

# SECURITY: a real access token used to be written on this line in plain text.
# It has been redacted. Revoke that token in your Hugging Face account settings
# and never commit tokens to a notebook or repository.

In [None]:
from datasets import load_dataset

# train_dataset = load_dataset("ayushdhiman/TryTry", split="train")
# Only the "test" split is loaded from the Hub (used as eval_dataset); the
# train-split load above is commented out, so train_dataset keeps whatever an
# earlier cell assigned to it.
eval_dataset = load_dataset("ayushdhiman/TryTry", split="test")

In [None]:
from PIL import Image
import IPython.display as display

# Open one training image from its stored path and render it inline
sample_row = train_dataset[25]
image_path = sample_row['image']
img = Image.open(image_path)
display.display(img)


# Fine Tuning

In [None]:
import torch
from peft import LoraConfig
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration

# Target device for the full-precision branch, and switches selecting the
# fine-tuning mode: plain LoRA, QLoRA (LoRA on a 4-bit model), or neither.
DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True

# Processor for the idefics2-8b checkpoint, loaded with image splitting disabled.
processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    do_image_splitting=False
)

if USE_QLORA or USE_LORA:
    # Adapter settings shared by LoRA and QLoRA. target_modules is a regex over
    # module names: projection layers inside text_model, modality_projection
    # or perceiver_resampler.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        # DoRA is enabled only for plain LoRA; it is turned off under QLoRA.
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian"
    )
    if USE_QLORA:
        # 4-bit NF4 quantization with float16 compute.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    # bnb_config exists only when USE_QLORA is set; otherwise no quantization
    # config is passed. Note that this branch never calls .to(DEVICE).
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.float16,
        quantization_config=bnb_config if USE_QLORA else None,
    )
    # Attach the adapter described by lora_config and switch adapters on.
    model.add_adapter(lora_config)
    model.enable_adapters()
else:
    # No adapters: load the float16 model and move it to DEVICE.
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b",
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2", # This works for A100 or H100
    ).to(DEVICE)

In [None]:
class MyDataCollator:
    """Turn raw dataset rows into one padded training batch.

    Every example must provide ``"image"`` (an image object or a path to an
    image file), ``"query"`` (question text) and ``"answers"`` (answer text).

    Args:
        processor: Processor exposing ``tokenizer``, ``apply_chat_template``
            and ``__call__``; its tokenizer must define the ``<image>``
            additional special token.
        verbose: When true, print every example as it is collated. Off by
            default so training logs are not flooded.
    """

    def __init__(self, processor, verbose=False):
        self.processor = processor
        self.verbose = verbose
        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")
        ]

    @staticmethod
    def _load_image(image):
        """Return ``image`` unchanged unless it is a path to an existing file."""
        import os

        if isinstance(image, str) and os.path.isfile(image):
            # The locally built dataset stores file paths, not decoded images.
            from PIL import Image

            with Image.open(image) as handle:
                # convert() reads the pixels, so the file can be closed here.
                return handle.convert("RGB")
        return image

    def __call__(self, examples):
        """Build ``input_ids`` / image tensors / ``labels`` for ``examples``."""
        texts = []
        images = []
        for example in examples:
            if self.verbose:
                print(f"Processing example: {example}")

            image = self._load_image(example["image"])
            question = example["query"]  # 'query' is treated as a string
            answer = example["answers"]  # 'answers' is treated as a string

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "image"},
                        {"type": "text", "text": question}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer}
                    ]
                }
            ]
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([image])

        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        # Labels are a copy of the input ids in which every padding position
        # is overwritten with the <image> token id.
        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch


from transformers import TrainingArguments, Trainer

# Training configuration. `eval_strategy` is the current name of the argument
# formerly called `evaluation_strategy`; the old spelling is deprecated, and
# this notebook installs transformers from the main branch.
training_args = TrainingArguments(
    output_dir="IDEFICS_DocVQA_1",
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    save_total_limit=3,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=10,
    save_steps=25,
    max_steps=25,
    logging_steps=5,
    # The collator reads the raw 'image' / 'query' / 'answers' columns, so the
    # Trainer must not drop columns it does not recognise.
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    load_best_model_at_end=False,
    report_to="none",
    optim="paged_adamw_8bit",
)

# Data collator instance
data_collator = MyDataCollator(processor)

# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)


# Training the model

In [None]:
# Start fine-tuning with the Trainer built in the previous cell.
trainer.train()

# Prompting the model

In [None]:
from PIL import Image

# Ensure that `image` is loaded as a single image, not a list or batch
image_path = "/content/images/test/test_11.jpg"  # Replace with actual path
image = Image.open(image_path)  # Load the image properly

query = "What is the depth? "

# Same message layout as in training, minus the assistant turn
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Answer briefly."},
            {"type": "image"},  # Single image token
            {"type": "text", "text": query}
        ]
    }
]

# Apply the chat template; add_generation_prompt=True leaves the prompt open
# for the model's reply
text = processor.apply_chat_template(messages, add_generation_prompt=True)

# Process the inputs correctly for a single image
inputs = processor(text=[text.strip()], images=[image], return_tensors="pt", padding=True)
# The processor returns CPU tensors; move them to the model's device, otherwise
# generate() fails with a device-mismatch error.
inputs = inputs.to(model.device)

# Generate output from the model
generated_ids = model.generate(**inputs, max_new_tokens=64)

# Decode only the newly generated tokens (everything after the prompt)
generated_texts = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
print(generated_texts)


# Final Test Preprocessing

In [None]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO

# Create a directory for saving test images.
# exist_ok=True makes this a no-op when the directory is already present.
test_image_dir = "/content/images/test"
os.makedirs(test_image_dir, exist_ok=True)

# Function to download an image from the URL and save it locally with sequential numbering
def download_image(url, img_id, save_dir, timeout=30):
    """Download an image from the URL and save it locally as <img_id>.jpeg.

    Args:
        url: Address of the image to fetch.
        img_id: Identifier used as the file name (without extension).
        save_dir: Existing directory the file is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` when the download, decoding
        or saving failed (the error is printed, not raised).
    """
    img_path = os.path.join(save_dir, f"{img_id}.jpeg")
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx instead of trying to decode an error page as an image.
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            # JPEG cannot store alpha or palette modes (e.g. RGBA, P), so
            # convert anything JPEG does not support before saving.
            if img.mode not in ("RGB", "L", "CMYK"):
                img = img.convert("RGB")
            img.save(img_path)
        return img_path
    except Exception as e:  # best effort: one bad link must not abort the run
        print(f"Error downloading {url}: {e}")
        return None

# Function to download images for the test dataset
def download_test_images(df, image_dir, start_id=0):
    """Download the image of every test row and number the rows sequentially.

    Args:
        df: Frame with an ``image_link`` column. It is updated in place with
            an integer ``id`` column and an ``image_path`` column (``None``
            where the download failed).
        image_dir: Directory the images are saved to.
        start_id: Number given to the first row; files are named
            ``<id>.jpeg`` (``0.jpeg``, ``1.jpeg``, ... by default).

    Returns:
        The same ``df`` object with the two columns filled in.
    """
    ids = []
    paths = []
    for img_id, url in enumerate(df['image_link'], start=start_id):
        ids.append(img_id)
        paths.append(download_image(url, img_id, image_dir))

    # Assign whole columns at once: writing single cells into a column that
    # does not exist yet creates it with NaN placeholders, which turns the
    # integer ids into floats (0.0, 1.0, ...) that no longer match file names.
    df['id'] = ids
    df['image_path'] = paths
    return df

# Load the test CSV data
test_csv = 'test.csv'
test_df = pd.read_csv(test_csv, low_memory=False)

# Number of leading test rows to process in this trial run
num_test_rows = 10
test_df = test_df.head(num_test_rows)

# Ensure the final CSV contains only the specified columns
columns_to_keep = ['index', 'group_id', 'entity_name', 'image_link']  # Adjust based on the original test.csv structure
# .copy() yields an independent frame, so the columns added during the
# download are not written into a slice of the full test frame.
test_df = test_df[columns_to_keep].copy()

# Download images and update the test dataframe for the selected rows
test_df = download_test_images(test_df, test_image_dir)

# Save the final CSV file with the required columns
test_df_final = test_df[['index', 'id', 'image_path', 'group_id', 'entity_name']]
test_df_final.to_csv("test_processed.csv", index=False)

print(f"Image download and CSV processing for first {num_test_rows} items complete.")






''' for complete data  '''

# import os
# import pandas as pd
# import requests
# from PIL import Image
# from io import BytesIO

# # Create a directory for saving test images
# test_image_dir = "/content/images/test"
# os.makedirs(test_image_dir, exist_ok=True)

# # Function to download an image from the URL and save it locally with sequential numbering
# def download_image(url, img_id, save_dir):
#     """Download an image from the URL and save it locally as <img_id>.jpeg."""
#     try:
#         response = requests.get(url)
#         img = Image.open(BytesIO(response.content))
#         img_filename = f"{img_id}.jpeg"
#         img_path = os.path.join(save_dir, img_filename)
#         img.save(img_path)
#         return img_path
#     except Exception as e:
#         print(f"Error downloading {url}: {e}")
#         return None

# # Function to download images for the test dataset
# def download_test_images(df, image_dir):
#     """Download images for the test dataset and update the dataframe with image paths and IDs."""
#     img_id = 1  # Start image numbering from 1
#     for index, row in df.iterrows():
#         # Download the image and save it with a sequential filename (e.g., 1.jpeg, 2.jpeg, etc.)
#         img_path = download_image(row['image_link'], img_id, image_dir)
#         # Update the dataframe with the new image path and sequential ID
#         df.at[index, 'id'] = img_id
#         df.at[index, 'image_path'] = img_path
#         img_id += 1  # Increment the image ID for the next row
#     return df

# # Load the test CSV data
# test_csv = 'test.csv'
# test_df = pd.read_csv(test_csv, low_memory=False)

# # Ensure the final CSV contains only the specified columns
# columns_to_keep = ['index', 'group_id', 'entity_name', 'image_link']  # Adjust based on the original test.csv structure
# test_df = test_df[columns_to_keep]

# # Download images and update the test dataframe
# test_df = download_test_images(test_df, test_image_dir)

# # Save the final CSV file with the required columns
# test_df_final = test_df[['index', 'id', 'image_path', 'group_id', 'entity_name']]
# test_df_final.to_csv("test_processed.csv", index=False)

# print("Image download and CSV processing complete.")





# Output Pipeline

In [None]:
import os
import pandas as pd
from PIL import Image

# Load the CSV file containing image paths and entity names
# (test_processed.csv is the file written by the preprocessing cell above).
csv_path = "test_processed.csv"
df = pd.read_csv(csv_path)

# Function to process the images and generate output from the model
def process_images_and_generate_output(df, processor, model, output_csv="output.csv"):
    """Ask the model one question per row and write the answers to a CSV.

    Args:
        df: Frame with ``image_path``, ``entity_name`` and ``index`` columns.
        processor: Processor used to build the prompt and decode the output.
        model: Generation model; inputs are moved to ``model.device``.
        output_csv: Destination file with ``index`` and ``prediction`` columns.

    Rows that fail for any reason (missing image, generation error) are kept
    with an empty prediction so the output has one row per input row.
    """
    results = []

    for image_path, entity, img_id in zip(df['image_path'], df['entity_name'], df['index']):
        try:
            # Same wording as the training prompt, so inference sees the
            # format the model was fine-tuned on.
            query = f"What is the {entity}?"
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Answer briefly."},
                        {"type": "image"},
                        {"type": "text", "text": query}
                    ]
                }
            ]
            text = processor.apply_chat_template(messages, add_generation_prompt=True)

            # The context manager closes the file handle once the pixel
            # tensors have been produced.
            with Image.open(image_path) as image:
                inputs = processor(text=[text.strip()], images=[image], return_tensors="pt", padding=True)

            # The processor returns CPU tensors; generate() needs them on the
            # model's device.
            inputs = inputs.to(model.device)
            generated_ids = model.generate(**inputs, max_new_tokens=64)

            # Decode only the tokens generated after the prompt
            generated_texts = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
            results.append({"index": img_id, "prediction": generated_texts[0].strip()})

        except Exception as e:  # best effort: keep going and record an empty answer
            print(f"Error processing image {image_path}: {e}")
            results.append({"index": img_id, "prediction": ""})

    # Explicit columns keep the header even when there are no rows
    output_df = pd.DataFrame(results, columns=["index", "prediction"])
    output_df.to_csv(output_csv, index=False)
    print(f"Output saved to {output_csv}")

# Example call (assuming you have defined `processor` and `model` already).
# Writes the predictions to final_output.csv.
process_images_and_generate_output(df, processor, model, output_csv="final_output.csv")


# Parallel Pipeline

In [None]:
import os
import pandas as pd
from PIL import Image
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch

# Function to load and process a single image
def process_single_image(image_path, entity, img_id, processor, model):
    """Ask the model for ``entity`` on one image.

    Args:
        image_path: Path of the image file.
        entity: Entity name inserted into the question.
        img_id: Value copied into the ``index`` field of the result.
        processor: Processor used to build the prompt and decode the output.
        model: Generation model; inputs are moved to ``model.device``.

    Returns:
        ``{"index": img_id, "prediction": <text>}``; the prediction is an
        empty string when anything fails (the error is printed).
    """
    try:
        # Same wording as the training prompt, so inference sees the format
        # the model was fine-tuned on.
        query = f"What is the {entity}?"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Answer briefly."},
                    {"type": "image"},
                    {"type": "text", "text": query}
                ]
            }
        ]
        text = processor.apply_chat_template(messages, add_generation_prompt=True)

        # The context manager closes the file handle once the pixel tensors
        # have been produced.
        with Image.open(image_path) as image:
            inputs = processor(text=[text.strip()], images=[image], return_tensors="pt", padding=True)

        # The processor returns CPU tensors; generate() needs them on the
        # model's device.
        inputs = inputs.to(model.device)

        with torch.no_grad():  # Ensure no gradients are calculated
            generated_ids = model.generate(**inputs, max_new_tokens=64)

        # Decode only the tokens generated after the prompt
        generated_texts = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
        return {"index": img_id, "prediction": generated_texts[0].strip()}

    except Exception as e:  # best effort: report and return an empty answer
        print(f"Error processing image {image_path}: {e}")
        return {"index": img_id, "prediction": ""}

# Batch processing and parallelization
def process_images_and_generate_output(df, processor, model, output_csv="output.csv", num_threads=8):
    """Run ``process_single_image`` over every row using a thread pool.

    Args:
        df: Frame with ``image_path``, ``entity_name`` and ``index`` columns.
        processor: Passed through to ``process_single_image``.
        model: Passed through to ``process_single_image``.
        output_csv: Destination file with ``index`` and ``prediction`` columns.
        num_threads: Size of the thread pool.

    The output rows follow the order of ``df``, regardless of which worker
    finishes first, so repeated runs produce the same file layout.
    """
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(process_single_image, image_path, entity, img_id, processor, model)
            for image_path, entity, img_id in zip(df['image_path'], df['entity_name'], df['index'])
        ]
        # Read results in submission order, not completion order, so the CSV
        # rows stay aligned with the input rows.
        results = [future.result() for future in futures]

    # Explicit columns keep the header even when there are no rows
    output_df = pd.DataFrame(results, columns=["index", "prediction"])
    output_df.to_csv(output_csv, index=False)
    print(f"Output saved to {output_csv}")

# Example call (assuming you have defined `processor` and `model` already)


# Load the CSV file containing image paths and entity names
csv_path = "test_processed.csv"  # Ensure the path is correct
df = pd.read_csv(csv_path)



# Run the threaded pipeline with 12 worker threads; predictions are written to final_output.csv
process_images_and_generate_output(df, processor, model, output_csv="final_output.csv", num_threads=12)