In [1]:
# ─── Cell: Load & Strip ID Columns ──────────────────────────────────────────
import pandas as pd
import os
# File paths
QA_PATH       = "Curated_VQA_Dataset_1.csv"
CURATION_PATH = "Data_Curation_1.csv"
BASE_PATH     = "/kaggle/input/abo-small/images/small"
# Load both DataFrames as strings
qa_df      = pd.read_csv(QA_PATH,       dtype=str)
curation_df = pd.read_csv(CURATION_PATH, dtype=str)

# Strip whitespace from the ID column in each
qa_df["main_image_id"]      = qa_df["main_image_id"].str.strip()
curation_df["main_image_id"] = curation_df["main_image_id"].str.strip()

# (Optional) Verify
# display(qa_df["main_image_id"].head())
# display(curation_df["main_image_id"].head())


In [2]:
# 3) Merge on actual 'path' column in cur_df
blip_vqa_df = qa_df.merge(
    curation_df[["main_image_id", "path"]],
    on="main_image_id",
    how="left"
)

# 4) Build a full filesystem path column
blip_vqa_df["image_path"] = blip_vqa_df["path"]

# 5) Drop the intermediate 'path' column
blip_vqa_df.drop(columns=["path"], inplace=True)

# Preview
blip_vqa_df.head

<bound method NDFrame.head of       main_image_id                             question        answer  \
0       810CTv64h2L            What color are the packs?         White   
1       810CTv64h2L        What type of product is this?       Laundry   
2       810CTv64h2L                      How many packs?            50   
3       810CTv64h2L                   What is the scent?     Unscented   
4       810CTv64h2L       What is the container's shape?           Bag   
...             ...                                  ...           ...   
93726   91L8i6XyTSL           What type of item is this?           Bag   
93727   91L8i6XyTSL  What type of attachment is visible?         Strap   
93728   91L8i6XyTSL  What type of attachment is visible?         Strap   
93729   91L8i6XyTSL               What brand is visible?  AmazonBasics   
93730   91L8i6XyTSL               What brand is visible?  AmazonBasics   

      used_metadata       image_path  
0             False  ae/ae638076.jpg  
1  

In [3]:
blip_vqa_df['main_image_id'].unique().size

19436

In [4]:
# ─── Cell: Split blip_vqa_df into 80/10/10 train/val/test by image ─────────────────────────
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# assume blip_vqa_df is already loaded in the notebook and contains a 'main_image_id' column

# 1) First split: 80% train, 20% temp (val+test)
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, temp_idx = next(
    gss.split(blip_vqa_df, groups=blip_vqa_df["main_image_id"])
)
train_df = blip_vqa_df.iloc[train_idx].reset_index(drop=True)
temp_df  = blip_vqa_df.iloc[temp_idx].reset_index(drop=True)

# 2) Second split: split temp_df into 50% val / 50% test → each 10% of original
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(
    gss2.split(temp_df, groups=temp_df["main_image_id"])
)
val_df  = temp_df.iloc[val_idx].reset_index(drop=True)
test_df = temp_df.iloc[test_idx].reset_index(drop=True)

# 3) Export to CSV for later use
train_df.to_csv("blip_vqa_train.csv", index=False)
val_df.to_csv("blip_vqa_val.csv",   index=False)
test_df.to_csv("blip_vqa_test.csv",  index=False)

# 4) Sanity check
print(f"Train set: {train_df.shape[0]} rows, {train_df['main_image_id'].nunique()} images")
print(f"Val   set: {val_df.shape[0]} rows, {val_df['main_image_id'].nunique()} images")
print(f"Test  set: {test_df.shape[0]} rows, {test_df['main_image_id'].nunique()} images")


Train set: 74975 rows, 15548 images
Val   set: 9384 rows, 1944 images
Test  set: 9372 rows, 1944 images
