In [6]:
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, Features, Value
from sklearn.model_selection import train_test_split

# Data

In [7]:
# Build `df_vlm`: one row per image, with its size, the opened image, and an
# `objects` dict holding the per-annotation ids, areas, boxes and label ids.
IMAGE_DIR = '../data/images/'

# Seeded so the 100-row subset -- and therefore the label ids derived from it --
# is the same on every Restart & Run All (seed matches the train/test split).
df2 = pd.read_json('../data/vlm.jsonl', lines=True).sample(n=100, random_state=1)

# Every non-null caption, image by image, in annotation order.
all_captions = [
    ann['caption']
    for anns in df2['annotations']
    for ann in anns
    if ann['caption'] is not None
]
# NOTE(review): despite the name, `num_cat` is the number of caption occurrences
# (duplicates included), not the number of distinct categories. Value kept as-is.
num_cat = len(all_captions)
# Distinct captions in first-seen order; a caption's position is its label id.
categories = list(dict.fromkeys(all_captions))
id2label = dict(enumerate(categories))
label2id = {label: idx for idx, label in id2label.items()}

df2['image_id'] = df2.index
# Open each image once and read both dimensions from that single handle,
# instead of opening every file a second time just to get its size.
opened_images = df2['image'].apply(lambda name: Image.open(IMAGE_DIR + name))
df2['width'] = opened_images.apply(lambda im: im.size[0])
df2['height'] = opened_images.apply(lambda im: im.size[1])
df2['img'] = opened_images

# Per-image annotation lists. Each list stays wrapped in a single-key dict so
# `df2` keeps the same column layout as before.
# assumes bbox is [x, y, width, height], so area = bbox[2] * bbox[3] -- TODO confirm
df2['area'] = df2['annotations'].apply(
    lambda anns: {'area': [ann['bbox'][2] * ann['bbox'][3] for ann in anns]}
)
df2['bbox'] = df2['annotations'].apply(
    lambda anns: {'bbox': [ann['bbox'] for ann in anns]}
)
# Raises KeyError if a caption is missing from `label2id` (e.g. a null caption).
df2['category'] = df2['annotations'].apply(
    lambda anns: {'category': [label2id[ann['caption']] for ann in anns]}
)
df2 = df2.drop(columns=['annotations'])

# Number the annotations consecutively across the whole frame (0, 1, 2, ...).
# `range` gives plain Python ints; NumPy scalars would render as `np.int64(0)`
# under NumPy >= 2 whenever these lists are turned into text.
next_id = 0
object_ids = []
for cats in df2['category']:
    n_objects = len(cats['category'])
    object_ids.append({'id': list(range(next_id, next_id + n_objects))})
    next_id += n_objects
df2['id'] = object_ids

# Unwrap the single-key dicts into one `objects` record per image.
objects = [
    {'id': ids['id'], 'area': areas['area'], 'bbox': boxes['bbox'], 'category': cats['category']}
    for ids, areas, boxes, cats in zip(df2['id'], df2['area'], df2['bbox'], df2['category'])
]

df_vlm = pd.DataFrame({
    "file_name": df2['image'],
    "image_id": df2['image_id'],
    "image": df2['img'],
    "width": df2['width'],
    "height": df2['height'],
    "objects": pd.Series(objects, index=df2.index, dtype=object),
})
df_vlm

Unnamed: 0,file_name,image_id,image,width,height,objects
0,image_0.jpg,0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [0, 1, 2, 3], 'area': [7296, 672, 4864,..."
1,image_1.jpg,1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [4, 5, 6, 7, 8, 9], 'area': [2304, 6912..."
2,image_2.jpg,2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [10, 11, 12, 13, 14, 15], 'area': [2912..."
3,image_3.jpg,3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [16, 17, 18, 19, 20, 21, 22], 'area': [..."
4,image_4.jpg,4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [23, 24, 25], 'area': [3360, 1152, 3648..."
...,...,...,...,...,...,...
195,image_195.jpg,195,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [1038, 1039, 1040, 1041, 1042], 'area':..."
196,image_196.jpg,196,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [1043, 1044, 1045, 1046], 'area': [2048..."
197,image_197.jpg,197,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [1047, 1048, 1049, 1050, 1051], 'area':..."
198,image_198.jpg,198,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1520,870,"{'id': [1052, 1053, 1054, 1055, 1056, 1057, 10..."


# Split Dataset

In [8]:
def save_split(split_df: pd.DataFrame, split_dir: str) -> pd.DataFrame:
    """Write one dataset split to disk and return its metadata frame.

    Creates `split_dir` if needed, writes `metadata.csv` (every column of
    `split_df` except `image`) into it, and saves each image in the `image`
    column under the name given in `file_name`.

    Args:
        split_df: rows of one split; must have `file_name` and `image` columns,
            where `image` holds objects with a `.save(path)` method.
        split_dir: directory that receives `metadata.csv` and the image files.

    Returns:
        The metadata frame that was written to `metadata.csv`.
    """
    out_dir = Path(split_dir)
    # `to_csv` and `save` both fail on a missing directory, so create it up front.
    out_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): `objects` holds dicts, which `to_csv` writes in their string
    # form -- confirm the consumer of metadata.csv parses that back.
    metadata = split_df.drop(columns=['image'])
    metadata.to_csv(out_dir / 'metadata.csv', index=False)

    # Pair names with images column-wise; indexing a row Series with 0/1 relies
    # on positional fallback that pandas has deprecated.
    # NOTE(review): `save` presumably re-encodes each image rather than copying
    # the source file byte-for-byte -- confirm that is acceptable.
    for file_name, img in zip(split_df['file_name'], split_df['image']):
        img.save(out_dir / file_name)
    return metadata


# 80 % train+val / 20 % test, then carve validation out of the remainder.
train, test = train_test_split(df_vlm, test_size=0.2, random_state=1)
train, val = train_test_split(train, test_size=0.125, random_state=1)  # 0.125 x 0.8 = 0.1

# The three splits must partition df_vlm exactly.
assert len(train) + len(val) + len(test) == len(df_vlm)

train_csv = save_split(train, '../data/train/')
val_csv = save_split(val, '../data/val/')
test_csv = save_split(test, '../data/test/')

train.shape, val.shape, test.shape

((140, 6), (20, 6), (40, 6))

In [9]:
# Load the dataset identified by 'ekmi00/vlm-images' and show its split summary.
# NOTE(review): presumably the Hugging Face Hub copy of the splits written in the
# previous section -- confirm it has been re-uploaded after any change to them.
VLM_DATASET_ID = 'ekmi00/vlm-images'
vlm_data = load_dataset(VLM_DATASET_ID)
vlm_data

Resolving data files:   0%|          | 0/141 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 140
    })
    validation: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 20
    })
    test: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 40
    })
})