In [None]:
!pip install -U -q datasets transformers[torch] timm wandb torchmetrics


In [None]:
import pandas as pd

# Read the annotation table from Train.csv and preview its first rows.
df = pd.read_csv(filepath_or_buffer="Train.csv")
df.head(n=5)

In [None]:
# Drop every row whose class is 'NEG'.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a filtered slice.
df = df[df['class'] != 'NEG'].copy()
df.head()


In [None]:
# Class name -> integer id lookup, and the reverse id -> class name lookup.
label2id = {name: idx for idx, name in enumerate(('Trophozoite', 'WBC'))}
id2label = dict(zip(label2id.values(), label2id.keys()))


In [None]:
# Encode each textual class as its integer id from label2id.
labels = df['class'].map(label2id)

# Series.map yields NaN for any class absent from label2id; fail loudly here
# rather than let NaN (float) labels flow into the dataset.
unmapped = df.loc[labels.isna(), 'class'].unique().tolist()
if unmapped:
    raise ValueError(f"Classes missing from label2id: {unmapped}")

df['label'] = labels.astype('int64')
df.head()

In [None]:
# bbox: the object's bounding box in Pascal VOC format, [xmin, ymin, xmax, ymax].
# Built column-wise (one list per row) instead of a row-by-row apply.
bbox_columns = ['xmin', 'ymin', 'xmax', 'ymax']
df['bbox'] = pd.Series(df[bbox_columns].to_numpy().tolist(), index=df.index)
df.head()

In [None]:
import pandas as pd
from PIL import Image
import os
from datasets import ClassLabel, Sequence, Value
import uuid


def get_image_path(image_id, image_dir="/content/images"):
    """Return the path of the image file for ``image_id``.

    Args:
        image_id: Identifier of the image; it is used verbatim as the last
            path component.
        image_dir: Directory that holds the image files. Defaults to
            ``/content/images``, the location that was previously hard-coded.

    Returns:
        str: ``image_dir`` and ``image_id`` joined with a forward slash.
    """
    return f"{image_dir}/{image_id}"

def get_image_dimensions(image_path):
    """Return the ``size`` of the image at ``image_path``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The ``size`` attribute of the opened image; the caller unpacks it as
        ``(width, height)``.
    """
    # Import under a private alias: a later cell runs
    # ``from datasets import ... Image``, which rebinds the global name
    # ``Image``; relying on that global would break ``Image.open`` on any call
    # made after that cell has run.
    from PIL import Image as PILImage

    # The context manager closes the file as soon as the size has been read.
    with PILImage.open(image_path) as img:
        return img.size

def process_group(group):
    """Collapse all annotation rows of one image into a single record.

    Meant to be passed to ``DataFrame.groupby('Image_ID').apply``, which sets
    ``group.name`` to the key (image id) of the group being processed.

    Args:
        group: DataFrame with the rows of one image. It must provide a
            ``bbox`` column ([xmin, ymin, xmax, ymax] per row) and a ``label``
            column.

    Returns:
        pandas.Series with the entries ``image_id`` (int), ``image`` (path
        string), ``width``, ``height`` and ``objects`` (a dict of parallel
        lists: ``bbox_id``, ``category``, ``bbox``, ``area``).
    """
    # Function-scope import so the cell does not depend on the import cell.
    import hashlib

    image_id = group.name
    image_path = get_image_path(image_id)

    # Read the image size from the file on disk.
    width, height = get_image_dimensions(image_path)

    bboxes = group['bbox'].tolist()
    objects = {
        # Per-image running index, zero-padded to four digits.
        'bbox_id': [f"{image_id}_{i:04d}" for i in range(len(bboxes))],
        'category': group['label'].tolist(),
        'bbox': bboxes,
        # (xmax - xmin) * (ymax - ymin)
        'area': [(box[2] - box[0]) * (box[3] - box[1]) for box in bboxes],
    }

    # The built-in hash() of a str is salted per interpreter process, so it
    # would give the same image a different id after every kernel restart.
    # A SHA-256 digest is stable; keeping 63 bits yields a non-negative value
    # that fits a signed 64-bit integer.
    digest = hashlib.sha256(str(image_id).encode('utf-8')).digest()
    image_id_int = int.from_bytes(digest[:8], 'big') >> 1

    return pd.Series({
        'image_id': image_id_int,
        'image': image_path,  # Store the image path instead of the PIL Image object
        'width': width,
        'height': height,
        'objects': objects,
    })

# Build the per-image table
def process_dataframe(df):
    """Apply ``process_group`` to every ``Image_ID`` group of ``df``.

    Returns the combined result with the group keys dropped from the index,
    i.e. with a fresh positional index.
    """
    per_image = df.groupby('Image_ID').apply(process_group)
    return per_image.reset_index(drop=True)

# Collapse the annotation rows into one record per image.
new_df = df.pipe(process_dataframe)

# Layout of one record in new_df, expressed with `datasets` feature types.
schema = dict(
    image_id=Value('int64'),
    image=Value('string'),  # a path string, not a PIL Image
    width=Value('int64'),
    height=Value('int64'),
    objects=dict(
        bbox_id=Sequence(Value('string')),
        category=Sequence(Value('int64')),
        bbox=Sequence(Sequence(Value('float64'), length=4)),
        area=Sequence(Value('float64')),
    ),
)

# Sanity check: show the head of the table, then unpack its first record.
print(new_df.head())

first_row = new_df.iloc[0]
print(
    f"Image ID: {first_row['image_id']}",
    f"Image path: {first_row['image']}",
    f"Image dimensions: {first_row['width']}x{first_row['height']}",
    "Objects:",
    f"  bbox_ids: {first_row['objects']['bbox_id']}",
    f"  categories: {first_row['objects']['category']}",
    f"  bboxes: {first_row['objects']['bbox']}",
    f"  areas: {first_row['objects']['area']}",
    sep="\n",
)

# Function to load image on demand
def load_image(image_path):
    """Open the image file at ``image_path`` and return the opened image object."""
    # Import under a private alias: a later cell runs
    # ``from datasets import ... Image``, which rebinds the global name
    # ``Image``; relying on that global would break ``Image.open`` on any call
    # made after that cell has run.
    from PIL import Image as PILImage

    return PILImage.open(image_path)

# Example of loading an image on demand
# first_image = load_image(new_df.iloc[0]['image'])

In [None]:
from datasets import Dataset, Features, Value, Sequence, Image


# Feature schema for the dataset. The 'image' values in new_df are file paths;
# here the column is declared with the `datasets` Image feature.
features = Features(
    dict(
        image_id=Value('int64'),
        image=Image(decode=True),
        width=Value('int64'),
        height=Value('int64'),
        objects=dict(
            bbox_id=Sequence(Value('string')),
            category=Sequence(Value('int64')),
            bbox=Sequence(Sequence(Value('float64'), length=4)),
            area=Sequence(Value('float64')),
        ),
    )
)

# Build the Hugging Face Dataset from the per-image table.
hf_dataset = Dataset.from_pandas(new_df, features=features)

In [None]:
# Upload the dataset to the Hugging Face Hub repository.
hf_dataset.push_to_hub(repo_id='jonathansuru/ano4')