In [1]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
import polars as pl
from PIL import Image
import numpy as np
import io

load_dotenv()

True

In [3]:
def load_and_convert_image(filepath: str):
    full_path = os.path.join(os.environ["PATH_ARCHIVE"], filepath.replace("/", os.sep))
    im = Image.open(full_path)
    return np.array(im).tobytes()


def load_and_convert_image_2(filepath: str):
    full_path = os.path.join(os.environ["PATH_ARCHIVE"], filepath.replace("/", os.sep))
    im = Image.open(full_path)
    image_bytes = io.BytesIO()
    im.save(image_bytes, format="JPEG")
    return image_bytes.getvalue()


def transform_df(df: pl.DataFrame):
    df = df.with_columns(pl.col("image:FILE").str.split(by="/").alias("split"))
    df = df.select(
        pl.col("split").list.get(2).str.strip_suffix("_c.jpg").alias("id"),
        pl.col("split").list.get(1).alias("flower_type"),
        pl.col("category"),
        pl.col("image:FILE")
        .map_elements(load_and_convert_image_2, return_dtype=pl.Binary)
        .alias("data"),
        pl.col("image:FILE").alias("filepath"),
    )
    filepath = df.row(0, named=True)["filepath"]
    print(len(load_and_convert_image(filepath)))
    print(len(load_and_convert_image_2(filepath)))
    print(len(df.row(0, named=True)["data"]))
    return df


def upload_to_collection(collection, csv_filename, reset=True):
    df = pl.read_csv(os.path.join(os.environ["PATH_ARCHIVE"], csv_filename))

    if reset:
        result = collection.delete_many({})
        print(result.deleted_count)

    # transform the dataset df
    df = transform_df(df)

    # insert into mongodb
    result = collection.insert_many(df.to_dicts())
    document_ids = result.inserted_ids
    print("# of documents inserted: " + str(len(document_ids)))
    print(f"_ids of inserted documents: {document_ids}")

In [2]:
# setup database
client = MongoClient(os.environ["URI"])

# Get reference to 'bank' database
db = client.flowers

# Get a reference to train and test collection
collections = {"train.csv": db.train, "val.csv": db.test}

In [3]:
for csv_filename, collection in collections.items():
    upload_to_collection(collection, csv_filename)

NameError: name 'upload_to_collection' is not defined

In [4]:
# Create indices for the image class
import pymongo


index1 = pymongo.IndexModel([("category", pymongo.ASCENDING)])
index2 = pymongo.IndexModel([("flower_type", pymongo.TEXT)])

for collection in collections.values():
    collection.create_indexes([index1, index2])

In [7]:
for index in collection.list_indexes():
    print(index)

SON([('v', 2), ('key', SON([('_id', 1)])), ('name', '_id_')])
SON([('v', 2), ('key', SON([('category', 1)])), ('name', 'category_1')])
SON([('v', 2), ('key', SON([('_fts', 'text'), ('_ftsx', 1)])), ('name', 'flower_type_text'), ('weights', SON([('flower_type', 1)])), ('default_language', 'english'), ('language_override', 'language'), ('textIndexVersion', 3)])


In [6]:
client.close()