In [None]:
%pip install git+https://github.com/AgDMALabs-Public/ag-vision-dataops.git

In [None]:
from roboflow import Roboflow
from matplotlib import pyplot as plt

import pandas as pd
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one already exists, otherwise create one.
# NOTE(review): on Databricks a session named 'spark' is presumably already provided — confirm
# for other environments.
spark = SparkSession.builder.getOrCreate()

In [None]:
from ag_vision.data_io import roboflow_io as rio
from ag_vision.data_io import annotation_io as aio

In [None]:
# Read the production images table through Spark and collect it into a pandas DataFrame.
imgs_df = (
    spark
    .table("use1_prod_artemis_catalog_3718194974443840.production.images_table")
    .toPandas()
)

In [None]:
import os

# This is the API key needed to push data to Roboflow. Go to Roboflow, settings, account,
# select a workspace, look for API keys. Use the Private API key.
# The key is read from the ROBOFLOW_API_KEY environment variable so the secret is never saved
# in the notebook; keep the value in a secrets store and do not paste it into this cell.
api_key = os.environ.get("ROBOFLOW_API_KEY", "")
if not api_key:
    # Fail here with an actionable message instead of a less obvious authentication error later.
    raise ValueError(
        "Roboflow API key is missing: set the ROBOFLOW_API_KEY environment variable "
        "before running this cell."
    )

# This is the CG workspace. If you are working from a different workspace you will need to change this.
rf_workspace = "cgiar-workspace"

# This is the project that you will push the images to. If you want to push to a new project,
# you need to create that project in Roboflow first, then copy its name and paste it here.
rf_project = "crop_type_classification-g9ywv"

# This is the path to the data in Databricks; change it based on the location of your project.
project_path = "/Volumes/use1_prod_artemis_catalog_3718194974443840/production/data/artemis"

# The annotation type should be one of:
# ['object_detection', 'instance_segmentation', 'classification', 'semantic_segmentation']
annotation_type = 'classification'

# This is the name of the task that you want to build a model for.
task_name = 'crop_type_classification'

# This is the name of the batch of images that you are pushing. A common choice is the task name
# followed by the date the images are pushed. If you are labeling a test set, you may want to put
# 'test' in the batch name. In the end many batches will be pushed per task.
batch_name = 'crop_type_classification_8-28-2025'

# Authenticate against Roboflow and resolve the target project inside the workspace.
rf = Roboflow(api_key=api_key)
project = rf.workspace(rf_workspace).project(rf_project)

In [None]:
# Preview the first rows of the images that are available for labeling.
imgs_df.head(n=5)

## Add in other columns to select images from

In [None]:
# Keep only rows with a non-null 'metadata_path', then add 'grouped_exposure': the exposure
# rounded to the nearest 10, used below as a grouping key.
# The column is added with .assign() on the filtered frame, which returns a new DataFrame
# instead of writing into a slice of the original (the pattern that can trigger pandas'
# SettingWithCopyWarning).
imgs_df = (
    imgs_df[imgs_df['metadata_path'].notna()]
    .assign(grouped_exposure=lambda frame: frame['exposure'].round(-1))
)

## Develop logic to select images for annotating.

In [None]:
# Columns that define one sampling group; up to `sample_size` images are drawn from each group.
group_columns = ['season', 'trial', 'protocol', 'grouped_exposure', 'orientation']
sample_size = 2
# Fixed seed so that re-running the notebook selects the same images for the batch.
sample_seed = 42

annotation_df_list = []
for group_no, (group_key, group_df) in enumerate(imgs_df.groupby(group_columns)):
    print(group_key)
    if len(group_df) < sample_size:
        # Fewer rows than requested: keep the whole group.
        annotation_df_list.append(group_df)
    else:
        # Offset the seed per group so equally sized groups do not all draw the same row positions.
        annotation_df_list.append(
            group_df.sample(sample_size, random_state=sample_seed + group_no)
        )

# pd.concat raises ValueError on an empty list, so fall back to an empty frame with the same
# columns when there is nothing to group.
if annotation_df_list:
    annotation_df = pd.concat(annotation_df_list)
else:
    annotation_df = imgs_df.iloc[0:0]

print(len(annotation_df))
# Last expression of the cell, so the preview is actually rendered.
annotation_df.head()

## Look at your data.

In [None]:
# Histogram of 'exposure' for the selected images. Drawn on an explicit Axes so the figure
# carries a title and axis labels and can be read on its own.
fig, ax = plt.subplots()
annotation_df['exposure'].hist(ax=ax)
ax.set(
    title='Exposure of images selected for annotation',
    xlabel='exposure',
    ylabel='Number of images',
)
plt.show()

## Create an annotation batch.
* This saves the images in the annotation folder, gives each image a new UUID, and creates a metadata file.

In [None]:
# Create the annotation batch for this task from the sampled image paths.
selected_image_paths = annotation_df['image_path'].tolist()
aio.create_annotation_batch(
    img_list=selected_image_paths,
    project_path=project_path,
    annotation_type=annotation_type,
    task_name=task_name,
    batch_name=batch_name,
)

## Data Augmentation
* At this point, if you need to augment the images, you can read them in from the batch, augment them, and then save them back to the annotation folder.

In [None]:
# logic to augment the images, resize, crop, tile and make more images... just make sure there is one image

## Push the batch up to Roboflow.
* Roboflow deduplicates images automatically, so if the upload fails you can simply rerun the code and Roboflow will handle the duplicate images.

In [None]:
# Push the batch to the Roboflow project with split='train'. Roboflow checks for duplicate
# images and skips them on upload, which is why some images are missing in RF.
rio.upload_image_batch_to_roboflow(
    rf_project=project,
    project_path=project_path,
    annotation_type=annotation_type,
    task_name=task_name,
    batch_name=batch_name,
    split='train',
    tmp_copy=True,
)

## Label, train, then download
