In [None]:
%pip install clarifai-pyspark
# The line above installs the clarifai-pyspark SDK used by the rest of this notebook.
# NOTE(review): no version is pinned, so re-runs may pick up a newer release — consider pinning.

# Testing ClarifaiPySpark SDK

### Getting Started

In [None]:
from clarifaipyspark.client import ClarifaiPySpark
from pyspark.sql import SparkSession

In [None]:
# Read the Clarifai personal access token (PAT) from the "clarifai" secret
# scope instead of hardcoding it in the notebook.
CLARIFAI_PAT = dbutils.secrets.get(
    key="clarifai-pat",
    scope="clarifai",
)

### Creating ClarifaiPySpark object & creating/fetching image dataset from app

In [None]:
# Client bound to the image test app, authenticated with the PAT read earlier.
cspark_obj = ClarifaiPySpark(
    pat=CLARIFAI_PAT,
    user_id="mansi_k",
    app_id="databricks_tester_img",
)

# Handle for the dataset with id "dataset1" in that app; every image
# upload/export cell below goes through this object.
dataset_obj = cspark_obj.dataset(dataset_id="dataset1")

### Upload images from volume folder to Clarifai dataset

In [None]:
# Upload the images stored under the volume folder into the Clarifai dataset.
# NOTE(review): labels=True presumably derives labels from the folder/file
# layout — confirm against the SDK documentation.
dataset_obj.upload_dataset_from_folder(
    input_type="image",
    labels=True,
    folder_path="/Volumes/mansi_test/default/cat",
)

### Fetching current inputs from image dataset

In [None]:
# Materialise the dataset's image inputs once; later cells reuse `my_inputs`
# (e.g. when exporting images to a volume).
my_inputs = list(dataset_obj.list_inputs(input_type="image"))

# Show the total and a small sample only: dumping the full list of input
# objects floods the cell output and bloats the saved notebook.
print(f"Fetched {len(my_inputs)} image inputs")
my_inputs[:2]

[id: "img31"
 data {
   image {
     url: "https://mymodernmet.com/wp/wp-content/uploads/2023/01/how-to-draw-a-duck-fb-thumbnail.jpg"
     hosted {
       prefix: "https://data.clarifai.com"
       suffix: "users/mansi_k/apps/databricks_tester_img/inputs/image/2d7a8f8aeb24b42bdae2dce75fdecb6d"
       sizes: "orig"
       sizes: "tiny"
       sizes: "small"
       sizes: "large"
       crossorigin: "use-credentials"
     }
     image_info {
       width: 1200
       height: 630
       format: "JPEG"
       color_mode: "YUV"
     }
   }
   concepts {
     id: "id-duck"
     name: "duck"
     value: 1
     app_id: "databricks_tester_img"
   }
 }
 created_at {
   seconds: 1702316868
   nanos: 297862000
 }
 modified_at {
   seconds: 1702316869
   nanos: 507025000
 }
 status {
   code: INPUT_DOWNLOAD_SUCCESS
   description: "Download complete"
 },
 id: "dataset1-train-3647386"
 data {
   image {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_img/inputs/image/1

### Export image annotations to spark dataframe

In [None]:
# Export the dataset's annotations into a Spark DataFrame.
annot_df = dataset_obj.export_annotations_to_dataframe()
# Bare expression: displays the DataFrame's repr (column names and types), not its rows.
annot_df

DataFrame[annotation: string, annotation_created_at: string, annotation_id: string, annotation_modified_at: string, annotation_user_id: string, input_id: string]

### Write annotations dataframe to a Delta table in the catalog

In [None]:
# Persist the annotations as a table, replacing any previous contents
# ("overwrite" mode) so the cell can be re-run safely.
(
    annot_df.write
    .mode("overwrite")
    .saveAsTable("mansi_test.default.imgAnnsDeltaTable")
)


### Load annotations Delta table from the catalog

In [None]:
# Reuse the active Spark session, or create one named "Clarifai-pyspark".
spark = (
    SparkSession.builder
    .appName("Clarifai-pyspark")
    .getOrCreate()
)
# Tag the session with the clarifai-pyspark agent id.
spark.conf.set("spark.databricks.agent.id", "clarifai-pyspark")

# Read the saved annotations table back and preview its first 10 rows.
df_delta = spark.read.table("mansi_test.default.imgAnnsDeltaTable")
df_delta.show(10)

+--------------------+---------------------+--------------------+----------------------+------------------+--------------------+
|          annotation|annotation_created_at|       annotation_id|annotation_modified_at|annotation_user_id|            input_id|
+--------------------+---------------------+--------------------+----------------------+------------------+--------------------+
|concepts {\n  id:...|  11/17/% 10:48:   %5|ed88ef462bfc420cb...|   11/17/% 10:48:   %5|           mansi_k|dataset1-train-33...|
|concepts {\n  id:...|  11/17/% 10:48:   %5|8a548049732840edb...|   11/17/% 10:48:   %5|           mansi_k|               img21|
|concepts {\n  id:...|  11/17/% 10:48:   %5|99e1544ff42a4695a...|   11/17/% 10:48:   %5|           mansi_k|               img11|
|concepts {\n  id:...|  11/17/% 10:41:   %5|170d036a60544e789...|   11/17/% 10:41:   %5|           mansi_k|                  c9|
|concepts {\n  id:...|  11/17/% 10:41:   %5|7edf0bc509454a9ca...|   11/17/% 10:41:   %5|         

### Loop through delta table rows & columns

In [None]:
# Print every field of the first four annotation rows.
# take(4) brings only four rows to the driver, whereas collect()[:4] would
# first materialise the entire table in driver memory.
for row in df_delta.take(4):
    for column in (
        "annotation_id",
        "input_id",
        "annotation_user_id",
        "annotation_created_at",
        "annotation_modified_at",
        "annotation",
    ):
        print(row[column])
    # Visual separator between rows.
    print("########################\n")

b3847d13486f4f1189315ad4565221db
c2
mansi_k
10/23/% 08:27:   %5
10/23/% 08:27:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################

f65d75b3446b40ffa012735a31dcbb62
c11
mansi_k
10/23/% 08:27:   %5
10/23/% 08:27:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################

76ac2fcb998440bc8d2a5fffd3a8ae1a
c8
mansi_k
11/17/% 10:41:   %5
11/17/% 10:41:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################

7edf0bc509454a9caa74ac15d5604211
c4
mansi_k
11/17/% 10:41:   %5
11/17/% 10:41:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################



### Testing export_images_to_volume()

In [None]:
# Export only the first five fetched inputs to keep this test run small.
dataset_obj.export_images_to_volume(
    input_response=my_inputs[:5],
    path="/Volumes/mansi_test/default/test_vol1",
)

### Load image csv file & upload image data into app using dataframe

In [None]:
# Reuse the active Spark session, or create one named "Clarifai-pyspark".
spark = (
    SparkSession.builder
    .appName("Clarifai-pyspark")
    .getOrCreate()
)
# Tag the session with the clarifai-pyspark agent id.
spark.conf.set("spark.databricks.agent.id", "clarifai-pyspark")

# Load the image CSV, treating its first line as the header row.
df = (
    spark.read
    .option("header", True)
    .csv("/Volumes/mansi_test/default/test_vol1/img_data2.csv")
)
df.show()

+-------+--------------------+--------+
|inputid|               input|concepts|
+-------+--------------------+--------+
|  img11|https://img.freep...|     dog|
|  img21|https://images.un...|    duck|
|  img31|https://mymodernm...|    duck|
+-------+--------------------+--------+



In [None]:
# Upload the rows of `df` as image inputs; df_type='url' marks the input
# column as holding image URLs rather than raw content.
dataset_obj.upload_dataset_from_dataframe(
    dataframe=df,
    input_type="image",
    df_type="url",
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]Uploading inputs: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]Uploading inputs: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]


### Upload images from a delta table to Clarifai app

In [None]:
# Upload image inputs (given as URLs) straight from a Delta table stored in the volume.
dataset_obj.upload_dataset_from_table(
    table_path="/Volumes/mansi_test/default/test_vol1/imgdeltatable1",
    table_type="url",
    input_type="image",
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading inputs: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]Uploading inputs: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]


### Using a custom dataloader to upload images from Volume

In [None]:
# Build a dataloader from the dataset module stored in the volume.
from clarifai.datasets.upload.utils import load_module_dataloader

food_dataloader = load_module_dataloader(module_dir="/Volumes/mansi_test/default/test_vol1/food-101")

# Upload through the custom dataloader. The original cell only built the
# dataloader, although this section is about uploading and its recorded
# output shows an "Uploading Dataset" progress bar.
# NOTE(review): confirm `upload_dataset(dataloader=...)` matches the installed SDK version.
dataset_obj.upload_dataset(dataloader=food_dataloader)

Uploading Dataset:   0%|          | 0/1 [00:00<?, ?it/s]Uploading Dataset: 100%|██████████| 1/1 [00:02<00:00,  2.61s/it]Uploading Dataset: 100%|██████████| 1/1 [00:02<00:00,  2.61s/it]


### Export annotations with their associated images into Databricks Volume

In [None]:
# Export the dataset's annotations to the given volume path; the next cell
# reads that path back as a Delta table.
dataset_obj.export_annotations_to_volume(
    volumepath="/Volumes/mansi_test/default/test_vol4"
)

Exporting Images:   0%|          | 0/20 [00:00<?, ?it/s]Exporting Images:   5%|▌         | 1/20 [00:01<00:32,  1.71s/it]Exporting Images:  20%|██        | 4/20 [00:01<00:05,  2.84it/s]Exporting Images:  30%|███       | 6/20 [00:02<00:04,  3.15it/s]Exporting Images:  40%|████      | 8/20 [00:03<00:03,  3.10it/s]Exporting Images:  45%|████▌     | 9/20 [00:03<00:03,  3.31it/s]Exporting Images:  55%|█████▌    | 11/20 [00:03<00:02,  4.03it/s]Exporting Images:  65%|██████▌   | 13/20 [00:03<00:01,  5.47it/s]Exporting Images:  80%|████████  | 16/20 [00:04<00:00,  4.65it/s]Exporting Images:  85%|████████▌ | 17/20 [00:04<00:00,  4.62it/s]Exporting Images:  95%|█████████▌| 19/20 [00:04<00:00,  5.55it/s]Exporting Images: 100%|██████████| 20/20 [00:04<00:00,  4.08it/s]


In [None]:
# Read the exported annotations back from the volume path in Delta format
# and preview the first 4 rows.
df_delta = (
    spark.read
    .format("delta")
    .load("/Volumes/mansi_test/default/test_vol4")
)
df_delta.show(4)

+--------------------+---------------------+--------------------+----------------------+------------------+--------------------+--------------------+
|          annotation|annotation_created_at|       annotation_id|annotation_modified_at|annotation_user_id|           image_url|            input_id|
+--------------------+---------------------+--------------------+----------------------+------------------+--------------------+--------------------+
|concepts {\n  id:...|  11/17/% 10:48:   %5|8d1809e5078d40579...|   11/17/% 10:48:   %5|           mansi_k|https://data.clar...|dataset1-train-35...|
|concepts {\n  id:...|  11/17/% 10:48:   %5|8e246698055843cfa...|   11/17/% 10:48:   %5|           mansi_k|https://data.clar...|dataset1-train-24...|
|concepts {\n  id:...|  11/17/% 10:41:   %5|f044bc745f754544b...|   11/17/% 10:41:   %5|           mansi_k|https://data.clar...|                  c7|
|concepts {\n  id:...|  10/23/% 08:27:   %5|0a28d0a6b7a04234b...|   10/23/% 08:27:   %5|           m

### Creating ClarifaiPySpark object & creating/fetching text dataset from app

In [None]:
# Re-point the client at the text test app; this rebinds `cspark_obj` and
# `dataset_obj`, so all cells below operate on the text dataset.
cspark_obj = ClarifaiPySpark(
    pat=CLARIFAI_PAT,
    user_id="mansi_k",
    app_id="databricks_tester_txt",
)

# Handle for the dataset with id "dataset1" in the text app.
dataset_obj = cspark_obj.dataset(dataset_id="dataset1")

### Fetching current inputs from text dataset

In [None]:
# Materialise the dataset's text inputs once; the export cell below reuses `my_inputs`.
my_inputs = list(dataset_obj.list_inputs(input_type="text"))

# Show the total and a small sample only: dumping the full list of input
# objects floods the cell output and bloats the saved notebook.
print(f"Fetched {len(my_inputs)} text inputs")
my_inputs[:2]

[id: "XFmGD0xHlNXgGIXF"
 data {
   text {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_txt/inputs/text/65c1aa5487711a23f7477200fd01e253"
     hosted {
       prefix: "https://data.clarifai.com"
       suffix: "users/mansi_k/apps/databricks_tester_txt/inputs/text/65c1aa5487711a23f7477200fd01e253"
       sizes: "orig"
       crossorigin: "use-credentials"
     }
     text_info {
       char_count: 3
       encoding: "UTF8"
     }
   }
 }
 created_at {
   seconds: 1698678263
   nanos: 736342000
 }
 modified_at {
   seconds: 1698678273
   nanos: 49546000
 }
 status {
   code: INPUT_DOWNLOAD_SUCCESS
   description: "Download complete"
 },
 id: "Ak1n8DZ1l1RWKATv"
 data {
   text {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_txt/inputs/text/5a8dd3ad0756a93ded72b823b19dd877"
     hosted {
       prefix: "https://data.clarifai.com"
       suffix: "users/mansi_k/apps/databricks_tester_txt/inputs/text/5a8dd3ad0756a93ded72b823b19

### Testing export_text_to_volume()

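**TODO (open question):** Should the exported text be organised into folders according to concept name?   
**TODO (open question):** Should the exported text be saved as CSV?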

In [None]:
# Export all fetched text inputs to the volume.
dataset_obj.export_text_to_volume(
    input_response=my_inputs,
    path="/Volumes/mansi_test/default/test_vol1",
)

### Upload text from volume csv to Clarifai dataset

In [None]:
# Upload text inputs from a CSV in the volume; csv_type='raw' marks the
# input column as holding the text itself rather than a URL or file path.
dataset_obj.upload_dataset_from_csv(
    csv_path="/Volumes/mansi_test/default/test_vol1/emotions_data1.csv",
    csv_type="raw",
    input_type="text",
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]Uploading inputs: 100%|██████████| 1/1 [00:20<00:00, 20.56s/it]Uploading inputs: 100%|██████████| 1/1 [00:20<00:00, 20.56s/it]


### Load text csv file & upload text data into app using dataframe

In [None]:
# Reuse the active Spark session, or create one named "Clarifai-pyspark".
spark = (
    SparkSession.builder
    .appName("Clarifai-pyspark")
    .getOrCreate()
)
# Tag the session with the clarifai-pyspark agent id.
spark.conf.set("spark.databricks.agent.id", "clarifai-pyspark")

# Load the text CSV, treating its first line as the header row.
df = (
    spark.read
    .option("header", True)
    .csv("/Volumes/mansi_test/default/test_vol1/emotions_data1.csv")
)
df.show()

+-------+--------------------+--------+
|inputid|               input|concepts|
+-------+--------------------+--------+
|    t11|   I am feeling good|   happy|
|    t21|This is a guava tree| neutral|
|    t31|This is a saddeni...|     sad|
+-------+--------------------+--------+



In [None]:
# Upload the rows of `df` as text inputs; df_type='raw' marks the input
# column as holding the text itself.
dataset_obj.upload_dataset_from_dataframe(
    dataframe=df,
    input_type="text",
    df_type="raw",
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  5.48it/s]Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  5.45it/s]
