In [None]:
%pip install clarifai-pyspark

# Testing ClarifaiPySpark SDK

### Getting Started

In [None]:
from clarifai.datasets.upload.utils import load_module_dataloader
from clarifaipyspark.client import ClarifaiPySpark
from pyspark.sql import SparkSession

# Read the Clarifai PAT from the `clarifai` secret scope via `dbutils`
# so the token is never hard-coded in the notebook.
CLARIFAI_PAT = dbutils.secrets.get(
    scope="clarifai",
    key="clarifai-pat",
)

### Creating a ClarifaiPySpark object & creating/fetching the image dataset from the app

In [None]:
# Client bound to the image app, authenticated with the PAT loaded earlier.
cspark_obj = ClarifaiPySpark(
    user_id='mansi_k',
    app_id='databricks_tester_img',
    pat=CLARIFAI_PAT,
)

# Handle for dataset `dataset1` inside that app.
dataset_obj = cspark_obj.dataset(dataset_id='dataset1')

### Fetching current inputs from image dataset

In [None]:
# Materialise the iterable of image inputs into a list so it can be
# sliced and reused by later cells, then display it.
my_inputs = [*dataset_obj.list_inputs(input_type='image')]
my_inputs

[id: "c10"
 data {
   image {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_img/inputs/image/c1b5a9392f89cf42deca348758a63c79"
     hosted {
       prefix: "https://data.clarifai.com"
       suffix: "users/mansi_k/apps/databricks_tester_img/inputs/image/c1b5a9392f89cf42deca348758a63c79"
       sizes: "orig"
       sizes: "tiny"
       sizes: "small"
       sizes: "large"
       crossorigin: "use-credentials"
     }
     image_info {
       width: 1000
       height: 665
       format: "JPEG"
       color_mode: "YUV"
     }
   }
   concepts {
     id: "id-cat"
     name: "cat"
     value: 1.0
     app_id: "databricks_tester_img"
   }
 }
 created_at {
   seconds: 1698049660
   nanos: 961070000
 }
 modified_at {
   seconds: 1698049664
   nanos: 89517000
 }
 status {
   code: INPUT_DOWNLOAD_SUCCESS
   description: "Download complete"
 },
 id: "c3"
 data {
   image {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_img/inputs/im

### Upload images from volume folder to Clarifai dataset

In [None]:
# Upload the images stored under the `cat` volume folder.
# NOTE(review): with labels=True the label presumably comes from the
# folder name — confirm against the SDK docs.
dataset_obj.upload_dataset_from_folder(
    folder_path='/Volumes/mansi_test/default/cat',
    input_type='image',
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]Uploading inputs: 100%|██████████| 1/1 [00:05<00:00,  5.36s/it]Uploading inputs: 100%|██████████| 1/1 [00:05<00:00,  5.36s/it]


### Export image annotations to spark dataframe

In [None]:
# Export the dataset's annotations into a dataframe; the bare expression on
# the last line displays the dataframe's repr (its column schema).
annot_df = dataset_obj.export_annotations_to_dataframe()
annot_df

DataFrame[annotation: string, annotation_created_at: string, annotation_id: string, annotation_modified_at: string, annotation_user_id: string, input_id: string]

### Write annotations dataframe to volume as delta table

In [None]:
# Persist the annotations as a Delta table on the volume; "overwrite"
# replaces whatever is already stored at that path.
(
    annot_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/mansi_test/default/test_vol1/imgAnnsDeltaTable2")
)


### Load annotations delta table from volume

In [None]:
# Get (or start) the Spark session and set the agent-id conf on it.
spark = (
    SparkSession.builder
    .appName("Clarifai-pyspark")
    .getOrCreate()
)
spark.conf.set("spark.databricks.agent.id", "clarifaipyspark")

# Load the annotations Delta table from the volume and preview it.
df_delta = (
    spark.read
    .format("delta")
    .load("/Volumes/mansi_test/default/test_vol1/imgAnnsDeltaTable2")
)
df_delta.show()

+--------------------+---------------------+--------------------+----------------------+------------------+--------+
|          annotation|annotation_created_at|       annotation_id|annotation_modified_at|annotation_user_id|input_id|
+--------------------+---------------------+--------------------+----------------------+------------------+--------+
|concepts {\n  id:...|  10/23/% 08:27:   %5|b3847d13486f4f118...|   10/23/% 08:27:   %5|           mansi_k|      c2|
|concepts {\n  id:...|  10/23/% 08:27:   %5|f65d75b3446b40ffa...|   10/23/% 08:27:   %5|           mansi_k|     c11|
|concepts {\n  id:...|  11/17/% 10:41:   %5|76ac2fcb998440bc8...|   11/17/% 10:41:   %5|           mansi_k|      c8|
|concepts {\n  id:...|  11/17/% 10:41:   %5|7edf0bc509454a9ca...|   11/17/% 10:41:   %5|           mansi_k|      c4|
|concepts {\n  id:...|  10/23/% 08:27:   %5|8fd99dc05eef46849...|   10/23/% 08:27:   %5|           mansi_k|      c1|
|concepts {\n  id:...|  11/17/% 10:41:   %5|f044bc745f754544b...

### Loop through delta table rows & columns

In [None]:
# Print the first four annotation rows, one field per line.
# `take(4)` fetches only the rows that are printed, instead of collecting the
# whole Delta table to the driver and then slicing it.
for row in df_delta.take(4):
    # Fields are printed in this fixed order for every row.
    for column in (
        'annotation_id',
        'input_id',
        'annotation_user_id',
        'annotation_created_at',
        'annotation_modified_at',
        'annotation',
    ):
        print(row[column])
    print("########################\n")

b3847d13486f4f1189315ad4565221db
c2
mansi_k
10/23/% 08:27:   %5
10/23/% 08:27:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################

f65d75b3446b40ffa012735a31dcbb62
c11
mansi_k
10/23/% 08:27:   %5
10/23/% 08:27:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################

76ac2fcb998440bc8d2a5fffd3a8ae1a
c8
mansi_k
11/17/% 10:41:   %5
11/17/% 10:41:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################

7edf0bc509454a9caa74ac15d5604211
c4
mansi_k
11/17/% 10:41:   %5
11/17/% 10:41:   %5
concepts {
  id: "id-cat"
  name: "cat"
  value: 1.0
  app_id: "databricks_tester_img"
}

########################



### Testing export_images_to_volume()

In [None]:
# Export only the first five fetched image inputs to the volume.
dataset_obj.export_images_to_volume(
    path="/Volumes/mansi_test/default/test_vol1",
    input_response=my_inputs[:5],
)

### Load image CSV file & upload image data into the app using a dataframe

In [None]:
# Get (or start) the Spark session and set the agent-id conf on it.
spark = (
    SparkSession.builder
    .appName("Clarifai-pyspark")
    .getOrCreate()
)
spark.conf.set("spark.databricks.agent.id", "clarifaipyspark")

# Read the image CSV (first line is the header) and preview it.
df = (
    spark.read
    .option("header", True)
    .csv("/Volumes/mansi_test/default/test_vol1/img_data2.csv")
)
df.show()

+-------+--------------------+--------+
|inputid|               input|concepts|
+-------+--------------------+--------+
|  img11|https://img.freep...|     dog|
|  img21|https://images.un...|    duck|
|  img31|https://mymodernm...|    duck|
+-------+--------------------+--------+



In [None]:
# Upload the image rows from the dataframe; `df_type='url'` because the
# `input` column holds image URLs (see the preview above).
dataset_obj.upload_dataset_from_dataframe(
    dataframe=df,
    input_type='image',
    df_type='url',
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]Uploading inputs: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]Uploading inputs: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]


### Using a custom dataloader to upload images from Volume

In [None]:
# Form a dataloader from the dataset module stored in the volume's
# `food-101` directory.
food_dataloader = load_module_dataloader(module_dir="/Volumes/mansi_test/default/test_vol1/food-101")

In [None]:
# Upload everything the custom food dataloader yields into the current dataset.
dataset_obj.upload_dataset_from_dataloader(dataloader=food_dataloader)

Uploading Dataset:   0%|          | 0/1 [00:00<?, ?it/s]Uploading Dataset: 100%|██████████| 1/1 [00:02<00:00,  2.63s/it]Uploading Dataset: 100%|██████████| 1/1 [00:02<00:00,  2.63s/it]


### Creating a ClarifaiPySpark object & creating/fetching the text dataset from the app

In [None]:
# Re-point the client at the text app; from here on `cspark_obj` and
# `dataset_obj` refer to the text app, not the image app.
cspark_obj = ClarifaiPySpark(
    user_id='mansi_k',
    app_id='databricks_tester_txt',
    pat=CLARIFAI_PAT,
)

# Handle for dataset `dataset1` inside the text app.
dataset_obj = cspark_obj.dataset(dataset_id='dataset1')

### Fetching current inputs from text dataset

In [None]:
# Materialise the iterable of text inputs into a list (replacing the image
# inputs previously held in `my_inputs`), then display it.
my_inputs = [*dataset_obj.list_inputs(input_type='text')]

my_inputs

[id: "XFmGD0xHlNXgGIXF"
 data {
   text {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_txt/inputs/text/65c1aa5487711a23f7477200fd01e253"
     hosted {
       prefix: "https://data.clarifai.com"
       suffix: "users/mansi_k/apps/databricks_tester_txt/inputs/text/65c1aa5487711a23f7477200fd01e253"
       sizes: "orig"
       crossorigin: "use-credentials"
     }
     text_info {
       char_count: 3
       encoding: "UTF8"
     }
   }
 }
 created_at {
   seconds: 1698678263
   nanos: 736342000
 }
 modified_at {
   seconds: 1698678273
   nanos: 49546000
 }
 status {
   code: INPUT_DOWNLOAD_SUCCESS
   description: "Download complete"
 },
 id: "Ak1n8DZ1l1RWKATv"
 data {
   text {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_txt/inputs/text/5a8dd3ad0756a93ded72b823b19dd877"
     hosted {
       prefix: "https://data.clarifai.com"
       suffix: "users/mansi_k/apps/databricks_tester_txt/inputs/text/5a8dd3ad0756a93ded72b823b19

### Testing export_text_to_volume()

**Open questions**

- Should the export create one folder per concept name?
- Should the text be saved as a CSV file?

In [None]:
# Export every fetched text input to the volume.
dataset_obj.export_text_to_volume(
    path="/Volumes/mansi_test/default/test_vol1",
    input_response=my_inputs,
)

### Upload text from volume csv to Clarifai dataset

In [None]:
# Upload labelled text straight from the CSV on the volume.
# NOTE(review): `csv_type='raw'` presumably means the CSV holds the text
# itself rather than URLs — confirm against the SDK docs.
dataset_obj.upload_dataset_from_csv(
    csv_path='/Volumes/mansi_test/default/test_vol1/emotions_data1.csv',
    input_type='text',
    labels=True,
    csv_type='raw',
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]Uploading inputs: 100%|██████████| 1/1 [00:20<00:00, 20.56s/it]Uploading inputs: 100%|██████████| 1/1 [00:20<00:00, 20.56s/it]


### Load text CSV file & upload text data into the app using a dataframe

In [None]:
# Get (or start) the Spark session and set the agent-id conf on it.
spark = (
    SparkSession.builder
    .appName("Clarifai-pyspark")
    .getOrCreate()
)
spark.conf.set("spark.databricks.agent.id", "clarifaipyspark")

# Read the emotions CSV (first line is the header) and preview it.
df = (
    spark.read
    .option("header", True)
    .csv("/Volumes/mansi_test/default/test_vol1/emotions_data1.csv")
)
df.show()

+-------+--------------------+--------+
|inputid|               input|concepts|
+-------+--------------------+--------+
|    t11|   I am feeling good|   happy|
|    t21|This is a guava tree| neutral|
|    t31|This is a saddeni...|     sad|
+-------+--------------------+--------+



In [None]:
# Upload the text rows from the dataframe; `df_type='raw'` because the
# `input` column holds the text itself (see the preview above).
dataset_obj.upload_dataset_from_dataframe(
    dataframe=df,
    input_type='text',
    df_type='raw',
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  5.48it/s]Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  5.45it/s]
