In [None]:
!pip install clarifaipyspark
!pip install protobuf==4.24.2

# Testing ClarifaiPySpark SDK

### Setting Env variables

In [None]:
import os

from clarifaipyspark.client import ClarifaiPySpark
from pyspark.sql import SparkSession

# Never commit a real PAT in this notebook. A token that the environment
# already provides (e.g. a cluster-level environment variable) is kept as-is;
# the empty placeholder is written only when no token is present and must then
# be filled in locally. Presumably the Clarifai client reads this variable for
# authentication -- TODO confirm.
if not os.environ.get('CLARIFAI_PAT'):
    os.environ['CLARIFAI_PAT'] = ''

### Creating ClarifaiPySpark object & creating/fetching image dataset from app

In [None]:
# Client handle bound to the image app of this user.
cspark_obj = ClarifaiPySpark(
    user_id='mansi_k',
    app_id='databricks_tester_img',
)

# Handle for the dataset with id 'dataset1' inside that app.
dataset_obj = cspark_obj.dataset(dataset_id='dataset1')

### Fetching current inputs from image dataset

In [None]:
# list_inputs() yields inputs lazily; materialise them once so later cells can
# slice and reuse the result.
my_inputs = list(dataset_obj.list_inputs())

# Preview only the first couple of inputs: echoing the whole list prints every
# input's full protobuf and floods the cell output.
my_inputs[:2]

[id: "dataset1-train-3647386"
 data {
   image {
     url: "https://data.clarifai.com/orig/users/mansi_k/apps/databricks_tester_img/inputs/image/1249d3d8c8157d1b2dcdb2561175374a"
     hosted {
       prefix: "https://data.clarifai.com"
       suffix: "users/mansi_k/apps/databricks_tester_img/inputs/image/1249d3d8c8157d1b2dcdb2561175374a"
       sizes: "orig"
       sizes: "tiny"
       sizes: "small"
       sizes: "large"
       crossorigin: "use-credentials"
     }
     image_info {
       width: 512
       height: 288
       format: "JPEG"
       color_mode: "YUV"
     }
   }
   concepts {
     id: "id-hamburger"
     name: "hamburger"
     value: 1
     app_id: "databricks_tester_img"
   }
 }
 created_at {
   seconds: 1698052012
   nanos: 477555000
 }
 modified_at {
   seconds: 1698052013
   nanos: 977120000
 }
 status {
   code: INPUT_DOWNLOAD_SUCCESS
   description: "Download complete"
 },
 id: "dataset1-train-3520891"
 data {
   image {
     url: "https://data.clarifai.com/orig/u

### Upload images from volume folder to Clarifai dataset

In [None]:
# Upload the contents of the 'cat' volume folder as image inputs.
# NOTE(review): labels=True presumably derives the concept from the folder
# name -- confirm against the SDK.
dataset_obj.upload_dataset_from_folder(
    folder_path='/Volumes/mansi_test/default/cat',
    input_type='image',
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]Uploading inputs: 100%|██████████| 1/1 [00:05<00:00,  5.36s/it]Uploading inputs: 100%|██████████| 1/1 [00:05<00:00,  5.36s/it]


### Export image annotations to spark dataframe

In [None]:
# Export the dataset's annotations; the result is kept in `annot_df` and is
# written out as a Delta table by a later cell.
annot_df = dataset_obj.export_annotations_to_dataframe()
# Echo the object so the cell output shows its schema.
annot_df

DataFrame[annotation: map<string,array<map<string,string>>>, created_at: string, id: string, input_id: string, modified_at: string, user_id: string]

### Write annotations dataframe to volume as delta table

In [None]:
# Persist the annotations as a Delta table on the volume, replacing any
# previous contents at that path.
(
    annot_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/mansi_test/default/test_vol1/imgAnnsDeltaTable1")
)


### Load annotations delta table from volume

In [None]:
# Obtain the Spark session explicitly instead of relying on a `spark` global
# injected by the hosting platform: the notebook otherwise defines `spark` only
# in a later cell, so a fresh top-to-bottom run elsewhere would fail here with
# a NameError. getOrCreate() reuses an already-running session if there is one.
spark = SparkSession.builder.appName("Clarifai-spark").getOrCreate()

# Read back the Delta table written by the previous section and preview it.
df_delta = spark.read.format("delta").load("/Volumes/mansi_test/default/test_vol1/imgAnnsDeltaTable1")
df_delta.show()

+--------------------+-------------------+--------------------+--------+-------------------+-------+
|          annotation|         created_at|                  id|input_id|        modified_at|user_id|
+--------------------+-------------------+--------------------+--------+-------------------+-------+
|{concepts -> [{na...|10/23/% 08:27:   %5|da7eb3483a654a42a...|      c5|10/23/% 08:27:   %5|mansi_k|
|{concepts -> [{na...|10/23/% 08:27:   %5|f65d75b3446b40ffa...|     c11|10/23/% 08:27:   %5|mansi_k|
|{concepts -> [{na...|10/23/% 08:27:   %5|8fd99dc05eef46849...|      c1|10/23/% 08:27:   %5|mansi_k|
|{concepts -> [{na...|10/23/% 08:27:   %5|9426763104ea42fbb...|      c7|10/23/% 08:27:   %5|mansi_k|
|{concepts -> [{na...|10/23/% 08:27:   %5|d93bb8387f6743b8a...|      c8|10/23/% 08:27:   %5|mansi_k|
|{concepts -> [{na...|10/23/% 08:27:   %5|aed44152eaad4fbcb...|      c6|10/23/% 08:27:   %5|mansi_k|
|{concepts -> [{na...|10/23/% 08:27:   %5|b3847d13486f4f118...|      c2|10/23/% 08:27:   %5

### Loop through delta table rows & columns

In [None]:
# Print every column of the first four rows, one field per line.
# take(4) fetches just four rows, whereas collect()[:4] would first pull the
# entire table to the driver.
for row in df_delta.take(4):
    print(row['id'])
    print(row['input_id'])
    print(row['user_id'])
    print(row['created_at'])
    # Previously 'input_id' was printed a second time here and 'modified_at'
    # was never shown.
    print(row['modified_at'])
    print(row['annotation'])
    print("########################\n")

da7eb3483a654a42ace1ebac76a69fdc
c5
mansi_k
10/23/% 08:27:   %5
c5
{'concepts': [{'name': 'cat', 'id': 'id-cat', 'value': '1.0', 'appId': 'databricks_tester_img'}]}
########################

f65d75b3446b40ffa012735a31dcbb62
c11
mansi_k
10/23/% 08:27:   %5
c11
{'concepts': [{'name': 'cat', 'id': 'id-cat', 'value': '1.0', 'appId': 'databricks_tester_img'}]}
########################

8fd99dc05eef468492623153531700c9
c1
mansi_k
10/23/% 08:27:   %5
c1
{'concepts': [{'name': 'cat', 'id': 'id-cat', 'value': '1.0', 'appId': 'databricks_tester_img'}]}
########################

9426763104ea42fbb44ff73677fd3806
c7
mansi_k
10/23/% 08:27:   %5
c7
{'concepts': [{'name': 'cat', 'id': 'id-cat', 'value': '1.0', 'appId': 'databricks_tester_img'}]}
########################



### Testing export_images_to_volume()

In [None]:
# Export only the first five of the inputs listed earlier to the volume path.
dataset_obj.export_images_to_volume(
    path="/Volumes/mansi_test/default/test_vol1",
    input_response=my_inputs[:5],
)

### Load image CSV file & upload image data into app using dataframe

In [None]:
# Obtain (or reuse) the Spark session registered under the app name
# "Clarifai-spark".
spark = (
    SparkSession.builder
    .appName("Clarifai-spark")
    .getOrCreate()
)

# Read the image CSV with the header option enabled, then preview it.
df = (
    spark.read
    .option("header", True)
    .csv("/Volumes/mansi_test/default/test_vol1/img_data2.csv")
)
df.show()

+-------+--------------------+--------+
|inputid|               input|concepts|
+-------+--------------------+--------+
|  img11|https://img.freep...|     dog|
|  img21|https://images.un...|    duck|
|  img31|https://mymodernm...|    duck|
+-------+--------------------+--------+



In [None]:
# Upload the rows of `df` as image inputs.
# NOTE(review): df_type='url' presumably means the 'input' column holds image
# URLs rather than raw bytes -- confirm against the SDK.
dataset_obj.upload_dataset_from_dataframe(
    dataframe=df,
    input_type='image',
    df_type='url',
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  4.48it/s]


### Using a custom dataloader to upload images from Volume

In [None]:
# Upload the "train" split for a visual-classification task.
# NOTE(review): module_dir presumably points at a folder containing the custom
# dataloader module for food-101 -- confirm against the SDK.
dataset_obj.upload_dataset_from_dataloader(
    task="visual-classification",
    split="train",
    module_dir="/Volumes/mansi_test/default/test_vol1/food-101",
)

Uploading Dataset:   0%|          | 0/1 [00:00<?, ?it/s]Uploading Dataset: 100%|██████████| 1/1 [00:02<00:00,  2.61s/it]Uploading Dataset: 100%|██████████| 1/1 [00:02<00:00,  2.61s/it]


### Creating ClarifaiPySpark object & creating/fetching text dataset from app

In [None]:
# Rebind the client handle to the text app; cells from here on operate on
# text data.
cspark_obj = ClarifaiPySpark(
    user_id='mansi_k',
    app_id='databricks_tester_txt',
)

# Handle for the dataset with id 'dataset1' inside the text app.
dataset_obj = cspark_obj.dataset(dataset_id='dataset1')

Creating a new dataset


### Fetching current inputs from text dataset

In [None]:
# list_inputs() returns a one-shot generator. Materialise it so the result can
# be inspected here and still be passed to the export cell, including when that
# cell is re-run; an exhausted generator would silently yield nothing.
my_inputs = list(dataset_obj.list_inputs())

# Show how many inputs were fetched instead of echoing an opaque generator.
len(my_inputs)

<generator object Inputs.list_inputs at 0x7f92a88c6570>

### Testing export_text_to_volume()

**TODO (open questions):**
- Should the exported files be placed in folders named after their concept?
- Should the text be saved as CSV instead?

In [None]:
# Export the text inputs fetched above to the volume path.
dataset_obj.export_text_to_volume(
    path="/Volumes/mansi_test/default/test_vol1",
    input_response=my_inputs,
)

### Upload text from volume csv to Clarifai dataset

In [None]:
# Upload the rows of the emotions CSV as text inputs.
# NOTE(review): csv_type='raw' presumably means the CSV holds the text itself
# rather than URLs or file paths -- confirm against the SDK.
dataset_obj.upload_dataset_from_csv(
    csv_path='/Volumes/mansi_test/default/test_vol1/emotions_data1.csv',
    input_type='text',
    labels=True,
    csv_type='raw',
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]Uploading inputs: 100%|██████████| 1/1 [00:20<00:00, 20.56s/it]Uploading inputs: 100%|██████████| 1/1 [00:20<00:00, 20.56s/it]


### Load text CSV file & upload text data into app using dataframe

In [None]:
# Obtain (or reuse) the Spark session registered under the app name
# "Clarifai-spark".
spark = (
    SparkSession.builder
    .appName("Clarifai-spark")
    .getOrCreate()
)

# Read the emotions CSV with the header option enabled, then preview it.
df = (
    spark.read
    .option("header", True)
    .csv("/Volumes/mansi_test/default/test_vol1/emotions_data1.csv")
)
df.show()

+-------+--------------------+--------+
|inputid|               input|concepts|
+-------+--------------------+--------+
|    t11|   I am feeling good|   happy|
|    t21|This is a guava tree| neutral|
|    t31|This is a saddeni...|     sad|
+-------+--------------------+--------+



In [None]:
# Upload the rows of `df` as text inputs.
# NOTE(review): df_type='raw' presumably means the 'input' column holds the
# text itself -- confirm against the SDK.
dataset_obj.upload_dataset_from_dataframe(
    dataframe=df,
    input_type='text',
    df_type='raw',
    labels=True,
)

Uploading inputs:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  5.48it/s]Uploading inputs: 100%|██████████| 1/1 [00:00<00:00,  5.45it/s]
