### Create Gemini Batch files

#### Schemas

In [None]:
# Response schemas passed to Gemini as `response_schema`.
#
# Every schema below is assembled from the small factory helpers that follow.
# Each helper returns a *fresh* dict on every call, so no two schemas share
# nested objects and mutating one schema can never leak into another.


def _string(description):
    """Return a string-typed field schema carrying `description`."""
    return {"type": "string", "description": description}


def _object(properties, description=None, required=None):
    """Return an object schema.

    Args:
        properties: Mapping of property name -> sub-schema.
        description: Optional human-readable description; the key is omitted
            entirely when this is None.
        required: Optional explicit list of required property names. When
            omitted, every property is required, in declaration order.
    """
    schema = {
        "type": "object",
        "properties": properties,
        "required": list(properties) if required is None else list(required),
    }
    if description is not None:
        schema["description"] = description
    return schema


def _array(items, description):
    """Return an array schema whose elements follow `items`."""
    return {"type": "array", "items": items, "description": description}


def _entities(attributes_description, description):
    """Return the `entities` array: named entities with attribute/value pairs."""
    attribute_pair = _object({
        "attribute": _string("The attribute key."),
        "value": _string("The value of the attribute."),
    })
    entity = _object({
        "entity": _string("The name of the entity."),
        "attributes": _array(attribute_pair, attributes_description),
    })
    return _array(entity, description)


def _relationships():
    """Return the `relationships` array: relationship type plus its reasoning."""
    relationship = _object({
        "relationship": _string("The type of relationship."),
        "reasoning": _string("Reasoning behind the relationship."),
    })
    return _array(relationship, "A list of relationships.")


def _temporal_fields(template):
    """Return the six date/time fields shared by every temporal schema.

    `template` is formatted with `unit` for century/decade/year/month/day;
    `time_of_day` uses the same fixed description in every schema.
    """
    fields = {
        unit: _string(template.format(unit=unit))
        for unit in ("century", "decade", "year", "month", "day")
    }
    fields["time_of_day"] = _string("The time of day in the temporal information.")
    return fields


def _specialist(role):
    """Return the prompt/reasoning object for one specialist role."""
    return _object(
        {
            "prompt": _string(f"The prompt for the {role} specialist."),
            "reasoning": _string(f"Reasoning for the {role} specialist prompt."),
        },
        description=f"Information about the {role} specialist prompt.",
    )


# Full record: scene graph plus geospatial, temporal and event information.
final_output_schema = _object({
    "entities": _entities("A list of attributes for the entity.", "A list of entities."),
    "relationships": _relationships(),
    "geospatial_information": _object({
        "country": _string("The country."),
        "state_or_province": _string("The state or province."),
        "city": _string("The city."),
        "reasoning": _string("Reasoning for the geospatial information."),
    }),
    "temporal_information": _object({
        **_temporal_fields("The {unit}."),
        "reasoning": _string("Reasoning for the temporal information."),
    }),
    "event_information": _object({
        "event": _object({
            "value": _string("The value of the event."),
            "reasoning": _string("Reasoning for the event information."),
        }),
        "background": _object({
            "value": _string("The background value of the event."),
            "reasoning": _string("Reasoning for the background information."),
        }),
    }),
})

# Scene graph only: entities and relationships.
scene_graph_schema = _object({
    "entities": _entities("A list of attributes for the entity.", "A list of entities."),
    "relationships": _relationships(),
})

abstract_schema = _object({
    "abstract_idea": _object(
        {
            "idea": _string("The value of the idea."),
            "reasoning": _string("Reasoning behind the idea."),
        },
        description="The abstract idea represented by an idea and its reasoning.",
    ),
})

# Gold metadata: no reasoning fields.
# Fix: the `day` field previously used the misspelled key "descriptio,n".
gold_meta_schema = _object({
    "entities": _entities(
        "A list of attributes associated with the entity.",
        "A list of entities, each with a name and a set of attributes.",
    ),
    "temporal_details": _object(
        _temporal_fields("The {unit} of the temporal details."),
        description="Details about the temporal aspects, including century, decade, year, month, and day.",
    ),
    "spatial_details": _object(
        {
            "country": _string("The country of the location."),
            "state_or_province": _string("The state or province of the location."),
            "city": _string("The city of the location."),
        },
        description="Details about the spatial aspects, including country, state/province, and city.",
    ),
})

prompt_schema = _object({
    "global_event_specialist": _specialist("global event"),
    "temporal_specialist": _specialist("temporal"),
    "spatial_specialist": _specialist("spatial"),
})

global_event_schema = _object(
    {
        "event": _object(
            {
                "value": _string("The value of the event."),
                "reasoning": _string("The reasoning behind the event."),
            },
            description="Details about the event.",
        ),
        "background": _object(
            {
                "value": _string("The background value of the event."),
                "reasoning": _string("The reasoning for the background."),
            },
            description="Details about the background of the event.",
        ),
    },
    description="Schema representing a global event and its background reasoning.",
)

temporal_schema = _object(
    {
        **_temporal_fields("The {unit} of the temporal information."),
        "reasoning": _string("The reasoning behind the temporal information."),
    },
    description="Schema representing temporal information, including reasoning.",
    # Kept in the original order, which lists "reasoning" before "time_of_day".
    required=["century", "decade", "year", "month", "day", "reasoning", "time_of_day"],
)

# Fix: the `label` field previously used the misspelled key "decription".
label_schema = _object(
    {"label": _string("The label of a string.")},
    description="Schema representing label information.",
)



#### Create messages

In [3]:
import json


def create_messages(img, obj=None, prompt=None):
    """Build one Gemini batch-prediction request for hate-level classification.

    The request carries a single user turn with two text parts: the labelling
    guidelines, then the comment to classify together with the expected JSON
    answer format. Output is constrained to JSON following `label_schema`
    (a module-level name defined in the schemas cell).

    Args:
        img: Text interpolated into the prompt as the comment to classify.
            (The parameter name is historical; the template treats the value
            as the comment itself.)
        obj: Optional JSON-serialisable object. When truthy it is appended as
            an extra part prefixed with "Scene Graph: ".
        prompt: Optional string. When truthy it is appended as an extra part
            prefixed with "Prompt: ".

    Returns:
        A dict of the form {"request": {"contents": [...], "generationConfig": {...}}}.
    """
    guidelines = '''The annotation category titled **‘label’** is the primary category in our dataset. It represents the level of hostility (hate) expressed by an individual towards the LGBTQ+ community.

This category is divided into three classes, described below:

1. **Extremely Hateful**: These comments exhibit overt and unwarranted hate, often using derogatory language. They tend to challenge the very identity of the LGBTQ+ community without offering logical arguments, and may also include harsh language—even if it appears to support the community.

2. **Mild Hate**: These comments may indirectly reference LGBTQ+ individuals, either in support or opposition. They might acknowledge the community but still express discomfort with its fundamental identity.

3. **No Hate**: These comments are either supportive of the LGBTQ+ community or are neutral, with no discernible intent of hostility.

You must format your output in the required JSON format. If a label cannot be reasonably determined, return `"label": "NA"`.'''

    # The literal braces of the JSON example are doubled. Unescaped,
    # `{"label": ""}` is parsed as a replacement field with the format spec
    # ' ""' and raises ValueError("Invalid format specifier") on every call.
    task = f'''Below is a Hindi-language comment written in the Latin (English) script, made in response to an LGBTQ+ awareness movie: {img}

Your task is to classify the comment using one of the following labels:
- Extremely Hateful
- Mild Hate
- No Hate

Please return your answer in the following JSON format:
{{"label": ""}}'''

    parts = [{"text": guidelines}, {"text": task}]

    # Optional context is appended after the two fixed parts, in this order.
    if obj:
        parts.append({
            "text": f"Scene Graph: {json.dumps(obj, ensure_ascii=False)}",
        })
    if prompt:
        parts.append({
            "text": f"Prompt: {prompt}",
        })

    return {
        "request": {
            "contents": [
                {
                    "role": "user",
                    "parts": parts,
                }
            ],
            "generationConfig": {
                "response_mime_type": "application/json",
                "response_schema": label_schema,
                "max_output_tokens": 50,
            },
        }
    }

In [31]:
import json
import unicodedata


def _read_jsonl(path):
    """Return the parsed JSON records of a JSON Lines file, one per line."""
    with open(path, "r") as f:
        return [json.loads(line) for line in f]


images = [str(i) + ".jpg" for i in range(6296)]
# with open("../all_images.txt", "r") as f:
#     images = [unicodedata.normalize("NFD",line.strip()) for line in f]

# images = [json.load(open(f"../assets/TARA json/{img.strip().replace('.jpg', '.json')}"))["image_url"] for img in images]

# Per-stage predictions. The merge below pairs records purely by position, so
# the files are presumably all in the same order -- verify before relying on it.
objs = _read_jsonl("outputs/wikitilo/ordered/scene_graph_output.jsonl")
# NOTE(review): `abs` shadows the builtin for the rest of the kernel session;
# the name is kept because other cells may already refer to it.
abs = _read_jsonl("outputs/wikitilo/ordered/abstract_output.jsonl")
temps = _read_jsonl("outputs/wikitilo/ordered/temporal_1_prediction_ordered.jsonl")
geos = _read_jsonl("outputs/wikitilo/ordered/geospatial_1_prediction_ordered.jsonl")
globevs = _read_jsonl("outputs/wikitilo/ordered/global_event_1_prediction_ordered.jsonl")

# Fold the abstract, temporal and geospatial predictions into each scene-graph
# record in place. zip() stops at the shortest input, so a short file silently
# drops the trailing records. Failures are reported and skipped (best effort).
for ab, obj, geo, temp, glob in zip(abs, objs, geos, temps, globevs):
    for k, v in ab["response"].items():
        try:
            obj["response"][k] = v
        except Exception as e:
            print("Abstract", ab, obj, e)
    try:
        obj["response"]["temporal_information"] = temp["response"]
    except Exception as e:
        print("Temporal", temp, obj, e)
    try:
        # Fix: this used to assign temp["response"], so the geospatial
        # prediction was never used and the temporal one was duplicated.
        obj["response"]["geospatial_information"] = geo["response"]
    except Exception as e:
        print("Geospatial", geo, obj, e)
    # for k, v in glob["response"].items():
    #     try:
    #         obj["response"][k] = v
    #     except Exception as e:
    #         print("Global Event",geo, obj ,e)

prs = _read_jsonl("outputs/wikitilo/ordered/prompt_prediction_ordered.jsonl")

# One request per (image, merged record, specialist prompt) triple, written as
# JSON Lines for the batch job.
with open("messages/global_event_2_input.jsonl", "w") as f:
    for img, obj, pr in zip(images, objs, prs):
        message = create_messages(
            img=img,
            obj=obj["response"],
            prompt=pr["response"]["global_event_specialist"]["prompt"],
        )
        f.write(json.dumps(message, ensure_ascii=False) + "\n")

### Run Batch

In [None]:
import vertexai
from vertexai.batch_prediction import BatchPredictionJob
import time

# Initialize Vertex AI
# TODO: fill in the project id and location before running this cell.
vertexai.init(project="", location="")
names = ["edis"]
dataset = "wikitilo"

# Handles of every submitted job. Fix: this list used to be re-created inside
# the loop, so only the last job survived when `names` had several entries.
files = []
for name in names:
    batch_input_bucket = "wikitilo"
    batch_output_bucket = "wikitilo"
    # Fix: the input file is derived from `name`. It used to be hard-coded to
    # "edis_errors.jsonl", so every iteration would resubmit the same file.
    # For names == ["edis"] the URI is unchanged.
    input_uri = "gs://" + batch_input_bucket + f"/{name}_errors.jsonl"
    output_uri = "gs://" + batch_output_bucket

    # print(input_uri, output_uri)

    # Submit a batch prediction job with Gemini model
    files.append(BatchPredictionJob.submit(
        source_model="gemini-1.5-pro-002",
        input_dataset=input_uri,
        output_uri_prefix=output_uri,
    ))

    # Optional blocking variant: keep the handle and poll until it finishes.
    # batch_prediction_job = files[-1]

    # Check job status
    # print(f"Job resource name: {batch_prediction_job.resource_name}")
    # print(f"Model resource name with the job: {batch_prediction_job.model_name}")
    # print(f"Job state: {batch_prediction_job.state.name}")

    # # Refresh the job until complete
    # while not batch_prediction_job.has_ended:
    #     time.sleep(5)
    #     batch_prediction_job.refresh()

    # # Check if the job succeeds
    # if batch_prediction_job.has_succeeded:
    #     print("Job succeeded!")
    # else:
    #     print(f"Job failed: {batch_prediction_job.error}")

    # # Check the location of the output
    # print(f"Job output location: {batch_prediction_job.output_location}")

# Example response:
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345

BatchPredictionJob created. Resource name: projects/568815670585/locations/us-east1/batchPredictionJobs/8607722078797299712
To use this BatchPredictionJob in another session:
job = batch_prediction.BatchPredictionJob('projects/568815670585/locations/us-east1/batchPredictionJobs/8607722078797299712')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-east1/batch-predictions/8607722078797299712?project=568815670585


In [None]:
# Rebuild scene_graph_output.jsonl from the re-run "errors" predictions,
# keeping only the scene-graph part of each response.
# Fix: the input is opened read-only ("r"); it was "r+" although never written.
with open("outputs/wikitilo/ordered/scene_graph_errors_prediction_ordered.jsonl", "r") as f:
    objs = [json.loads(line) for line in f]

for obj in objs:
    # Fix: pop with a default so a record that lacks one of these fields no
    # longer aborts the whole cell with KeyError.
    for key in ("geospatial_information", "temporal_information", "event_information"):
        obj["response"].pop(key, None)

with open("outputs/wikitilo/ordered/scene_graph_output.jsonl", "w") as f:
    for obj in objs:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")