In [1]:
import gzip
import json
import jsonlines
import os
from tqdm.notebook import tqdm
import time
import pandas as pd

import google.generativeai as genai
# Take the API key from the GOOGLE_API_KEY environment variable so that no secret has to be
# typed into (and saved with) the notebook; falls back to an empty string when it is unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

In [2]:
# Print the name of every available model that supports the generateContent method.
for m in genai.list_models():
    if 'generateContent' not in m.supported_generation_methods:
        continue
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [3]:
# Shared model handle; call_gemini_api sends every prompt through it.
model = genai.GenerativeModel(model_name='models/gemini-1.0-pro')

In [4]:
def call_gemini_api(input_text):
    """Send a prompt to the module-level ``model`` and return the reply text.

    The prompt is stripped of surrounding whitespace and sent with the four
    safety categories listed below set to ``block_none``.

    Args:
        input_text: Prompt string to send.

    Returns:
        ``response.text``, or the sentinel string "UNSAFE" when reading
        ``response.text`` raises.
    """
    relaxed_safety = {
        'HARASSMENT': 'block_none',
        'HATE_SPEECH': 'block_none',
        'HARM_CATEGORY_DANGEROUS_CONTENT': 'block_none',
        'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'block_none',
    }
    prompt = input_text.strip()

    # The request sits outside the try block, so errors raised by the call
    # itself propagate to the caller.
    response = model.generate_content(prompt, safety_settings=relaxed_safety)

    try:
        reply = response.text
    except Exception:
        return "UNSAFE"
    return reply

In [5]:
# instruction = """Below are the latest reviews of a product. Please examine it and compile a JSON object with "pros" and "cons," each section having a few succinct points, and a "verdict". You must only output the json with the format: {"pros": [...], "cons": [...], "verdict": ...}.\n\nReview(s):\n"""

In [6]:
# Prompt prefix placed in front of every batch of reviews; the bulleted reviews are appended after it.
instruction = (
    'Here are the most recent reviews of a product. '
    'Analyze these and summarize in your own words to create a json containing pros, cons '
    '(each containing less than or equal to 5 concise points) '
    'and final verdict (overall review of this) in the form '
    '{"pros": [...], "cons": [...], "verdict": ...}'
    '\n\nReviews:\n'
)

In [7]:
def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Path to a gzip file holding one JSON document per line.

    Yields:
        The result of ``json.loads`` for each line, in file order.

    Raises:
        Whatever ``gzip.open`` or ``json.loads`` raise for an unreadable
        file or a malformed line.
    """
    # Opening inside a with-block releases the file handle once iteration
    # finishes or the generator is closed early.
    with gzip.open(path, "rb") as g:
        for l in g:
            yield json.loads(l)

In [8]:
# Number of reviews packed into a single prompt.
batch_size = 10

# Directory listed for input review files (each is read with parse()).
reviews_path = "data/raw_compressed/reviews_5core"

# Directory where one .jsonl output file per input file is written.
labelled_path = "data/labelled/reviews/train"

In [9]:
# Label every review file: group its reviews by product ("asin"), send them to
# call_gemini_api in batches of `batch_size`, and append each response that
# parses to JSON with "pros", "cons" and "verdict" keys to
# <labelled_path>/<file name up to the first dot>.jsonl.

# How many times a single batch is attempted when the API reports a deadline error.
max_attempts = 3

# Make sure the output directory exists before opening files inside it.
os.makedirs(labelled_path, exist_ok=True)

for review_file in os.listdir(reviews_path):
    with jsonlines.open(os.path.join(labelled_path, f'{review_file.split(".")[0]}.jsonl'), mode="a") as out_file:
        data = {"asin": [], "reviewText": []}
        # Keep only records that carry both a product id and non-empty review text.
        for record in parse(os.path.join(reviews_path, review_file)):
            if record.get('asin', None) and record.get('reviewText', None):
                data['asin'].append(record['asin'])
                data['reviewText'].append(record['reviewText'])

        grouped_df = pd.DataFrame(data).groupby("asin")

        for asin, reviews_df in grouped_df:
            reviews = reviews_df['reviewText'].values.tolist()
            for i in range(0, len(reviews), batch_size):
                # One bullet per review, then the prompt suffix that asks for JSON.
                reviews_ = "- "+"\n- ".join(reviews[i:i+batch_size])
                gemini_inp = instruction+reviews_+"\n\nOutput JSON:"

                for attempt in range(max_attempts):
                    try:
                        generated_text = call_gemini_api(gemini_inp)
                        if generated_text!="UNSAFE":
                            # Remove an optional Markdown code fence around the JSON payload.
                            out_json = json.loads(generated_text.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip())
                            if "pros" in out_json and "cons" in out_json and "verdict" in out_json:
                                out_file.write({"review_info": reviews_, "pros": out_json["pros"], "cons": out_json["cons"], "verdict": out_json["verdict"]})
                    except Exception as e:
                        if "deadline" not in str(e).lower():
                            # Unparseable output or any other failure: best effort, skip this batch.
                            break
                        # Deadline error: wait, then try the same batch again
                        # instead of silently dropping it.
                        print("resetting the limit")
                        time.sleep(60)
                    else:
                        # Batch handled (written, rejected as UNSAFE, or missing keys).
                        break
                
    

resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
resetting the limit
