In [1]:
import gzip
import json
import jsonlines
import os
from tqdm.notebook import tqdm
import time

import google.generativeai as genai
# Read the API key from the environment instead of committing it to the notebook.
# Falls back to the original empty string when GOOGLE_API_KEY is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

In [2]:
# Print the name of every available model that supports `generateContent`.
for m in genai.list_models():
    if 'generateContent' not in m.supported_generation_methods:
        continue
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [3]:
# Shared model handle used by call_gemini_api below.
model = genai.GenerativeModel(
    'models/gemini-1.0-pro',
)

In [4]:
def call_gemini_api(input_text):
    """Send a prompt to the notebook-level ``model`` and return the reply text.

    ``input_text`` is stripped of surrounding whitespace before being sent, and
    each safety category listed below is set to ``block_none``.

    Returns:
        The value of ``response.text``, or the sentinel string ``"UNSAFE"`` if
        reading that attribute raises (presumably because the prompt or reply
        was blocked -- check ``response.prompt_feedback`` when debugging).

    Note:
        Exceptions raised by ``generate_content`` itself are not caught here
        and propagate to the caller.
    """
    relaxed_safety = {
        'HARASSMENT': 'block_none',
        'HATE_SPEECH': 'block_none',
        'HARM_CATEGORY_DANGEROUS_CONTENT': 'block_none',
        'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'block_none',
    }
    response = model.generate_content(input_text.strip(), safety_settings=relaxed_safety)

    try:
        reply = response.text
    except Exception:
        # Any failure to read the text is reported through the sentinel value.
        return "UNSAFE"
    return reply

In [5]:
# Prompt prefix placed in front of every product description sent to the model.
instruction = (
    'Give a concise summary for the below description of the product '
    'in the form {"summary": ...}.\n\n'
    'Product Info:\n'
)

In [6]:
def parse(path):
    """Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Path to a gzip file containing one JSON document per line.

    Yields:
        The object produced by ``json.loads`` for each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened or is not valid gzip data.
    """
    # The context manager guarantees the handle is closed once the generator is
    # exhausted or discarded; previously the file was never closed.
    with gzip.open(path, "rb") as g:
        for l in g:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty document.
            if not l.strip():
                continue
            yield json.loads(l)

In [7]:
def _non_blank_strings(items):
    """Return the string entries of ``items`` that contain non-whitespace text."""
    return [item for item in items if isinstance(item, str) and item.strip()]


def get_info_from_sample(sample:dict):
    """Build the product-info text for one metadata record.

    Args:
        sample: A record that may carry ``"description"`` and/or ``"features"``
            entries, each either a string or a list of strings.

    Returns:
        * ``"Description:\\n<description>\\nFeatures:<features>"`` when both are present,
        * the description alone or the features alone when only one is present,
        * ``None`` when neither carries any text.

        A list description is joined with single spaces; a list of features is
        rendered as ``"\\n- "``-prefixed bullet lines.

    Note:
        Blank or non-string list entries are ignored, so an empty (or all-blank)
        list counts as missing. Previously an empty ``features`` list produced
        the truthy string ``"\\n- "`` and was returned as if it were content.
    """
    description = sample.get("description", None)
    if isinstance(description, list):
        # Joins to "" (falsy) when nothing usable remains.
        description = " ".join(_non_blank_strings(description))

    features = sample.get("features", None)
    if isinstance(features, list):
        bullets = _non_blank_strings(features)
        if bullets:
            features = "\n- " + "\n- ".join(bullets)
        else:
            features = None

    if description and features:
        return f"""Description:\n{description}\nFeatures:{features}"""
    elif description:
        return description
    elif features:
        return features
    else:
        return None

In [8]:
# Example product description used below to preview the prompt and smoke-test the API call.
sample = "Product Description\nThe best just got better. With proven 5th generation technology and form, the Motorola H700 Bluetooth Headset enhances the popular folding microphone design with superior audio performance, echo cancellation technology and an approximate 30% reduction in size. Discriminating consumers will welcome the Motorola H700 because it eliminates wires while delivering crisp, clear communications. With a 10-meter (30 feet) connectivity range, new levels of comfort and simple one-touch access to your most desired features, wireless has never looked so good.Designed for enhanced comfort while delivering a 30% reduction in size and weight from its predecessor, the super compact Motorola H700 delivers unbeatable comfort with an ergonomic ear hook that can be worn on either ear. With unsurpassed call quality and Motorola's exclusive PowerFlip design that lets you answer and end calls with ease, this headset optimizes on-the-go conversations - with style and simplicity.The Motorola H700 is the premium choice for those who are serious about wireless connectivity, offering up to six hours of talk time, 130 of standby time and charge time of less than one hour. Designed with your calls in mind, the headset features Bluetooth 1.2 wireless technology for better call quality, faster connections and less interference.Feature rich doesn't mean complicated. A blue light lets others know when you're on a call. The light changes colors to let you know when the headset is in pairing mode, charging or running low on battery power. The ability to connect with compatible devices up to 10 meters (30 feet) away helps you chat, sync and send - even when your mobile isn't in sight. Fashionable and functional, the Motorola H700 is the essential partner for serious mobile consumers."

In [9]:
# NOTE(review): presumably a summary of `sample` produced by a Mixtral model, kept for
# manual comparison with the Gemini output -- it is not referenced by the visible code.
mixtral_out = "The Motorola H700 Bluetooth Headset is a compact and comfortable wireless headset with superior audio performance, echo cancellation technology, and a 30% reduction in size. It features up to six hours of talk time, 130 of standby time, and charge time of less than one hour. The headset is designed with your calls in mind, featuring Bluetooth 1.2 wireless technology for better call quality, faster connections, and less interference. The Motorola H700 is a fashionable and functional essential partner for serious mobile consumers."

In [10]:
# Preview the exact prompt that will be sent to the model for the example product.
print(f"{instruction}{sample}")

Give a concise summary for the below description of the product in the form {"summary": ...}.

Product Info:
Product Description
The best just got better. With proven 5th generation technology and form, the Motorola H700 Bluetooth Headset enhances the popular folding microphone design with superior audio performance, echo cancellation technology and an approximate 30% reduction in size. Discriminating consumers will welcome the Motorola H700 because it eliminates wires while delivering crisp, clear communications. With a 10-meter (30 feet) connectivity range, new levels of comfort and simple one-touch access to your most desired features, wireless has never looked so good.Designed for enhanced comfort while delivering a 30% reduction in size and weight from its predecessor, the super compact Motorola H700 delivers unbeatable comfort with an ergonomic ear hook that can be worn on either ear. With unsurpassed call quality and Motorola's exclusive PowerFlip design that lets you answer and

In [11]:
# Smoke-test the API wrapper on the example product and show the raw reply.
print(call_gemini_api(f"{instruction}{sample}"))

{"summary": "The Motorola H700 Bluetooth Headset offers enhanced comfort and performance with a 30% size reduction. It provides crisp audio, echo cancellation, and a 10-meter connectivity range. Featuring a one-touch answer button, ergonomic ear hook, and PowerFlip design, it optimizes on-the-go conversations. With up to six hours of talk time, 130 hours of standby time, and Bluetooth 1.2 technology, it ensures reliable and interference-free connections."}


In [12]:
def get_labelled_data(input_file_path, out_dir, skiplines=0, filename=None):
    """Label the products of one gzip metadata file with model-generated summaries.

    Every record from ``parse(input_file_path)`` is turned into product-info
    text; texts longer than 50 whitespace-separated words are sent to the model
    and each successfully parsed ``{"summary": ...}`` reply is appended to
    ``<out_dir>/<input file stem>.jsonl`` as
    ``{"product_info": ..., "summary": ...}``.

    Args:
        input_file_path: Path to the gzip-compressed, one-JSON-per-line input.
        out_dir: Directory for the output ``.jsonl`` file (created if missing).
        skiplines: When resuming, records whose 1-based position is below this
            value are skipped.
        filename: Skipping is only applied when this equals the base name of
            ``input_file_path``.

    Returns:
        The number of records actually written during this call.

    Note:
        Labelling is best-effort: a reply that is "UNSAFE", is not valid JSON or
        lacks a "summary" key is dropped. When an error message mentions
        "deadline" the loop sleeps for 60 seconds before moving on to the next
        record; the failed record is not retried.
    """
    inp_file_name = os.path.basename(input_file_path).strip()
    out_path = os.path.join(out_dir, f'{inp_file_name.split(".")[0]}.jsonl')
    if out_dir:
        # Avoid failing on a fresh checkout where the output folder is absent.
        os.makedirs(out_dir, exist_ok=True)

    # Loop-invariant: resume logic only applies to the file named by `filename`.
    resuming = bool(filename) and filename == inp_file_name

    pred_ct = 0
    with jsonlines.open(out_path, mode="a") as out_file:
        print(f"Processing file - {input_file_path}")
        for ct, sample in enumerate(parse(input_file_path), start=1):
            if resuming and ct < skiplines:
                continue
            if ct % 1000 == 0:
                print(f"\nProcessed {ct} products so far. Additional/Total labelled - {pred_ct} samples...")

            info = get_info_from_sample(sample)
            # Only descriptions with more than 50 words are worth summarising.
            if not info or len(info.split()) <= 50:
                continue

            try:
                generated_text = call_gemini_api(instruction + info)
                if generated_text == "UNSAFE":
                    continue
                summary = json.loads(generated_text.strip())["summary"]
                out_file.write({"product_info": info, "summary": summary})
                # Count only records that were really written; previously
                # "UNSAFE" replies were counted as labelled too.
                pred_ct += 1
            except Exception as e:
                # Deliberately best-effort: malformed replies are dropped, and
                # only deadline errors trigger a pause before continuing.
                if "deadline" in str(e).lower():
                    print("resetting the limit")
                    time.sleep(60)
    return pred_ct

In [None]:
# Resume labelling the Electronics metadata: because `filename` matches the input
# file, records before position 154000 are skipped.
get_labelled_data(
    "data/raw_compressed/metadata/train/meta_Electronics.json.gz",
    "data/labelled/metadata/train",
    skiplines=154000,
    filename="meta_Electronics.json.gz",
)

In [None]:
#SBATCH --job-name=create_labels_nlp_project
#SBATCH --partition=gpu
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=16G
#SBATCH --gres=gpu:v100-sxm2:1
#SBATCH --time=6:00:00
#SBATCH -o %J.log
#SBATCH -e %J.log