### Working Environment

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the Drive root and list what is stored there.
%cd /content/drive/MyDrive/
!ls

/content/drive/MyDrive
 amazon_alexa.tsv  'Colab Notebooks'


### Import Dataset

In [None]:
import pandas as pd

# Read the tab-separated review file from the current directory
data = pd.read_csv(
    'amazon_alexa.tsv',
    sep='\t',
)

# Preview the first ten rows
data.head(n=10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [None]:
# Keep the review text and the feedback flag, renaming them to 'review' and
# 'label' through an explicit old-name -> new-name mapping.
column_names = {'verified_reviews': 'review', 'feedback': 'label'}
mydata = data[list(column_names)].rename(columns=column_names)

mydata.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [None]:
# Number of rows per value of 'label'.
mydata.value_counts('label')

label
1    2893
0     257
Name: count, dtype: int64

In [None]:
# Count the occurrences of each label
label_counts = mydata["label"].value_counts()

# Identify the majority class from the counts instead of assuming it is label 1
majority_label = label_counts.idxmax()

# Get the number of rows to drop from the majority class
rows_to_drop = label_counts.max() - label_counts.min()

# Drop rows from the majority class randomly. The fixed seed makes the
# balanced subset identical on every Restart & Run All.
if rows_to_drop > 0:
   data_majority = mydata[mydata["label"] == majority_label]
   data_balanced = mydata.drop(data_majority.sample(rows_to_drop, random_state=42).index)
else:
   data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced["label"].value_counts())

label
1    257
0    257
Name: count, dtype: int64


## Data Preprocessing

In [None]:
import pandas as pd
import re

def clean_text(text):
  """Normalize a raw review into lowercase text without punctuation.

  Steps, in order: remove HTML tags, replace punctuation with spaces, drop
  stand-alone single ASCII letters, lowercase, collapse whitespace and trim.

  Args:
    text: The raw review. Any value that is not a ``str`` (for example NaN
      or None) is treated as missing.

  Returns:
    The cleaned string, or an empty string for non-string input.
  """
  if not isinstance(text, str):
    return ""  # Return an empty string for non-string values

  # Remove HTML tags. This has to run before punctuation removal: once the
  # angle brackets are stripped the tag can no longer be recognised and its
  # name would leak into the text as a word. The pattern requires a letter
  # right after '<' so comparisons such as "x < 5 and y > 3" are left alone.
  text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", text)

  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters (also drops the "s" left over from "it's")
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Lowercase the text
  text = text.lower()

  # Collapse runs of whitespace, then trim leading and trailing spaces
  text = re.sub(r"\s+", " ", text)
  return text.strip()

# Pull the raw reviews out of the frame as a plain Python list
reviews = data_balanced['review'].tolist()

# Run each review through clean_text, preserving order
cleaned_reviews = list(map(clean_text, reviews))

# Store the cleaned text next to the original in a new column
data_balanced['clean_reviews'] = cleaned_reviews

In [None]:
# Display the balanced frame, now including the 'clean_reviews' column.
data_balanced

Unnamed: 0,review,label,clean_reviews
40,My husband likes being able to use it to liste...,1,my husband likes being able to use it to liste...
42,So far so good,1,so far so good
46,"It's like Siri, in fact, Siri answers more acc...",0,it like siri in fact siri answers more accurat...
65,How easy if was to set up.,1,how easy if was to set up
67,You’re all I need...na na nana!,1,you re all need na na nana
...,...,...,...
3096,The product sounded the same as the emoji spea...,0,the product sounded the same as the emoji spea...
3110,"Love it! I personally prefer Spotify music, so...",1,love it personally prefer spotify music so it ...
3113,I love it. bought one for my daughter and one ...,1,love it bought one for my daughter and one for...
3126,,1,


## Data Split

In [None]:
import pandas as pd

# Hold out 95% of the balanced data as the evaluation (test) set. The
# remaining 5% is kept as a small pool of labelled examples.
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Randomly sample test_size rows for the test set; the fixed seed keeps the
# split identical on every Restart & Run All
test_set = data_balanced.sample(test_size, random_state=42)

# The rows that were not sampled form the training set
train_set = data_balanced.drop(test_set.index)

## Sentiment w/ LLM

### Setting up Gemini API

In [None]:
# Install or upgrade the google-generativeai package quietly.
!pip install -q -U google-generativeai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.3/718.3 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap ``text`` in a Markdown block quote for rich display.

  Bullet characters ('•') are rewritten as '  *' list markers and every line,
  blank lines included, is prefixed with '> '.
  """
  with_list_markers = text.replace('•', '  *')
  quoted = textwrap.indent(with_list_markers, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

# Used to securely store your API key
from google.colab import userdata

In [None]:
# Read the Gemini API key from Colab's user secrets and hand it to the SDK.
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Print the name of every model that supports the generateContent method
content_models = (
    candidate
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for content_model in content_models:
  print(content_model.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [None]:
# Gemini model handle used by every later generate_content call in this notebook.
model = genai.GenerativeModel('gemini-pro')

In [None]:
%%time
# Smoke test: send one free-form question and render the reply as a block quote.
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 116 ms, sys: 11.6 ms, total: 128 ms
Wall time: 7.05 s


> The meaning of life is a philosophical question that has been pondered by humans for centuries. There is no one definitive answer that everyone can agree on, as the meaning of life is likely to be unique to each individual. However, there are some common themes that emerge when people discuss the meaning of life.
> 
> - **Finding purpose:** Many people believe that the meaning of life is found in finding a purpose or goal to work towards. This could be a career, a relationship, a hobby, or anything else that gives you a sense of meaning and fulfillment.
> 
> 
> - **Helping others:** Many people believe that the meaning of life is found in helping others. This could involve volunteering, donating to charity, or simply being there for your friends and family.
> 
> 
> - **Living in the moment:** Some people believe that the meaning of life is found in living in the present moment. This means appreciating the good times and not dwelling on the past or worrying about the future.
> 
> 
> - **Being yourself:** Many people believe that the meaning of life is found in being true to yourself. This means accepting who you are, and not trying to be someone you're not.
> 
> 
> Ultimately, the meaning of life is a personal question that each individual must answer for themselves. However, by exploring the different perspectives on this question, we can gain a better understanding of ourselves and what we want out of life.

#### Single API Call

In [None]:
# Draw 20 random test reviews and attach an empty 'pred_label' column that
# the model's predictions will fill in later.
test_set_sample = test_set.sample(20).assign(pred_label='')

test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
2842,"I reached out to Amazon, because the device wa...",0,reached out to amazon because the device wante...,
745,"Excelente, lo unico es que no esta en español.",1,excelente lo unico es que no esta en español,
1649,Works with Blink video cameras. Does not work...,1,works with blink video cameras does not work w...,
2876,,0,,
2994,Good speaker for bedroom or office,1,good speaker for bedroom or office,
202,Love this! Have it in the kitchen and works w...,1,love this have it in the kitchen and works well,
2582,Have the echo wanted something smaller in anot...,1,have the echo wanted something smaller in anot...,
3110,"Love it! I personally prefer Spotify music, so...",1,love it personally prefer spotify music so it ...,
1422,Waste of money over the echo generation 2. Yo...,0,waste of money over the echo generation 2 you ...,
2098,Got this a few months ago with the idea that i...,0,got this few months ago with the idea that it ...,


In [None]:
# Serialise the cleaned text and the empty prediction slot as a JSON array
# with one object per review
prompt_columns = test_set_sample[['clean_reviews','pred_label']]
json_data = prompt_columns.to_json(orient='records')

# Show the payload that will be embedded in the prompt
print(json_data)

[{"clean_reviews":"reached out to amazon because the device wanted to sync my phone number but it would not allow because it said my number was already in use customer service couldn help they basically told me to contact sprint to assist so echo does the bare minimum without access to my phone for set up so its kind of pointless to have and pay for","pred_label":""},{"clean_reviews":"excelente lo unico es que no esta en espa\u00f1ol","pred_label":""},{"clean_reviews":"works with blink video cameras does not work with you tube","pred_label":""},{"clean_reviews":"","pred_label":""},{"clean_reviews":"good speaker for bedroom or office","pred_label":""},{"clean_reviews":"love this have it in the kitchen and works well","pred_label":""},{"clean_reviews":"have the echo wanted something smaller in another room they work together great","pred_label":""},{"clean_reviews":"love it personally prefer spotify music so it great that was able to set that as default there not day that goes by without

In [None]:
# Zero-shot classification prompt: the instructions are followed by the JSON
# payload from `json_data`, fenced in triple backticks. The model is asked to
# return the same JSON with 'pred_label' filled in.
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)


You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
[{"clean_reviews":"reached out to amazon because the device wanted to sync my phone number but it would not allow because it said my number was already in use customer service couldn help they basically told me to contact sprint to assist so echo does the bare minimum without access to my phone for set up so its kind of pointless to have and pay for","pred_label":""},{"clean_reviews":"excelente lo unico es que no esta en espa\u00f1ol","pred_label":""},{"clean_reviews":"works with blink video cameras does no

In [None]:
# Send the whole 20-review prompt in a single Gemini call and show the raw reply.
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_reviews":"reached out to amazon because the device wanted to sync my phone number but it would not allow because it said my number was already in use customer service couldn help they basically told me to contact sprint to assist so echo does the bare minimum without access to my phone for set up so its kind of pointless to have and pay for","pred_label":0},{"clean_reviews":"excelente lo unico es que no esta en espa\u00f1ol","pred_label":0},{"clean_reviews":"works with blink video cameras does not work with you tube","pred_label":0},{"clean_reviews":"","pred_label":0},{"clean_reviews":"good speaker for bedroom or office","pred_label":1},{"clean_reviews":"love this have it in the kitchen and works well","pred_label":1},{"clean_reviews":"have the echo wanted something smaller in another room they work together great","pred_label":1},{"clean_reviews":"love it personally prefer spotify music so it great that was able to set that as default there not day that goes by without mu

In [None]:
import json

# The reply arrives wrapped in a Markdown code fence, which may carry a
# language tag (```json). Slice out the JSON array itself: stripping only the
# backticks would leave such a tag in front of the payload and make
# json.loads fail.
raw_reply = response.text.strip()
array_start, array_end = raw_reply.find("["), raw_reply.rfind("]")
if array_start == -1 or array_end < array_start:
  raise ValueError(f"No JSON array found in model response: {raw_reply[:200]!r}")

# Parse into fresh names so the `data` DataFrame loaded at the top of the
# notebook and the `json_data` prompt payload are not overwritten by this cell.
predicted_records = json.loads(raw_reply[array_start : array_end + 1])
df_sample = pd.DataFrame(predicted_records)

df_sample

Unnamed: 0,clean_reviews,pred_label
0,reached out to amazon because the device wante...,0
1,excelente lo unico es que no esta en español,0
2,works with blink video cameras does not work w...,0
3,,0
4,good speaker for bedroom or office,1
5,love this have it in the kitchen and works well,1
6,have the echo wanted something smaller in anot...,1
7,love it personally prefer spotify music so it ...,1
8,waste of money over the echo generation 2 you ...,0
9,got this few months ago with the idea that it ...,0


In [None]:
# Copy the model's predictions from 'df_sample' into 'test_set_sample'.
# The copy is positional, so fail with a clear message if the model did not
# return exactly one record per review.
if len(df_sample) != len(test_set_sample):
  raise ValueError(
      f"Expected {len(test_set_sample)} predictions, got {len(df_sample)}"
  )

# Cast to int so predictions compare cleanly with the integer 'label' column
# even if the model returned "0"/"1" as strings.
test_set_sample['pred_label'] = df_sample['pred_label'].astype(int).values
test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
2842,"I reached out to Amazon, because the device wa...",0,reached out to amazon because the device wante...,0
745,"Excelente, lo unico es que no esta en español.",1,excelente lo unico es que no esta en español,0
1649,Works with Blink video cameras. Does not work...,1,works with blink video cameras does not work w...,0
2876,,0,,0
2994,Good speaker for bedroom or office,1,good speaker for bedroom or office,1
202,Love this! Have it in the kitchen and works w...,1,love this have it in the kitchen and works well,1
2582,Have the echo wanted something smaller in anot...,1,have the echo wanted something smaller in anot...,1
3110,"Love it! I personally prefer Spotify music, so...",1,love it personally prefer spotify music so it ...,1
1422,Waste of money over the echo generation 2. Yo...,0,waste of money over the echo generation 2 you ...,0
2098,Got this a few months ago with the idea that i...,0,got this few months ago with the idea that it ...,0


In [None]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

# True labels and model predictions for the sampled reviews
y_true, y_pred = test_set_sample["label"], test_set_sample["pred_label"]

confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[ 8,  0],
       [ 2, 10]])

### OpenAI API Config

In [None]:
# Pinned to 0.27.0: the cells below use the legacy `openai.ChatCompletion` interface.
!pip install openai==0.27.0



In [None]:
import openai
from google.colab import userdata

# Read the OpenAI API key from Colab's user secrets and register it with the client.
OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')
openai.api_key  = OPENAI_API_KEY

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo-0125"):
  """Send ``prompt`` as a single user message and return the reply text.

  Args:
    prompt: Text of the user message.
    model: Name of the chat model passed to the API.

  Returns:
    The message content of the first choice in the API response.
  """
  completion = openai.ChatCompletion.create(
      model=model,
      messages=[{"role": "user", "content": prompt}],
      temperature=0,
  )
  first_choice = completion.choices[0]
  return first_choice.message["content"]

In [None]:
# Smoke test for get_completion. NOTE: in the saved run this call failed with
# RateLimitError (quota exceeded), so `chatgpt_response` was never assigned.
prompt = "Why is the sky blue?"

chatgpt_response = get_completion(prompt)

RateLimitError: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [None]:
# Show the reply; the name only exists if the previous cell completed successfully.
chatgpt_response

NameError: name 'chatgpt_response' is not defined

#### Batching API Calls (Single Shot)

In [None]:
# (rows, columns) of the held-out test set.
test_set.shape

In [None]:
# Draw 100 random test reviews and add an empty 'pred_label' column for the
# model to fill in.
test_set_total = test_set.sample(100).assign(pred_label='')

test_set_total

In [None]:
# Cut the evaluation frame into consecutive chunks of at most `batch_size` rows
batch_size = 50
batches = []

for start in range(0, len(test_set_total), batch_size):
  chunk = test_set_total[start : start + batch_size]
  batches.append(chunk)

In [None]:
import time

def gpt_completion_function(batch,current_batch,total_batch,model="gpt-3.5-turbo-1106"):
  """Classify one batch of reviews with the OpenAI chat API (zero-shot).

  The batch's 'clean_reviews' and 'pred_label' columns are serialised to a JSON
  array of records, embedded in the classification prompt, and sent as a single
  user message. A progress line and the full prompt are printed.

  Args:
    batch: DataFrame with 'clean_reviews' and 'pred_label' columns.
    current_batch: Zero-based index of this batch (printed one-based).
    total_batch: Total number of batches; used only in the progress line.
    model: Chat model name passed to the API.

  Returns:
    The message content of the first choice in the API response.
  """
  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  reviews_json = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {reviews_json}
  ```
  """

  print(prompt)

  completion = openai.ChatCompletion.create(
      model=model,
      messages=[{"role": "user", "content": prompt}],
      temperature=0,
  )
  # Pause for five seconds after every request before returning
  time.sleep(5)
  return completion.choices[0].message["content"]

In [None]:
# Send the batches one at a time, keeping each raw reply in order
batch_count = len(batches)
responses = []

for batch_index, batch in enumerate(batches):
  reply = gpt_completion_function(batch, batch_index, batch_count)
  responses.append(reply)

In [None]:
import json

# Parse every batch reply and stack the per-batch predictions into one frame.
# DataFrame.append was removed in pandas 2.0, so the per-batch frames are
# collected in a list and concatenated once.
batch_frames = []

for response in responses:
  # Slice out the JSON array itself. Stripping only the backticks would leave
  # a fence language tag (```json) in front of the payload and break json.loads.
  reply_text = response.strip()
  array_start, array_end = reply_text.find("["), reply_text.rfind("]")
  if array_start == -1 or array_end < array_start:
    raise ValueError(f"No JSON array found in model response: {reply_text[:200]!r}")

  # Parsed straight into the list so the notebook-level `data` DataFrame is
  # not overwritten by this cell.
  batch_frames.append(pd.DataFrame(json.loads(reply_text[array_start : array_end + 1])))

df_total = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

print(df_total)  # Display the final DataFrame

In [None]:
# Copy the model's predictions from 'df_total' into 'test_set_total'.
# The copy is positional, so fail with a clear message if the model did not
# return exactly one record per review.
if len(df_total) != len(test_set_total):
  raise ValueError(
      f"Expected {len(test_set_total)} predictions, got {len(df_total)}"
  )

# Cast to int so predictions compare cleanly with the integer 'label' column
# even if the model returned "0"/"1" as strings.
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

In [None]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

# True labels and model predictions for the evaluated reviews
y_true, y_pred = test_set_total["label"], test_set_total["pred_label"]

matrix = confusion_matrix(y_true, y_pred)
print(matrix)

accuracy = accuracy_score(y_true, y_pred)
print(f"\nAccuracy: {accuracy}")

### Batching API Calls: Gemini API

In [None]:
# (rows, columns) of the held-out test set.
test_set.shape

In [None]:
# Fresh random draw of 100 test reviews for the Gemini run, with an empty
# 'pred_label' column to be filled from the model's replies.
test_set_total = test_set.sample(100).assign(pred_label='')

test_set_total

In [None]:
# Cut the evaluation frame into consecutive chunks of at most `batch_size` rows
batch_size = 25
batches = []

for start in range(0, len(test_set_total), batch_size):
  chunk = test_set_total[start : start + batch_size]
  batches.append(chunk)

In [None]:
import time

def gemini_completion_function(batch,current_batch,total_batch):
  """Classify one batch of reviews with the notebook-level Gemini ``model``.

  The batch's 'clean_reviews' and 'pred_label' columns are serialised to a JSON
  array of records and embedded in the classification prompt. A progress line
  and the full prompt are printed.

  Args:
    batch: DataFrame with 'clean_reviews' and 'pred_label' columns.
    current_batch: Zero-based index of this batch (printed one-based).
    total_batch: Total number of batches; used only in the progress line.

  Returns:
    The response object returned by ``model.generate_content``; callers read
    its ``.text`` attribute.
  """
  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  reviews_json = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {reviews_json}
  ```
  """

  print(prompt)
  gemini_reply = model.generate_content(prompt)
  # Pause for five seconds after every request before returning
  time.sleep(5)

  return gemini_reply

In [None]:
# Send the batches one at a time, keeping each Gemini response object in order
batch_count = len(batches)
responses = []

for batch_index, batch in enumerate(batches):
  reply = gemini_completion_function(batch, batch_index, batch_count)
  responses.append(reply)

In [None]:
import json

# Parse every Gemini reply and stack the per-batch predictions into one frame.
# DataFrame.append was removed in pandas 2.0, so the per-batch frames are
# collected in a list and concatenated once.
batch_frames = []

for response in responses:
  # Slice out the JSON array itself. Stripping only the backticks would leave
  # a fence language tag (```json) in front of the payload and break json.loads.
  reply_text = response.text.strip()
  array_start, array_end = reply_text.find("["), reply_text.rfind("]")
  if array_start == -1 or array_end < array_start:
    raise ValueError(f"No JSON array found in model response: {reply_text[:200]!r}")

  # Parsed straight into the list so the notebook-level `data` DataFrame is
  # not overwritten by this cell.
  batch_frames.append(pd.DataFrame(json.loads(reply_text[array_start : array_end + 1])))

df_total = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

print(df_total)  # Display the final DataFrame

In [None]:
# Copy the model's predictions from 'df_total' into 'test_set_total'.
# The copy is positional, so fail with a clear message if the model did not
# return exactly one record per review.
if len(df_total) != len(test_set_total):
  raise ValueError(
      f"Expected {len(test_set_total)} predictions, got {len(df_total)}"
  )

# Cast to int so predictions compare cleanly with the integer 'label' column
# even if the model returned "0"/"1" as strings.
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

In [None]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

# True labels and Gemini predictions for the evaluated reviews
y_true, y_pred = test_set_total["label"], test_set_total["pred_label"]

confusion_matrix(y_true=y_true, y_pred=y_pred)

## Batching API Calls: ChatGPT (Few Shot)

In [None]:
# (rows, columns) of the held-out test set.
test_set.shape

In [None]:
# Fresh random draw of 100 test reviews for the few-shot run, with an empty
# 'pred_label' column to be filled from the model's replies.
test_set_total = test_set.sample(100).assign(pred_label='')

test_set_total

In [None]:
# Cut the evaluation frame into consecutive chunks of at most `batch_size` rows
batch_size = 50
batches = []

for start in range(0, len(test_set_total), batch_size):
  chunk = test_set_total[start : start + batch_size]
  batches.append(chunk)

In [None]:
import time

def gpt_completion_function(batch,current_batch,total_batch,train_sample=None,model="gpt-3.5-turbo-1106"):
  """Classify one batch of reviews with the OpenAI chat API, optionally few-shot.

  This definition replaces the earlier zero-shot function of the same name.
  ``train_sample`` is optional so the zero-shot call
  ``gpt_completion_function(batch, i, n)`` keeps working after this cell has
  run, instead of failing on a missing argument.

  Steps:
    1. Serialise the batch (and the labelled examples, if given) to JSON.
    2. Build the classification prompt.
    3. Call the OpenAI chat API.

  Args:
    batch: DataFrame with 'clean_reviews' and 'pred_label' columns.
    current_batch: Zero-based index of this batch (printed one-based).
    total_batch: Total number of batches; used only in the progress line.
    train_sample: Optional DataFrame with 'clean_reviews' and 'label' columns
      whose rows are shown to the model as worked examples. When None, the
      prompt contains no examples section.
    model: Chat model name passed to the API.

  Returns:
    The message content of the first choice in the API response.
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  # The examples section is only added when labelled examples were supplied.
  examples_intro = ""
  examples_block = ""
  if train_sample is not None:
    sample_json_data = train_sample[['clean_reviews','label']].to_json(orient='records')
    examples_intro = (
      "  Examples of good Sentiment Analysis Classification are provided between separator ####.\n"
      "  These examples are for your reference, not to be included in your final output.\n"
    )
    examples_block = f"  ####\n  {sample_json_data}\n  ####\n"

  # Assembled piecewise; the resulting text matches the original prompts
  # character for character in both the few-shot and zero-shot case.
  prompt = (
    "You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.\n"
    "  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).\n"
    "  Customer reviews are provided between three backticks below.\n"
    "  In your output, only return the Json code back as output - which is provided between three backticks.\n"
    "  Your task is to update predicted labels under 'pred_label' in the Json code.\n"
    "  Don't make any changes to Json code format, please.\n"
    "  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).\n"
    + examples_intro
    + "\n"
    + "  ```\n"
    + f"  {json_data}\n"
    + "  ```\n"
    + examples_block
    + "  "
  )

  print(prompt)

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)
  # Pause for five seconds after every request before returning
  time.sleep(5)
  return response.choices[0].message["content"]

In [None]:
# Four random labelled training rows serve as the few-shot examples for every batch
train_sample = train_set.sample(4)

# Send the batches one at a time, keeping each raw reply in order
batch_count = len(batches)
responses = []

for batch_index, batch in enumerate(batches):
  reply = gpt_completion_function(batch, batch_index, batch_count, train_sample)
  responses.append(reply)

In [None]:
import json

# Parse every batch reply and stack the per-batch predictions into one frame.
# DataFrame.append was removed in pandas 2.0, so the per-batch frames are
# collected in a list and concatenated once.
batch_frames = []

for response in responses:
  # Slice out the JSON array itself. Stripping only the backticks would leave
  # a fence language tag (```json) in front of the payload and break json.loads.
  reply_text = response.strip()
  array_start, array_end = reply_text.find("["), reply_text.rfind("]")
  if array_start == -1 or array_end < array_start:
    raise ValueError(f"No JSON array found in model response: {reply_text[:200]!r}")

  # Parsed straight into the list so the notebook-level `data` DataFrame is
  # not overwritten by this cell.
  batch_frames.append(pd.DataFrame(json.loads(reply_text[array_start : array_end + 1])))

df_total = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

print(df_total)  # Display the final DataFrame

In [None]:
# Copy the model's predictions from 'df_total' into 'test_set_total'.
# The copy is positional, so fail with a clear message if the model did not
# return exactly one record per review.
if len(df_total) != len(test_set_total):
  raise ValueError(
      f"Expected {len(test_set_total)} predictions, got {len(df_total)}"
  )

# Cast to int so predictions compare cleanly with the integer 'label' column
# even if the model returned "0"/"1" as strings.
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

In [None]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

# True labels and few-shot predictions for the evaluated reviews
y_true, y_pred = test_set_total["label"], test_set_total["pred_label"]

matrix = confusion_matrix(y_true, y_pred)
print(matrix)

accuracy = accuracy_score(y_true, y_pred)
print(f"\nAccuracy: {accuracy}")