## Working Environment



In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
%cd /content/drive/MyDrive/amazon_alexa.tsv

[Errno 20] Not a directory: '/content/drive/MyDrive/amazon_alexa.tsv'
/content


In [4]:
!ls

drive  sample_data


## Import Dataset

In [9]:
import pandas as pd

data = pd.read_csv('/amazon_alexa.tsv', sep='\t')
data.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [10]:
mydata = data[['verified_reviews','feedback']]
mydata.columns = ['review','label']

mydata.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [11]:
mydata.value_counts('label')

label
1    2893
0     257
Name: count, dtype: int64

In [12]:
# Count the occurrences of each label
label_counts = mydata["label"].value_counts()

# Get the number of rows to drop from the majority class
rows_to_drop = label_counts.max() - label_counts.min()

# Drop rows from the majority class randomly
if rows_to_drop > 0:
   data_majority = mydata[mydata["label"] == 1]
   data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
   data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced["label"].value_counts())

label
1    257
0    257
Name: count, dtype: int64


## Data Perprocessing

In [13]:
import re

def clean_text(text):
  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Remove HTML tags
  text = re.sub(r"<[^>]*>", " ", text)

  # Lowercase the text
  text = text.lower()

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)

  # Trim leading and trailing spaces
  text = text.strip()

  return text

In [15]:
import pandas as pd
import re

# Extract the review column as a list
reviews = data_balanced['review'].tolist()

# Clean the text in the list, handling non-string elements
cleaned_reviews = []
for review in reviews:
    if isinstance(review, str):  # Check if the review is a string
        cleaned_reviews.append(clean_text(review))
    else:
        cleaned_reviews.append('')  # Or handle it as needed

# Add the cleaned reviews as a new column to the DataFrame
data_balanced['clean_reviews'] = cleaned_reviews

In [16]:
data_balanced

Unnamed: 0,review,label,clean_reviews
4,Music,1,music
19,I liked the original Echo. This is the same bu...,1,liked the original echo this is the same but s...
26,"I love my Echo. It's easy to operate, loads of...",1,love my echo it easy to operate loads of fun i...
39,This is my first digital assistant so I'm givi...,1,this is my first digital assistant so giving t...
46,"It's like Siri, in fact, Siri answers more acc...",0,it like siri in fact siri answers more accurat...
...,...,...,...
3096,The product sounded the same as the emoji spea...,0,the product sounded the same as the emoji spea...
3102,"Works great, love the fact you can play the sa...",1,works great love the fact you can play the sam...
3116,I enjoy it. Still discovering new uses.,1,enjoy it still discovering new uses
3121,I like the hands free operation vs the Tap. We...,1,like the hands free operation vs the tap we us...


# Data Split

In [17]:
import pandas as pd

# Assuming your DataFrame is called "df"
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

# Get the remaining rows for the test set
train_set = data_balanced.drop(test_set.index)

## Sentiment w/ LLM

## Setting up Gemini API

In [18]:
!pip install -q -U google-generativeai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.3/718.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [19]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# Used to securely store your API key
from google.colab import userdata

In [29]:
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [30]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [31]:
model = genai.GenerativeModel('gemini-1.5-pro')

In [32]:
%%time
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 194 ms, sys: 20.7 ms, total: 214 ms
Wall time: 11.4 s


> As a large language model, I can't tell you the meaning of life. That's a question philosophers have pondered for millennia and there's no one right answer.  It's a question each person must answer for themselves. 
> 
> However, I can offer some ideas to help you explore this question:
> 
> * **Purpose and meaning are not the same:** Some find meaning in having a purpose, like a career goal or a family to build.  Others find meaning in the journey itself, finding joy in everyday experiences and connections. 
> * **It's about what matters to you:** What do you value? What brings you joy, fulfillment, and a sense of peace?  Exploring these questions can reveal what gives your life meaning. 
> * **Meaning can change and evolve:** What gives your life meaning today might not be the same thing tomorrow, and that's okay.  Life is a journey of growth and discovery.
> * **It's okay not to have all the answers:**  Some find meaning in the search itself.  Embrace the mystery and keep asking questions.
> 
> Here are some resources you might find helpful in your exploration:
> 
> * **Philosophical texts:**  Read about different perspectives on the meaning of life. Start with philosophers like Socrates, Plato, Aristotle, or explore different schools of thought like existentialism, stoicism, or absurdism.
> * **Spiritual or religious texts:** If you're drawn to spirituality, explore religious texts or teachings that resonate with you.
> * **Self-reflection:** Spend time in nature, meditate, journal, or talk to a trusted friend or therapist. Reflect on your values, experiences, and aspirations. 
> 
> Ultimately, the meaning of life is what you make it. It's a journey of self-discovery, connection, and finding what makes your heart sing. 


## Single API Call

In [33]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label'] = ''

test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
1233,I purchased this on prime day mostly as a pres...,1,purchased this on prime day mostly as present ...,
2197,,0,,
499,Now I'm weary about these picking up conversat...,0,now weary about these picking up conversations...,
382,Poor quality. Gave it away.,0,poor quality gave it away,
2664,HANDY AS HELL 10/10 RECOMMEND,1,handy as hell 10 10 recommend,
900,It's got great sound and bass but it doesn't w...,0,it got great sound and bass but it doesn work ...,
566,The second one which was a refurbished model d...,0,the second one which was refurbished model did...,
668,It's ok. The speaker is pretty terrible. Googl...,0,it ok the speaker is pretty terrible google ho...,
2095,"As an echo its great, Good sound quality and f...",0,as an echo its great good sound quality and fu...,
1716,Need to be able to connect to more 3rd party v...,0,need to be able to connect to more 3rd party v...,


In [34]:
# Convert the DataFrame to JSON using the to_json() method

json_data = test_set_sample[['clean_reviews','pred_label']].to_json(orient='records')

# Print the JSON data
print(json_data)

[{"clean_reviews":"purchased this on prime day mostly as present for myself because found having the echo plus on my kitchen counter to be overwhelming when cooking cutting cleaning etc especially when our counters aren very wide to begin with after going back and forth between this and the show decided on this because felt it had all the same features but is more compact can honestly say have no regrets while probably wouldn watch movie been dying to see on this it great way to listen to music watch quick youtube video etc even attempted watching jaws on this and while it wasn the most comfortable thing for my eyes to do it wasn as bad as one would think would definitely recommend","pred_label":""},{"clean_reviews":"","pred_label":""},{"clean_reviews":"now weary about these picking up conversations when you are not actively using them heard about this happening on the news have 3 4 so this is very concerning wish amazon would release something widespread or push some firmware to preve

In [35]:
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)


You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
[{"clean_reviews":"purchased this on prime day mostly as present for myself because found having the echo plus on my kitchen counter to be overwhelming when cooking cutting cleaning etc especially when our counters aren very wide to begin with after going back and forth between this and the show decided on this because felt it had all the same features but is more compact can honestly say have no regrets while probably wouldn watch movie been dying to see on this it great way to listen to music watch quick 

In [36]:
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_reviews":"purchased this on prime day mostly as present for myself because found having the echo plus on my kitchen counter to be overwhelming when cooking cutting cleaning etc especially when our counters aren very wide to begin with after going back and forth between this and the show decided on this because felt it had all the same features but is more compact can honestly say have no regrets while probably wouldn watch movie been dying to listen to music watch quick youtube video etc even attempted watching jaws on this and while it wasn the most comfortable thing for my eyes to do it wasn as bad as one would think would definitely recommend","pred_label":1},{"clean_reviews":"","pred_label":""},{"clean_reviews":"now weary about these picking up conversations when you are not actively using them heard about this happening on the news have 3 4 so this is very concerning wish amazon would release something widespread or push some firmware to prevent this if they have have

In [38]:
import json

# Clean the data by stripping potential extra characters
json_data = response.text.strip("`\n")  # Strip newlines as well

# Check if the response starts and ends with square brackets (indicating a JSON array)
if json_data.startswith('[') and json_data.endswith(']'):
    # Load the cleaned data and convert to DataFrame
    data = json.loads(json_data)
    df_sample = pd.DataFrame(data)
else:
    print("Unexpected response format. The model might not be returning a valid JSON array.")
    print(json_data)  # Print the raw response for inspection

df_sample

Unnamed: 0,clean_reviews,pred_label
0,purchased this on prime day mostly as present ...,1.0
1,,
2,now weary about these picking up conversations...,0.0
3,poor quality gave it away,0.0
4,handy as hell 10 10 recommend,1.0
5,it got great sound and bass but it doesn work ...,0.0
6,the second one which was refurbished model did...,0.0
7,it ok the speaker is pretty terrible google ho...,0.0
8,as an echo its great good sound quality and fu...,0.0
9,need to be able to connect to more 3rd party v...,0.0


In [39]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
1233,I purchased this on prime day mostly as a pres...,1,purchased this on prime day mostly as present ...,1.0
2197,,0,,
499,Now I'm weary about these picking up conversat...,0,now weary about these picking up conversations...,0.0
382,Poor quality. Gave it away.,0,poor quality gave it away,0.0
2664,HANDY AS HELL 10/10 RECOMMEND,1,handy as hell 10 10 recommend,1.0
900,It's got great sound and bass but it doesn't w...,0,it got great sound and bass but it doesn work ...,0.0
566,The second one which was a refurbished model d...,0,the second one which was refurbished model did...,0.0
668,It's ok. The speaker is pretty terrible. Googl...,0,it ok the speaker is pretty terrible google ho...,0.0
2095,"As an echo its great, Good sound quality and f...",0,as an echo its great good sound quality and fu...,0.0
1716,Need to be able to connect to more 3rd party v...,0,need to be able to connect to more 3rd party v...,0.0


In [46]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

# Convert y_pred to numeric, handling non-numeric values
y_pred_numeric = pd.to_numeric(y_pred, errors='coerce').fillna(-1).astype(int)

# Filter out rows with missing predictions (-1)
valid_indices = y_pred_numeric != -1
y_true_valid = y_true[valid_indices]
y_pred_valid = y_pred_numeric[valid_indices]

confusion_matrix(y_true_valid, y_pred_valid)


array([[9, 0],
       [0, 9]])