In [24]:

import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

from tqdm import tqdm

from pinecone import Pinecone, PodSpec, ServerlessSpec
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from dotenv import load_dotenv,find_dotenv

load_dotenv(find_dotenv())
     


False

In [25]:


from langchain.chains import RetrievalQA


In [26]:

# Extract data from files
data = []
with gzip.open('D:\\Downloads\\Appliances.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
        
metadata = []
with gzip.open('D:\\Downloads\\meta_Appliances.json.gz') as f:
    for l in f:
        metadata.append(json.loads(l.strip()))

In [27]:
# Load the data to dataframes

df = pd.DataFrame.from_dict(data)
df = df[df['reviewText'].notna()]

df_meta=pd.DataFrame.from_dict(metadata)

In [28]:

# Truncate the reviewText

max_text_length=400
def truncate_review(text):
    return text[:max_text_length]

df['truncated']=df.apply(lambda row: truncate_review(row['reviewText']),axis=1)

In [29]:

# Look for productIds with enough reviews

df.groupby('asin').count().sort_values('overall')

Unnamed: 0_level_0,overall,vote,verified,reviewTime,reviewerID,style,reviewerName,reviewText,summary,unixReviewTime,image,truncated
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
B00GS3TIQA,1,0,1,1,1,0,1,1,1,1,0,1
B00HNQDHYU,1,0,1,1,1,0,1,1,1,1,0,1
B00HNQDKZQ,1,0,1,1,1,0,1,1,1,1,0,1
B00HNQDMGI,1,0,1,1,1,0,1,1,1,1,0,1
B00HNQDMLI,1,0,1,1,1,0,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
B0045LLC7K,2934,76,2934,2934,2934,0,2934,2934,2934,2934,11,2934
B00KJ07SEM,3199,82,3199,3199,3199,0,3199,3199,3199,3199,24,3199
B0014CN8Y8,4048,430,4048,4048,4048,0,4048,4048,4048,4048,132,4048
B004UB1O9Q,5699,183,5699,5699,5699,5544,5699,5699,5699,5699,52,5699


In [30]:

# Work on only a slice of the dataframe

df = df.loc[df['asin'] == 'B00KJ07SEM'].copy()

In [6]:
len(df)

3199

In [31]:
df = df.head(1000)

In [None]:
df.groupby('asin').count().sort_values('overall')

In [32]:

# Import and apply embeddings from HuggingFace
# Warning! Be careful when/if applying embeddings from OpenAI like this - the full review dataframe is more than 800k rows.

##Word2Vec spaCy NltK Embeddings obviously work as well since it's the same underlying data


from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()



##model_name = 'text-embedding-ada-002'

##embeddings = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)


In [33]:

df['embeddings']=df.apply(lambda row: embeddings.embed_query(row['truncated']),axis=1)
     

In [13]:
col_from_df1 = df['truncated']
col_from_df2 = df['embeddings']


new_df = pd.DataFrame({'id': col_from_df1, 'values': col_from_df2})

In [14]:
new_df

Unnamed: 0,id,values
309512,Im not buying the GE one again. This one works...,"[0.02482719160616398, 0.01006878912448883, 0.0..."
309513,Removed the GE MWF Smartwater filter inserted ...,"[0.022194426506757736, 0.008340789005160332, 0..."
309514,This a good filter and fits our needs quite we...,"[-0.004117485601454973, -0.04839412495493889, ..."
309515,Update: Within hours of my posting this review...,"[0.02946353890001774, 0.014538662508130074, -0..."
309516,I have a GE two door refrigerator and use to b...,"[0.037170182913541794, 0.033013369888067245, 0..."
...,...,...
310508,I'm very happy to have found a filter for my r...,"[0.03357476368546486, -0.006093936040997505, 0..."
310509,Works just as well as manufacturers brand with...,"[0.0094750439748168, 0.044361647218465805, -0...."
310510,Seems to work just as well as the original GE ...,"[0.02000478096306324, 0.019915636628866196, 0...."
310511,"Fits perfectly, easy to install, no problems. ...","[0.00936031062155962, 0.05733988434076309, -0...."


In [None]:
# Prepare training and test sets for training Random Forest Regressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(
    list(df.embeddings.values),
    df.overall,
    test_size = 0.2,
    random_state=1
)
     

In [None]:

# Train and calculate mean absolute error

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=150)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mean_absolute_error(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create the individual models
model1 = RandomForestClassifier(n_estimators=150)
model2 = SVC(kernel='linear', C=1, probability=True)

# Create the voting classifier
voting_model = VotingClassifier(estimators=[
    ('rf', model1), 
    ('svm', model2)], 
    voting='soft')

# Train the model
voting_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = voting_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(mae)

In [None]:

pc = Pinecone(api_key="69d4284b-023c-4cec-8c2c-59a90f3f743a")

pc.create_index(
  name="amzn-rwf",
  dimension=768,
  metric="cosine",
  spec=PodSpec(
    environment="us-east4-gcp",
    pod_type="p1.x1",
    pods=1
  
  )
)



In [None]:

pc = Pinecone(api_key="69d4284b-023c-4cec-8c2c-59a90f3f743a")




pc.list_indexes()

In [34]:
# Send embedding vectors to Pinecone with Langchain
from pinecone import Pinecone, PodSpec, ServerlessSpec
import os
pinecone_api_key = os.environ.get("69d4284b-023c-4cec-8c2c-59a90f3f743a")
f##rom pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = "69d4284b-023c-4cec-8c2c-59a90f3f743a"

# configure client
pc = Pinecone(api_key=api_key)

from langchain.vectorstores import Pinecone
vstore = Pinecone.from_documents(texts, embeddings, index_name='amzn-rwf')

AttributeError: 'str' object has no attribute 'page_content'

In [9]:
# Create list with truncated review texts

texts=df['truncated'].tolist()

In [None]:
obj(texts)

In [None]:
dfembeddings = df['embeddings']

dataset = pd.DataFrame({'texts': texts,'embeddings' : dfembeddings})

In [None]:
##df.head()
'''
dataset=dataset.rename(columns={'embeddings':'values','texts':'id'})
data=df[['truncated','embeddings']].to_dict(orient='records')
data=data.rename(columns={'embeddings':'values','texts':'id'})



rdf=df.rename(columns={'embeddings':'values','truncated':'id'})
'''

In [15]:

pc = Pinecone(api_key="69d4284b-023c-4cec-8c2c-59a90f3f743a")


index = pc.Index("amzn-rwf")






# Define the batch size
batch_size = 100

# Process in batches
for i in range(0, len(df), batch_size):
    # Define the end of the batch
    j = min(i + batch_size, len(df))

    # Create the batch
    # Convert each row to (id, vector) format
    batch = [(row['id'], row['values']) for index, row in new_df[i:j].iterrows()]

    # Upsert the batch
    index.upsert(vectors=batch)

In [None]:
for batch in dataset.iter_documents(batch_size=100):
    index.upsert(batch)

In [None]:

from langchain.vectorstores import Pinecone as vc

vstore = vc.add_texts(texts, embeddings, index_name='amzn-rwf')
vstore = Pinecone.from_texts(texts, embeddings, index_name='cxanalytics')

In [38]:
from pinecone import Pinecone
pc = Pinecone(api_key="69d4284b-023c-4cec-8c2c-59a90f3f743a")

index = pc.Index("amzn-rwf")



##index.describe_index_stats()



from langchain.vectorstores import Pinecone as VectorPine

##text_field = "text"


vstore = VectorPine(
    index, embeddings.embed_query, 'text'
)

In [39]:
query = "quality?"

vectorstore.similarity_search(
    query, k=10

)



ValueError: The argument order for `query()` has changed; please use keyword arguments instead of positional arguments. Example: index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')

In [41]:

# Import RetrievalQA adn ChatOpenAPI and define review_chain in order to have GPT-4 access the review data

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
os.environ['OPENAI_API_KEY'] = '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'
'OPENAI_API_KEY' = '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'
##Connects to OpenAI ChatGPT and does creates a Question and Answering Chain

##chat = ChatOpenAI(model_name="gpt-3.5-turbo-0301",temperature=0.0)


model_name="gpt-3.5-turbo-0301"

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

query = "product?"

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

SyntaxError: cannot assign to literal here. Maybe you meant '==' instead of '='? (2546946677.py, line 6)

In [35]:
# Define the task for GPT-4 and run the chain

q="""
The reviews you see are for a product called 'Powerstep Pinnacle Orthotic Shoe Insoles'.
What is the overall impression of these reviews? Give most prevalent examples in bullets. 
What do you suggest we focus on improving?
"""

result=review_chain.run(q)
print(result)
     

TypeError: 'HuggingFaceEmbeddings' object is not callable

In [None]:
"""Rename columns in dataframe and create metadatafield in order to do upserts with Pinecone's Python client directly
"""
df=df.rename(columns={'embeddings':'values','reviewerID':'id'})
df['metadata']=df.apply(lambda row: dict(rating=row['overall']), axis=1)

In [None]:

# Create two copies of data, one for the upsert and one for extracting reviewText from ids return from the filtered similarity search

data=df[['metadata','values','id']].to_dict(orient='records')
data_local=df[['metadata','values','reviewText','id']].to_dict(orient='records')

In [None]:

# Create the Pinecone index
#pinecone.create_index(name='amazonfashionembeddings', metric='euclidean', dimension=768)
##pinecone.create_index(name='afefilter', metric='euclidean', dimension=768)

pc.create_index(
  name="amzn-wf-filter",
  dimension=768,
  metric="cosine",
  spec=PodSpec(
    environment="us-east4-gcp",
    pod_type="p1.x1",
    pods=1
  
  )
)

In [None]:
index = pinecone.Index("amzn-fefilter")

In [None]:
# Upload the data in batches of 50

from tqdm.auto import tqdm

for i in tqdm(range(0, len(data), 50)):
    j= i + 50
    if j > len(data):
        j = len(data)
    batch = data[i: j]
    index.upsert(vectors=batch)

In [None]:

# Run a filtered similarity search

query=embeddings.embed_query("will buy again")
results = index.query(queries=[query], top_k=100, filter={'rating': {'$eq': 4.0}})
print(results)

In [None]:
# Get the rating from id

get_rating_from_id = {
    x['id']: {
        'rating': x['metadata']['rating'],
        'review': x['reviewText'],
    } for x in data_local}
     

In [None]:
# Python function that retrieves reviews matching query and specific rate

def review_and_rating(query,rating):
    query=embeddings.embed_query(query)
    results = index.query(queries=[query], top_k=100, filter={'rating': {'$eq': rating}})
    ids = [i['id'] for i in results['results'][0]['matches']]
    l=[]
    for i in ids:
        l.append(get_rating_from_id[i])
    return pd.DataFrame(l)
     

In [None]:

# Repurchase list and winback list

repurchase_list=review_and_rating('will purchase again', 5.0)

winback=review_and_rating('disappointed', 1.0)


In [None]:
winback

In [None]:
repurchase_list

In [None]:
winbackb=review_and_rating('disappointed', 1.0)

In [None]:
winback=review_and_rating('happy', 4.0)

In [None]:
winback

In [None]:
pc.delete_index(name='amzn-fe')