In [10]:
'''
TODO:
1. Create new attribute named question_embedding which is embedding version of question_text
2. Create new attribute named product_attributes which are the strings include bullet_points, product_description, brand_name, item_name
3. Create new attribute named product_embedding which is embedding version of product_attributes
4. Create new attribute named answer_embedding which is embedding version of answer_text
5. Code the flow of RAG: Retrieve Question (25) -> Retrieve Product (20) -> Retrieve Answer (15) -> Top 5 Answer
6. Prompt strategy for processing answer (major voting, give opinion)
FUTURE WORK:
- Not only use PQA data, but also use Amazon Review Data
- Let LLM classify question first (WH or Yes/No), then retrieve later
'''

'\nTODO:\n1. Create new attribute named question_embedding which is embedding version of question_text\n2. Create new attribute named product_attributes which are the strings include bullet_points, product_description, brand_name, item_name\n3. Create new attribute named product_embedding which is embedding version of product_attributes\n4. Create new attribute named answer_embedding which is embedding version of answer_text\n5. Code the flow of RAG: Retrieve Question (25) -> Retrieve Product (20) -> Retrieve Answer (15) -> Top 5 Answer\n6. Prompt strategy for processing answer (major voting, give opinion)\nFUTURE WORK:\n- Not only use PQA data, but also use Amazon Review Data\n- Let LLM classify question first (WH or Yes/No), then retrieve later\n'

***Load Data***

In [8]:
import pandas as pd
import os
from typing import Union, List

# get single dataset


def get_dataset(dir: str = '../data/PQA', dataset: str = 'accessories') -> Union[pd.DataFrame, None]:
    dataset_dir = f'amazon_pqa_{dataset}.json'
    path = os.path.join(dir, dataset_dir)
    try:
        df = pd.read_json(path, lines = True)
        return df
    except Exception as e:
        print(f'Error loading dataset {dataset}: {e}')
    return None

# get multi dataset
def get_multi_dataset(dir: str = '../data/PQA', list_dataset: List[Union[str, None]] = []) -> List[Union[pd.DataFrame, None]]:
    pass

dir = '../data/PQA'
dataset = 'accessories'

df = get_dataset(dir, dataset)

In [29]:
df.head()

Unnamed: 0,question_id,question_text,asin,bullet_point1,bullet_point2,bullet_point3,bullet_point4,bullet_point5,product_description,brand_name,item_name,question_type,answer_aggregated,answers,question_text_embedding
0,Tx3223771NA0J0W,Is the end where the garden hose attaches plas...,B009360BEW,Use the Deck Washer to quickly clean your deck...,The Deck Washer is easy to use -- just attach ...,Built-in nozzles on the broom provide pressure...,,,Instead of sweeping off your deck and then hos...,,Deck Washer,WH,,[{'answer_text': 'Plastic'}],
1,TxASRMZNTT0NON,"Does the 20% middle portion, or arm rest, get ...",B00E63VX26,Vehicle tested before engineering approval,Ease of installation with Super-Grip slip resi...,"Light fast, color fast materials will not run ...",Form fit wrap around design - multidirectional...,Center armrest/console covers and headrest cov...,,Fia,Fia TR49-39 BLACK Custom Fit Front Seat Cover ...,WH,,"[{'answer_text': 'No, it does not.'}, {'answer...",
2,Tx3PT0N7PHAZ6I0,Is this kit supposed to include a cover for th...,B00E63VX26,Vehicle tested before engineering approval,Ease of installation with Super-Grip slip resi...,"Light fast, color fast materials will not run ...",Form fit wrap around design - multidirectional...,Center armrest/console covers and headrest cov...,,Fia,Fia TR49-39 BLACK Custom Fit Front Seat Cover ...,yes-no,no,[{'answer_text': 'No it does not cover center ...,
3,Tx2LZW18Z5P2ZDH,Does the seat cover stay in place when getting...,B00E63VX26,Vehicle tested before engineering approval,Ease of installation with Super-Grip slip resi...,"Light fast, color fast materials will not run ...",Form fit wrap around design - multidirectional...,Center armrest/console covers and headrest cov...,,Fia,Fia TR49-39 BLACK Custom Fit Front Seat Cover ...,yes-no,neutral,"[{'answer_text': 'no, they are very secure.'},...",
4,Tx1MIMQEE2OCTBP,I made a mistake on size is there any way to a...,B014LMMAY6,Amorbilt - Condor Flex Cap Bundle offers you: ...,Moisture Wicking Elastic Head Band draws sweat...,Two Armorbilt High Quality Hook & Loop Patches...,Front loop patch panel with Embossed Condor Lo...,Additional Features - 6 fabric panels - Embroi...,Condor Flex cap bundled with Armorbilt patches...,,Condor Flex Tactical Cap (Black Small) Bundle ...,WH,,[{'answer_text': 'Your best option is to send ...,


***Data Preprocessing***

In [31]:
from tqdm import tqdm

tqdm.pandas()

product_attributes = ['bullet_point1', 'bullet_point2', 'bullet_point3', 'bullet_point4', 'bullet_point5', 'product_description', 'brand_name', 'item_name', 'question_type']
df['product_attributes'] = df[product_attributes].progress_apply(lambda row: row.to_json(), axis=1)

100%|██████████| 238603/238603 [00:04<00:00, 51567.00it/s]


***Initiate SentenceTransformer Model***

In [32]:
from sentence_transformers import SentenceTransformer


# Initiate Model
model = SentenceTransformer("all-MiniLM-L6-v2")

***Embedding Function***

In [33]:
from tqdm import tqdm
import numpy as np
from typing import Union, List

def progress_embeddings(model, df: pd.DataFrame, attribute: str, row_number: int = len(df), new_attribute: bool = False) -> Union[np.array, None]:
    f''' 
    Args:
    Returns:
        embeddings: embeddings of {attribute}
        df_used: returned dataset with {row_number} first rows
    '''
    tqdm.pandas()
    # create embedding
    try:
        embeddings = df[attribute][:row_number].progress_apply(lambda row: model.encode(row)).to_list()
    except Exception as e:
        print(f'Error encoding {attribute}: {e}')
        return None
    
    # use only k row of data
    df_used = df[:row_number]
    # create new attribute
    if new_attribute == True:
        new_column_name = f'{attribute}_embedding'
        try:
            df_used[new_column_name] = embeddings
        except Exception as e:
            print(f'Error creating {new_column_name}: {e}')
            return None
    # convert embeddings to np.array
    try:
        embeddings = np.array(embeddings)
    except Exception as e:
        print(f'Error converting embedding of {attribute} to numpy array: {e}')
    return embeddings, df_used

***Create Question Embedding***

In [42]:
from sentence_transformers import SentenceTransformer

param = {
    'model': SentenceTransformer("all-MiniLM-L6-v2"),
    'df': df,
    'attribute': 'question_text',
    'row_number': 1000,
    'new_attribute': True
}

questions_embedding, questions_df = progress_embeddings(**param)

100%|██████████| 1000/1000 [00:15<00:00, 63.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used[new_column_name] = embeddings


***Create Products Embedding***

In [43]:
param = {
    'model': SentenceTransformer("all-MiniLM-L6-v2"),
    'df': question_df,
    'attribute': 'product_attributes',
    'row_number': 1000,
    'new_attribute': True
}

products_embedding, products_df = progress_embeddings(**param)

100%|██████████| 1000/1000 [00:40<00:00, 24.90it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used[new_column_name] = embeddings


***Create Answers Embedding***

In [44]:
param = {
    'model': SentenceTransformer("all-MiniLM-L6-v2"),
    'df': products_df,
    'attribute': 'answers',
    'row_number': 1000,
    'new_attribute': True
}

answers_embedding, answers_df = progress_embeddings(**param)

100%|██████████| 1000/1000 [00:26<00:00, 38.18it/s]

Error converting embedding of answers to numpy array: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1000,) + inhomogeneous part.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used[new_column_name] = embeddings


In [46]:
experiment_df = answers_df

In [47]:
experiment_df

Unnamed: 0,question_id,question_text,asin,bullet_point1,bullet_point2,bullet_point3,bullet_point4,bullet_point5,product_description,brand_name,item_name,question_type,answer_aggregated,answers,question_text_embedding,product_attributes,product_attributes_embedding,answers_embedding
0,Tx3223771NA0J0W,Is the end where the garden hose attaches plas...,B009360BEW,Use the Deck Washer to quickly clean your deck...,The Deck Washer is easy to use -- just attach ...,Built-in nozzles on the broom provide pressure...,,,Instead of sweeping off your deck and then hos...,,Deck Washer,WH,,[{'answer_text': 'Plastic'}],"[-0.031283803, 0.08195734, 0.011494885, -0.030...","{""bullet_point1"":""Use the Deck Washer to quick...","[-0.07433951, 0.04471903, 0.096906714, -0.0679...","[[-0.083055, 0.050426304, 0.012281296, 0.04287..."
1,TxASRMZNTT0NON,"Does the 20% middle portion, or arm rest, get ...",B00E63VX26,Vehicle tested before engineering approval,Ease of installation with Super-Grip slip resi...,"Light fast, color fast materials will not run ...",Form fit wrap around design - multidirectional...,Center armrest/console covers and headrest cov...,,Fia,Fia TR49-39 BLACK Custom Fit Front Seat Cover ...,WH,,"[{'answer_text': 'No, it does not.'}, {'answer...","[0.0450177, 0.10086544, 0.0949286, 0.036628306...","{""bullet_point1"":""Vehicle tested before engine...","[-0.03536468, 0.08948276, 0.033056, 0.00270799...","[[0.022350715, -0.019383874, -0.0065564965, 0...."
2,Tx3PT0N7PHAZ6I0,Is this kit supposed to include a cover for th...,B00E63VX26,Vehicle tested before engineering approval,Ease of installation with Super-Grip slip resi...,"Light fast, color fast materials will not run ...",Form fit wrap around design - multidirectional...,Center armrest/console covers and headrest cov...,,Fia,Fia TR49-39 BLACK Custom Fit Front Seat Cover ...,yes-no,no,[{'answer_text': 'No it does not cover center ...,"[0.0033588856, 0.057596724, 0.026277171, -0.02...","{""bullet_point1"":""Vehicle tested before engine...","[-0.0337851, 0.08812204, 0.03299938, 0.0012121...","[[0.020091904, 0.011482883, 0.029666347, -0.09..."
3,Tx2LZW18Z5P2ZDH,Does the seat cover stay in place when getting...,B00E63VX26,Vehicle tested before engineering approval,Ease of installation with Super-Grip slip resi...,"Light fast, color fast materials will not run ...",Form fit wrap around design - multidirectional...,Center armrest/console covers and headrest cov...,,Fia,Fia TR49-39 BLACK Custom Fit Front Seat Cover ...,yes-no,neutral,"[{'answer_text': 'no, they are very secure.'},...","[0.02471458, 0.08257935, 0.0008352714, 0.03235...","{""bullet_point1"":""Vehicle tested before engine...","[-0.0337851, 0.08812204, 0.03299938, 0.0012121...","[[0.0064587016, 0.05132395, -0.057384215, 0.01..."
4,Tx1MIMQEE2OCTBP,I made a mistake on size is there any way to a...,B014LMMAY6,Amorbilt - Condor Flex Cap Bundle offers you: ...,Moisture Wicking Elastic Head Band draws sweat...,Two Armorbilt High Quality Hook & Loop Patches...,Front loop patch panel with Embossed Condor Lo...,Additional Features - 6 fabric panels - Embroi...,Condor Flex cap bundled with Armorbilt patches...,,Condor Flex Tactical Cap (Black Small) Bundle ...,WH,,[{'answer_text': 'Your best option is to send ...,"[0.037850633, 0.10897374, 0.0068275845, 0.0261...","{""bullet_point1"":""Amorbilt - Condor Flex Cap B...","[-0.03869734, 0.06244869, -0.03304579, 0.00483...","[[0.0061499244, 0.0141396765, 0.094967134, -0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,TxCGXY5X3OLU3O,Hi I have a heat only system with 2 wires. Wil...,B00755BZZC,Model number: ACC-0436,Country of Origin: China,No batteries required,Brand name: Venstar,,Don't get stuck with a tangle of wires thanks ...,,Venstar ACC0436 2-Wire Kit for all 24VAC Therm...,yes-no,neutral,[{'answer_text': 'This does work with a Nest a...,"[-0.012854539, -0.007450397, -0.0881916, -0.00...","{""bullet_point1"":""Model number: ACC-0436"",""bul...","[-0.09886482, -0.03026899, -0.047745757, 0.029...","[[-0.077236824, 0.047493123, -0.016157381, -0...."
996,Tx6UR717H6H1TC,How long is the wire on the transformer?,B00755BZZC,Model number: ACC-0436,Country of Origin: China,No batteries required,Brand name: Venstar,,Don't get stuck with a tangle of wires thanks ...,,Venstar ACC0436 2-Wire Kit for all 24VAC Therm...,WH,,"[{'answer_text': '5ft'}, {'answer_text': 'The ...","[-0.055789582, 0.11323771, -0.034521278, -0.01...","{""bullet_point1"":""Model number: ACC-0436"",""bul...","[-0.098936416, -0.029081164, -0.049533647, 0.0...","[[0.0035963915, 0.039748736, 0.0046752854, -0...."
997,TxE1MVNKL0IQF7,Does this cover the center flip down backrest/...,B000I5W2VC,Vehicle tested before engineering approval,Ease of installation with Super-Grip slip resi...,"Light fast, color fast materials will not run ...",Form fit wrap around design - multidirectional...,Center armrest/console covers and head rest co...,,Fia,Fia TR48-17 GRAY Custom Fit Front Seat Cover S...,yes-no,neutral,[{'answer_text': 'Part number TR48-17 Gray is ...,"[0.07295051, 0.038040716, 0.02552596, 0.006629...","{""bullet_point1"":""Vehicle tested before engine...","[-0.02616747, 0.0897499, 0.044964284, 0.003270...","[[0.029762179, 0.064092115, 0.11183441, -0.023..."
998,Tx2U9ZWXJXKYYWU,Is this a set of 8?,B000IZ5EF8,"Connectors allow for more efficient, less cost...",Protective outer coverings protect against eve...,GM-recommended replacement part for your GM ve...,"Offering the quality, reliability, and durabil...","Manufactured to GM OE specification for fit, f...",,ACDelco,ACDelco PT2135 GM Original Equipment Black Mul...,yes-no,no,"[{'answer_text': 'Only 1'}, {'answer_text': 'N...","[0.035148457, 0.01949776, 0.010930545, 0.03625...","{""bullet_point1"":""Connectors allow for more ef...","[-0.06885721, -0.0019001155, 0.0075429613, -0....","[[-0.01403691, -0.09062717, -0.050095078, -0.0..."
