### E-Commerce Multisource Review Product Analysis & QA

##### **Importing key libraries**

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from dotenv import load_dotenv 
from datetime import datetime
import cohere
import faiss
from tqdm import tqdm
import praw

import re

import spacy
import torch
import asyncio
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio

import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline, AutoModelForSequenceClassification, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration, OpenAI

from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

import openai
import deepseek

from datasets import load_dataset

from googleapiclient.discovery import build
import time

import sys
import os
from pathlib import Path
# Make the parent of the launch directory importable.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

# The relative data paths used later in the notebook (e.g. "data/amzn_phones_accessories/")
# are resolved against the working directory set here.
# Set the ECOMMERCE_REVIEW_PROJECT_DIR environment variable to run the notebook from another
# checkout; when it is unset the original author's location is used.
DEFAULT_PROJECT_DIR = "/Users/emmanueladeleye/Documents/DS Projects/E-Commerce-Multisource-Review-Analaysis-Product-QA"
os.chdir(os.environ.get("ECOMMERCE_REVIEW_PROJECT_DIR", DEFAULT_PROJECT_DIR))

from src.scraper import *
from src.product_review import *

##### **Getting Started With Amazon**

In [2]:
# This project is scoped to tech and electronics products and prototyped on a small dataset.
# The dataset config can be switched to a larger one, depending on available memory.
# Other configs include: 'raw_meta_Electronics' [5+GB], 'raw_meta_Cell_Phones_and_Accessories' [4+GB]

folder_path = "data/amzn_phones_accessories/"

# List every .parquet file in the folder.
# os.listdir() returns entries in arbitrary order, so sort to keep the merged row order
# stable across runs and machines.
parquet_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))

dfs = []
# Read each parquet file into its own DataFrame
for file in parquet_files:
    file_path = os.path.join(folder_path, file)
    df_2023 = pd.read_parquet(file_path, engine='pyarrow')

    dfs.append(df_2023)
# Merge all DataFrames into one
merged_df_2023 = pd.concat(dfs, ignore_index=True)
print(f'Merged {len(parquet_files)} files with {merged_df_2023.shape[0]} rows.')

Merged 6 files with 1104420 rows.


In [3]:
# Dataset directory: https://www.kaggle.com/datasets/cynthiarempel/amazon-us-customer-reviews-dataset/data

folder_path = "data/amzn_tech_electronics/"

# List every .tsv file in the folder.
# os.listdir() returns entries in arbitrary order; sorting keeps the merged row order stable,
# which the seeded sampling further down relies on to be reproducible.
tsv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.tsv'))

dfs = []

# Read each tab-separated file; on_bad_lines='skip' skips malformed lines instead of raising
for file in tsv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, sep='\t',  encoding='utf-8', on_bad_lines='skip')
    dfs.append(df)

# Merge all DataFrames into one
merged_df = pd.concat(dfs, ignore_index=True)
print(f'Merged {len(tsv_files)} files with {merged_df.shape[0]} rows.')

Merged 4 files with 5337970 rows.


In [4]:
merged_df.product_category.value_counts()

product_category
Electronics           3091024
Camera                1800845
Software               341249
Mobile_Electronics     104850
2013-02-01                  1
2013-01-06                  1
Name: count, dtype: int64

In [5]:
merged_df.review_date.value_counts()

review_date
2015-01-03    8751
2015-01-05    8526
2014-12-29    8413
2015-01-07    8086
2015-01-04    7865
              ... 
1999-07-11       1
1999-05-02       1
1999-08-14       1
1999-08-11       1
1998-09-21       1
Name: count, Length: 5919, dtype: int64

In [6]:
merged_df['review_date'] = pd.to_datetime(merged_df['review_date'])

In [7]:
merged_df = merged_df[merged_df["review_date"]>="2010-01-01"]

In [8]:
merged_df["star_rating"].value_counts()

star_rating
5.0    2745906
4.0     812670
1.0     527138
3.0     366384
2.0     259014
Name: count, dtype: int64

In [9]:
df = merged_df_2023.copy()

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104420 entries, 0 to 1104419
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   main_category    992332 non-null   object 
 1   title            1104420 non-null  object 
 2   average_rating   1104420 non-null  float64
 3   rating_number    1104420 non-null  int64  
 4   features         1104420 non-null  object 
 5   description      1104420 non-null  object 
 6   price            1104420 non-null  object 
 7   images           1104420 non-null  object 
 8   videos           1104420 non-null  object 
 9   store            1088240 non-null  object 
 10  categories       1104420 non-null  object 
 11  details          1104420 non-null  object 
 12  parent_asin      1104420 non-null  object 
 13  bought_together  0 non-null        object 
 14  subtitle         1772 non-null     object 
 15  author           34 non-null       object 
dtypes: float64(1), int

In [11]:
df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Cell Phones & Accessories,Generic MC0148 Cell Phone Case for iPhone 6 - ...,3.5,3,"[100% Brand new and high quality, Compatible i...","[For iPhone 6 Case, Fashion Design Black High ...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Generic,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""6 x 4 x 4 inches"", ""It...",B00ORO8G7O,,,
1,Cell Phones & Accessories,32nd Designer book wallet PU leather case cove...,2.0,2,[Stratholme],[Stratholme],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",AMZ Original,"[Cell Phones & Accessories, Accessories]","{""Package Dimensions"": ""4.8 x 3.3 x 0.9 inches...",B00L4CJMNY,,,
2,Cell Phones & Accessories,Casemachine i5 Slimline Comp (White / White),2.0,1,"[TightStretch Silicone Polymer Bumper, Injecti...",[The All-New Casemachine i5 Slimline Comp Case...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Casemachine,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Special features"": ""Scratch resistant, Light...",B00IAS44RO,,,
3,Cell Phones & Accessories,"Moto Z2 Play Case, Harryshell Shock Absorption...",4.7,11,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Harryshell,[],"{""Product Dimensions"": ""0.79 x 1.18 x 0.79 inc...",B0747KLTL5,,,
4,Cell Phones & Accessories,"New iPhone 6 Plus 6s Plus (5.5"") Case, Darth V...",5.0,1,[],[],,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",APEX,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Package Dimensions"": ""6.5 x 5 x 0.5 inches"",...",B01BLT1DYK,,,


In [12]:
df["main_category"].value_counts()

main_category
Cell Phones & Accessories       883406
All Electronics                  39799
AMAZON FASHION                   23479
Computers                         6873
Books                             5010
Camera & Photo                    4876
Sports & Outdoors                 4682
Industrial & Scientific           4565
Amazon Home                       4358
Home Audio & Theater              2603
Portable Audio & Accessories      2467
Office Products                   1621
Tools & Home Improvement          1342
Musical Instruments               1247
Automotive                        1217
All Beauty                         935
Health & Personal Care             695
Toys & Games                       587
Arts, Crafts & Sewing              499
Car Electronics                    485
Video Games                        403
Amazon Devices                     281
Baby                               222
GPS & Navigation                   147
Pet Supplies                       147
Grocery    

In [13]:
df[df['main_category']=="Movies & TV"].iloc[0]["categories"]

array(['Cell Phones & Accessories', 'Cases, Holsters & Sleeves',
       'Armbands'], dtype=object)

In [14]:
df_2 = merged_df.copy()

In [15]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4711112 entries, 0 to 5236312
Data columns (total 15 columns):
 #   Column             Dtype         
---  ------             -----         
 0   marketplace        object        
 1   customer_id        int64         
 2   review_id          object        
 3   product_id         object        
 4   product_parent     int64         
 5   product_title      object        
 6   product_category   object        
 7   star_rating        float64       
 8   helpful_votes      float64       
 9   total_votes        float64       
 10  vine               object        
 11  verified_purchase  object        
 12  review_headline    object        
 13  review_body        object        
 14  review_date        datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(2), object(9)
memory usage: 575.1+ MB


In [16]:
df_2.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,2975964,R1NBG94582SJE2,B00I01JQJM,860486164,GoPro Rechargeable Battery 2.0 (HERO3/HERO3+ o...,Camera,5.0,0.0,0.0,N,Y,Five Stars,ok,2015-08-31
1,US,23526356,R273DCA6Y0H9V7,B00TCO0ZAA,292641483,Professional 58mm Center Pinch Lens Cap for CA...,Camera,5.0,0.0,0.0,N,Y,Love it!!!,"Perfect, even sturdier than the original!",2015-08-31
2,US,52764145,RQVOXO7WUOFK6,B00B7733E0,75825744,Spy Tec Z12 Motion Activated Intelligent Secur...,Camera,2.0,1.0,1.0,N,Y,Another Motion Detect Fail,"If the words, &#34;Cheap Chinese Junk&#34; com...",2015-08-31
3,US,47348933,R1KWKSF21PO6HO,B006ZN4U34,789352955,"Celestron UpClose G2 10x25 Monocular, Black (7...",Camera,5.0,0.0,0.0,N,Y,Exactly what I wanted and expected.,Exactly what I wanted and expected. Perfect fo...,2015-08-31
4,US,33680700,R38H3UO1J190GI,B00HUEBGMU,19067902,Vidpro XM-L Wired Lavalier microphone - 20' Au...,Camera,5.0,1.0,1.0,N,Y,Good mic at a Good Price...Not Canon Though.,I will look past the fact that they tricked me...,2015-08-31


In [17]:
df_2.columns, df.columns

(Index(['marketplace', 'customer_id', 'review_id', 'product_id',
        'product_parent', 'product_title', 'product_category', 'star_rating',
        'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_body', 'review_date'],
       dtype='object'),
 Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
        'description', 'price', 'images', 'videos', 'store', 'categories',
        'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
       dtype='object'))

In [18]:
df_2["product_category"].value_counts()

product_category
Electronics           2777481
Camera                1596141
Software               239584
Mobile_Electronics      97906
Name: count, dtype: int64

In [19]:
df[df["rating_number"]==3.0].head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Cell Phones & Accessories,Generic MC0148 Cell Phone Case for iPhone 6 - ...,3.5,3,"[100% Brand new and high quality, Compatible i...","[For iPhone 6 Case, Fashion Design Black High ...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Generic,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""6 x 4 x 4 inches"", ""It...",B00ORO8G7O,,,
18,,Cell Phone Cover - Slim Fit - Hard Shell Plast...,3.7,3,[Compatible with iPhone 6 (NOT PLUS MODELS) Al...,[Don't risk your $800 phone with a cheap $5 ca...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Art Plates,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""5.7 x 2.9 x 0.4 inches...",B00UOFAJLQ,,,
23,Cell Phones & Accessories,6X - Spartan Shields Premium HD Screen Protect...,4.5,3,[Made from the highest quality Easy Install Fi...,"[Compatibility:, Samsung Galaxy S6, Spartan Sh...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Spartan Shield,"[Cell Phones & Accessories, Accessories, Maint...","{""Special features"": ""scratch resistant"", ""Oth...",B00TZ5UTIO,,,
25,,"Compatible with LG Stylo 5 Case, LG Stylo 5 Ph...",5.0,3,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Skyfree,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""6.5 x 3.2 x 0.6 inches...",B07ZKS3SF3,,,
33,Cell Phones & Accessories,Adjustable Cell Phone Holder Desk Stand ‚Äì Fold...,3.3,3,[Mount your Smartphone onto your desk or table...,[Adjustable Cell Phone Holder Desk Stand ‚Äì Fol...,9.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Cellet,"[Cell Phones & Accessories, Accessories, Autom...","{""Product Dimensions"": ""7 x 2.5 x 0.5 inches"",...",B07FK6P1C2,,,


In [20]:
df_2[df_2["star_rating"]==3.0].head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
5,US,30301059,R3NPIFKLR19NQA,B008MW6Y12,597683407,NIX 8 inch Hi-Res Digital Photo Frame with Mot...,Camera,3.0,0.0,0.0,N,Y,"The controls are pretty slow, and I can't get ...","The controls are pretty slow, and I can't get ...",2015-08-31
6,US,28282645,R3MBE6UCH3435E,B00TE8XKIS,35563334,Polaroid ZIP Mobile Printer Parent ASIN,Camera,3.0,8.0,8.0,N,N,"Fun toy for making stickers, but expensive to ...",The printer came in a small fairly plain box w...,2015-08-31
12,US,11438825,R1F4O6WOO2W461,B00X3HIM2U,444991975,Neewer Meike MK-XT1 Battery Grip (Replacement ...,Camera,3.0,3.0,3.0,N,Y,Can't grip the battery.,Same issue as others the battery won't stay la...,2015-08-31
38,US,46588192,R19WH1XJ3P76NV,B00A7AY3WE,183171237,-,Camera,3.0,0.0,0.0,N,Y,Three Stars,"There okay, use them here and there....",2015-08-31
42,US,15899329,R3VBAJTKZG737O,B0046V58Y2,350358427,Wasabi Power Battery and Charger for Canon BP-...,Camera,3.0,0.0,1.0,N,Y,"it works, but no power level indicator.",No power level indicator. not that most 3rd p...,2015-08-31


In [21]:
cat_wgt = df_2["product_category"].value_counts()/len(df_2)

In [22]:
cat_wgt["Electronics"]

0.589559534988767

In [23]:
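# Observation: reviews rated between 3 and 4 usually carry more detail about the product and its sentiment.
# The filter keeps ratings in (3, 4]; star_rating only takes whole values here, so this selects the 4-star reviews.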
df_2015 = df_2[(df_2["star_rating"] > 3) & (df_2["star_rating"] <= 4)]

In [24]:
# Same rationale applied to product-level averages: keep products whose average rating lies in (3, 4].
df_2023 = df[(df["average_rating"] > 3) & (df["average_rating"] <= 4)]

In [25]:
len(df_2023)

395343

In [26]:
# Sampling helper that draws from each category in proportion to its share of the full review dataset
electronics_wgt = cat_wgt["Electronics"]
mobile_elect_wgt = cat_wgt["Mobile_Electronics"]
camera_wgt = cat_wgt["Camera"]
software_wgt = cat_wgt["Software"]

def sampling_data(sample_size, data=None, weights=None, random_state=16):
    """
    Draw a category-stratified sample of reviews.

    For every category, ``int(weight * sample_size)`` rows are sampled without
    replacement from the rows of ``data`` whose ``product_category`` equals that
    category. The per-category samples are concatenated in the order the
    categories appear in ``weights``.

    Parameters
    ----------
    sample_size : int
        Target total number of rows. Because each share is truncated with
        ``int``, the result can be a few rows short of this value.
    data : pandas.DataFrame, optional
        Frame with a ``product_category`` column. Defaults to the notebook-level
        ``df_2015``.
    weights : mapping or pandas.Series, optional
        Category name -> fraction of ``sample_size`` to draw. Defaults to the
        notebook-level weights for Electronics, Mobile_Electronics, Camera and
        Software.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for every category (default 16).

    Returns
    -------
    pandas.DataFrame
        The concatenated samples, keeping the original row index.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a category holds fewer rows than
        requested for it.
    """
    # Fall back to the notebook globals only when the caller gives no override,
    # so the original one-argument call keeps working unchanged.
    if data is None:
        data = df_2015
    if weights is None:
        weights = {
            "Electronics": electronics_wgt,
            "Mobile_Electronics": mobile_elect_wgt,
            "Camera": camera_wgt,
            "Software": software_wgt,
        }

    samples = []
    for category, weight in weights.items():
        # "Mobile_Electronics" -> "mobile electronics" for the progress message
        label = category.replace("_", " ").lower()
        print(f"Extracting {label} sample...")
        category_rows = data[data["product_category"] == category]
        samples.append(
            category_rows.sample(n=int(weight * sample_size), random_state=random_state)
        )
        print("Completed")

    sampled_df = pd.concat(samples, axis=0)

    return sampled_df

In [27]:
new_df = sampling_data(500_000)

Extracting electronics sample...
Completed
Extracting mobile electronics sample...
Completed
Extracting camera sample...
Completed
Extracting software sample...
Completed


In [28]:
new_df["product_category"].value_counts()

product_category
Electronics           294779
Camera                169401
Software               25427
Mobile_Electronics     10390
Name: count, dtype: int64

In [29]:
len(new_df)

499997

In [30]:
def process_in_chunks(df, chunk_size, process_fn, d_type):
    """
    Apply ``process_fn`` to ``df`` in consecutive row chunks and stitch the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is sliced positionally and each slice is copied before
        being handed to ``process_fn``.
    chunk_size : int
        Maximum number of rows per chunk; must be positive.
    process_fn : callable
        Called as ``process_fn(chunk, d_type)`` and expected to return a DataFrame.
    d_type : Any
        Passed through to ``process_fn`` unchanged.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all processed chunks with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = len(df)
    if n_rows == 0:
        # pd.concat([]) raises on an empty list; run the transform once on the
        # empty frame so the result still carries the expected columns.
        return process_fn(df.copy(), d_type).reset_index(drop=True)

    processed_chunks = [
        process_fn(df.iloc[start:start + chunk_size].copy(), d_type)
        for start in range(0, n_rows, chunk_size)
    ]

    return pd.concat(processed_chunks, ignore_index=True)

def add_review_features(df, dtype):
    """
    Build the ``Product_context`` and ``Full_review`` text columns for a review frame.

    Parameters
    ----------
    df : pandas.DataFrame
        For ``dtype == 'old'`` the columns ``product_title``, ``star_rating``,
        ``review_headline`` and ``review_body`` are read; for ``dtype == 'new'``
        the columns ``details``, ``average_rating``, ``title`` and ``description``.
        The input frame is not modified.
    dtype : str
        ``'old'`` or ``'new'``, selecting which column layout to read.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Product_context`` and ``Full_review``, on the same index
        as ``df``.

    Raises
    ------
    ValueError
        If ``dtype`` is neither ``'old'`` nor ``'new'``.
    """
    if dtype == 'old':
        product_context = "Product_Details: " + df["product_title"] + "\nRating: " + df["star_rating"].astype(str)
        full_review = "Product Title:" + df["product_title"] + "Headline:" + df["review_headline"] + "\nDetails + Review: " + df["review_body"]

    elif dtype == 'new':
        product_context = "Product_Details: " + df["details"] + "\nRating: " + df["average_rating"].astype(str)
        # NOTE(review): the notebook output shows Full_review as list-like per row for this
        # layout (it is joined with " ".join afterwards), which suggests `description` holds
        # sequences and the prefix is applied to every element; rows with an empty description
        # then end up empty and lose their title. Confirm whether that is intended.
        full_review = "Headline:" + df["title"] + "\nDetails + Review: " + df["description"]

    else:
        # Previously fell through and returned None, which only failed later inside pd.concat.
        raise ValueError(f"Unknown dtype {dtype!r}; expected 'old' or 'new'")

    # assign() works on a copy, so the caller's frame does not gain the two helper columns.
    return df.assign(Product_context=product_context, Full_review=full_review)[["Product_context", "Full_review"]]

In [31]:
chunk_size=100_000

amzn_2015 = process_in_chunks(new_df, chunk_size, add_review_features, "old")
amzn_2023 = process_in_chunks(df_2023, chunk_size, add_review_features, "new")

In [32]:
len(amzn_2023), len(amzn_2015)

(395343, 499997)

In [33]:
amzn_2023.head()

Unnamed: 0,Product_context,Full_review
0,"Product_Details: {""Product Dimensions"": ""6 x 4...",[Headline:Generic MC0148 Cell Phone Case for i...
1,"Product_Details: {""Package Dimensions"": ""7.56 ...",[Headline:Nokia CC-1005 Skin for N8 - Blue\nDe...
2,"Product_Details: {""Product Dimensions"": ""1.6 x...",[]
3,"Product_Details: {""Product Dimensions"": ""5.7 x...",[Headline:Cell Phone Cover - Slim Fit - Hard S...
4,"Product_Details: {""Product Dimensions"": ""4.7 x...",[Headline:J-Plus Tempered Glass Screen Protect...


In [34]:
# Flatten list-like Full_review entries into one string per row.
# Entries that are already strings are left untouched so that re-running this cell is safe
# (" ".join on a str would otherwise insert a space between every character).
amzn_2023["Full_review"]=amzn_2023["Full_review"].apply(lambda x: x if isinstance(x, str) else " ".join(x))

In [35]:
amzn_2023.head()

Unnamed: 0,Product_context,Full_review
0,"Product_Details: {""Product Dimensions"": ""6 x 4...",Headline:Generic MC0148 Cell Phone Case for iP...
1,"Product_Details: {""Package Dimensions"": ""7.56 ...",Headline:Nokia CC-1005 Skin for N8 - Blue\nDet...
2,"Product_Details: {""Product Dimensions"": ""1.6 x...",
3,"Product_Details: {""Product Dimensions"": ""5.7 x...",Headline:Cell Phone Cover - Slim Fit - Hard Sh...
4,"Product_Details: {""Product Dimensions"": ""4.7 x...",Headline:J-Plus Tempered Glass Screen Protecto...


##### **YouTube & Reddit Modular Scraping**

In [36]:
load_dotenv()
yt_api_key = os.getenv("youtube_api")
youtube = YoutubeScraper("iphone 16 updates", youtube_api_key=yt_api_key, max_comments=100, sort_by="likeCount")

In [37]:
youtube_df = youtube.fetch_data()

Fetching comments for video: 1Yg5GOospcE Title: iPhone 16 / 16 Plus - TIPS, TRICKS & HIDDEN FEATURES!! + iOS 18
Fetching comments for video: W2aDh-Dbjcs Title: iOS 26 vs iOS 26 (2) - Apple‚Äôs listening!
Fetching comments for video: VG1gfHJjoR0 Title: Apple ios 16 vs ios 17 update || which one is better
Fetching comments for video: GJDFn68yVT0 Title: The iPhone 16 Has a BIG Problem
Fetching comments for video: ep2ptxlBqw8 Title: Apple Intelligence in iPhone 16 series #iphone16 #apple #ai


In [38]:
client_secret = os.getenv("client_secret")
client_id = os.getenv("client_id")
user_agent = os.getenv("user_agent")

reddit = RedditScraper(query="iphone 16 updates", client_id=client_id, client_secret=client_secret, user_agent=user_agent, subreddit=["productreview","technology","gadgets","photography","cameras"], max_results=500)

In [39]:
reddit_df = reddit.fetch_data()

Fetching from r/productreview
Fetching from r/technology
Fetching from r/gadgets
Fetching from r/photography
Fetching from r/cameras


##### **Data Consolidation & Integration**

In [40]:
youtube_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Video_ID     460 non-null    object
 1   Video_title  460 non-null    object
 2   Author       460 non-null    object
 3   Text         460 non-null    object
 4   PublishedAt  460 non-null    object
 5   LikeCount    460 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 21.7+ KB


In [41]:
youtube_df['Full_text'] = 'Video Title: ' + youtube_df['Video_title'] + '\n' +' | Comment:' + youtube_df['Text']

In [42]:
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812 entries, 0 to 811
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     812 non-null    object        
 1   review    812 non-null    object        
 2   source    812 non-null    object        
 3   date      812 non-null    datetime64[ns]
 4   url       812 non-null    object        
 5   upvotes   812 non-null    int64         
 6   comments  812 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 44.5+ KB


In [43]:
# Pool the YouTube comment text and the Reddit review text, then keep only
# entries made of more than five whitespace-separated words.
text = pd.concat([youtube_df['Text'], reddit_df['review']])
social_media_corpus = [review for review in text if len(review.split())>5]

In [44]:
social_media_corpus_compiled = ". ".join(social_media_corpus)

In [45]:
social_media_corpus_compiled



##### **Cleaning and preprocessing the data**

In [46]:
### cleaning the corpus
# Compiled once; the character class is the union of the ranges listed below.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # Emoticons
    u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    u"\U0001F680-\U0001F6FF"  # Transport & Map symbols
    u"\U0001F1E0-\U0001F1FF"  # Flags (regional indicators)
    u"\U00002500-\U00002BEF"  # Box drawing, geometric shapes, misc symbols, dingbats, arrows
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"  # Very broad: also covers CJK ideographs and many other scripts
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"  # Every supplementary-plane code point
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"  # Zero-width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # Variation selector-16
    u"\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with every run of characters matched by the emoji pattern deleted."""
    return _EMOJI_PATTERN.sub(r"", text)
    
def clean_corpus(text):

    """
    Clean a single piece of text.

    Steps, in order:
    - return "" for anything that is not a str
    - strip emojis via remove_emojis
    - collapse newlines, tabs and repeated whitespace into single spaces
    - drop brace-delimited snippets ({...}), "href...)" fragments and <...> tags
    - drop URLs starting with "http" or "www."
    - drop every character that is not a word character, whitespace or one of . , ; : ! ? ' " -
    - replace a quote character standing alone between whitespace with a single space
    - collapse whitespace once more and trim both ends
    """
    if not isinstance(text,str):
        return ""

    # Remove emojis
    text = remove_emojis(text)

    # Collapse newlines/tabs first: '.' in the patterns below does not match a newline,
    # so a snippet spread over several lines would otherwise survive.
    text = " ".join(text.split())

    # Remove javascript-like snippets, href fragments and HTML tags
    text = re.sub(r'{.*?}', "", text)
    text = re.sub(r'href.*?\)', "", text)
    text = re.sub(r'<.*?>', "", text)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove special characters
    text = re.sub(r"""[^\w\s.,;:!?'"-]""", '', text)

    # Remove standalone single/double quotes
    text = re.sub(r'\s[\'"]\s',' ', text)

    # The removals above leave doubled spaces where text was cut out; normalise again.
    # This also trims leading/trailing whitespace.
    text = " ".join(text.split())

    return text

def refine_corpus(corpus, min_word_length=5):

    """
    Split ``corpus`` into sentences, clean each one with clean_corpus and keep
    only those with at least ``min_word_length`` whitespace-separated words.

    Kept sentences that do not already end in '.', '?' or '!' get a '.' appended.
    Returns the list of kept sentences in their original order.
    """
    # Split on whitespace that follows '.', '?' or '!', except where the two
    # negative look-behinds match (patterns such as "x.y." or "Xx.").
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    terminators = ('.','?','!')

    kept = []
    for raw_sentence in re.split(sentence_boundary, corpus):
        candidate = clean_corpus(raw_sentence)

        # Too short after cleaning: discard
        if len(candidate.split()) < min_word_length:
            continue

        kept.append(candidate if candidate.endswith(terminators) else candidate + '.')

    return kept

##### **Social Media Data Cleaning**

In [47]:
sm_processed_corpus = refine_corpus(social_media_corpus_compiled)

In [48]:
sm_processed_corpus[:5]

["Don't forget to check some epic accessories from ESR Shop ESR Qi2 MagSafe Battery Pack with Kickstand:  Shop ESR 3-in-1 MagSafe Charger with Qi2  CryoBoost:  Shop ESR MagSafe Car Charger with Qi2  CryoBoost:  esr esrmagsafe esrbatterypack.",
 'Just got mine yesterday, this video is very essential,I love you .',
 'Iphone 16 plus i am using since 8 months 315 cycles battery health suddenly became 93 please give me solution anyone.',
 'I this new iphone 165.',
 'Thanks a lot nice video .']

##### **Amazon Data Processing**

In [49]:
amzn_2023.head()
#amzn_2015.head()

Unnamed: 0,Product_context,Full_review
0,"Product_Details: {""Product Dimensions"": ""6 x 4...",Headline:Generic MC0148 Cell Phone Case for iP...
1,"Product_Details: {""Package Dimensions"": ""7.56 ...",Headline:Nokia CC-1005 Skin for N8 - Blue\nDet...
2,"Product_Details: {""Product Dimensions"": ""1.6 x...",
3,"Product_Details: {""Product Dimensions"": ""5.7 x...",Headline:Cell Phone Cover - Slim Fit - Hard Sh...
4,"Product_Details: {""Product Dimensions"": ""4.7 x...",Headline:J-Plus Tempered Glass Screen Protecto...


In [50]:
def process_data(data):
    """
    Turn the ``Full_review`` and ``Product_context`` columns of ``data`` into
    two lists of cleaned sentences.

    For each column, null entries are dropped and only string entries with more
    than 10 whitespace-separated words are kept. The survivors are joined with
    ". " into one corpus, which is then passed through refine_corpus.

    Returns a tuple ``(cleaned_reviews, cleaned_product_contexts)``.
    """

    def _long_texts(column):
        # String entries of `column` with more than 10 words, nulls excluded
        return [
            entry
            for entry in data[column].dropna()
            if isinstance(entry, str) and len(entry.split())>10
        ]

    review_corpus = ". ".join(_long_texts("Full_review"))
    product_corpus = ". ".join(_long_texts("Product_context"))

    cleaned_amzn_review = refine_corpus(review_corpus)
    cleaned_amzn_prod = refine_corpus(product_corpus)

    return cleaned_amzn_review, cleaned_amzn_prod

In [51]:
amzn_review_data_2023, amzn_prod_data_2023 = process_data(amzn_2023)

In [52]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openai

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
#from langchain.output_parsers import OutputParserException

##### **Modelling**

In [53]:
def _parser_and_instructions(schemas):
    """Return ``(parser, format_instructions)`` built from a list of ResponseSchema objects."""
    parser = StructuredOutputParser.from_response_schemas(schemas)
    return parser, parser.get_format_instructions()


# Stage 1: social media sentiment (summary bullets + sentiment split)
response_schemas_1 = [
    ResponseSchema(
        name="Social media sentiment",
        description='3 bullet points summarizing major highlights about this product',
    ),
    ResponseSchema(
        name="Sentiment scores (%)",
        description='Positive-score|Negative-score|Neutral-score',
    ),
]
output_parser_1, format_instructions_1 = _parser_and_instructions(response_schemas_1)

# Stage 2: Amazon review sentiment (same shape as stage 1, different summary key)
response_schemas_2 = [
    ResponseSchema(
        name='Amazon review sentiment',
        description='3 bullet points summarizing major highlights about this product',
    ),
    ResponseSchema(
        name="Sentiment scores (%)",
        description='Positive-score|Negative-score|Neutral-score',
    ),
]
output_parser_2, format_instructions_2 = _parser_and_instructions(response_schemas_2)

# Stage 3: final recommendation
response_schemas_3 = [
    ResponseSchema(
        name="Recommendation",
        description='3 detailed recommendations for minimalist, balanced and maximalist preferences',
    ),
]
output_parser_3, format_instructions_3 = _parser_and_instructions(response_schemas_3)

In [54]:
# Read the OpenAI API key from the `openai_api` environment variable.
# os.getenv returns None when the variable is unset, so a missing key is not reported at this point.
# NOTE(review): presumably the variable is loaded from a .env file via load_dotenv() — confirm it is called earlier.
openai_key = os.getenv("openai_api")

##### **Use cases**

##### **2023 Data**

In [55]:
# Camera use case: `state` lists the social-media, Amazon and final stages, and each
# stage gets its own format instructions and output parser.
product_pipeline = ProductReview(
    "Cameras",
    "Best camera option for a big event",
    amzn_review_data_2023,
    amzn_prod_data_2023,
    sm_data=sm_processed_corpus,
    state=["social_media", "amazon", "final"],
    format_instructions_1=format_instructions_1,
    format_instructions_2=format_instructions_2,
    format_instructions_3=format_instructions_3,
    openai_key=openai_key,
    output_parser_1=output_parser_1,
    output_parser_2=output_parser_2,
    output_parser_3=output_parser_3,
)

In [56]:
# Run the configured pipeline; the cell output shows the progress log followed by the combined result dict.
product_pipeline.run_models()

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 681

Modelling Social Media Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 3169

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

------------------------------

{'Social media sentiment': ['- Users are looking for a camera that can take quality photos and has the option for pre-edited JPG files or in-camera film recipes',
  '- Some users are comparing the option of buying a $700-800 camera versus using an iPhone 16 Pro Max for their needs',
  '- Event hashtags are mentioned as a way to organize and share photos from big events'],
 'Sentiment scores (%)': '60|20(-)|20',
 'Amazon review sentiment': ['- Customers praise the extendable selfie monopod stick for smartphones, digital cameras, and action cameras',
  '- Complaints about the caution mentioned in the telescoping extension pole review',
  '- Positive feedback on the ST-75 extendable telescopic handheld monopod for GoPro Hero cameras'],
 'Recommendation': ['- DOOGEE S95 Pro is a great choice for a big event due to its rugged design, waterproof features, and high-quality camera with AI capabilities.',
  '- TL Prince PopSockets can also be a helpful accessory for capturing photos at events w

In [None]:
# Phone use case.
# NOTE(review): the other ProductReview calls in this notebook pass two leading strings
# (a short query and a longer sub-query) before the review data; this call passes only one,
# so the positional arguments may be shifted. It also omits the format-instruction, parser and
# API-key arguments that the camera example passes. Confirm against the current ProductReview signature.
product_pipeline = ProductReview("Looking for a good phone with quality camera, storage and processor", amzn_review_data_2023, amzn_prod_data_2023, sm_data = sm_processed_corpus, 
                                 state = ["social_media", "amazon", "final"],)

In [186]:
# Run the pipeline configured in the previous cell.
# NOTE(review): the saved output below mentions cameras and headphones rather than phones — re-run on a fresh kernel to confirm it matches the query.
product_pipeline.run_models()

First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 897

Modelling Social Media Sentiment...
Result Generation Completed 
Result: {
    "Social media sentiment": [
        "- Looking for a relatively cheap camera with clear quality for travel photos and posts.",
        "- Sometimes unable to connect to the phone via wi-fi.",
        "- Looking for suggestions for a new camera and how to take good sky pics."
    ],
    "Sentiment scores (%)": "0|0|100"
}
Now Parsing...
Parsing completed
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 815

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result: {
    "Amazon review sentiment": [
        "- Good quality sound on inexpensive headphone",
        "- Very fast processor with LGA2011 socket",
        "- Comes with good quality storage bag and instructi

{'Social media sentiment': ['- Looking for a relatively cheap camera with clear quality for travel photos and posts.',
  '- Sometimes unable to connect to the phone via wi-fi.',
  '- Looking for suggestions for a new camera and how to take good sky pics.'],
 'Sentiment scores (%)': '70|0|30',
 'Amazon review sentiment': ['- Good quality sound on inexpensive headphone',
  '- Very fast processor with LGA2011 socket',
  '- Comes with good quality storage bag and instruction sheet'],
 'Recommendation': 'Consider purchasing the Sony DSXS300BTX digital media receiver for good quality sound and Bluetooth connectivity'}

In [188]:
# iPhone use case.
# NOTE(review): `amzn_review_data` / `amzn_prod_data` (no `_2023` suffix) are not assigned in the
# surrounding cells — confirm they are defined upstream. Only one leading string is passed here,
# unlike the (query, sub_query) calls elsewhere in this notebook — confirm against the current
# ProductReview signature.
product_pipeline = ProductReview("Should I get an iphone 6 or something better",amzn_reviews=amzn_review_data, amzn_prod_details=amzn_prod_data)

In [190]:
# Run the pipeline configured in the previous cell; the output shows the progress log and the result dict.
product_pipeline.run_models()

First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 615

Modelling Social Media Sentiment...
Result Generation Completed 
Result: {
    "Social media sentiment": [
        "- Excited about honeymoon in Europe",
        "- Considering upgrading to iPhone 13 Pro Max",
        "- Unsure about camera options for photography"
    ],
    "Sentiment scores (%)": "50|10|40"
}
Now Parsing...
Parsing completed
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 587

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result: {
    "Amazon review sentiment": [
        "- Convenient to connect to iPhone or laptop",
        "- Better for using on the job than an iPhone or iPod touch",
        "- Some issues with fitting in an iPhone 6 lifeproof case port"
    ],
    "Sentiment scores (%)": "60|10|30"
}
Now Parsing..

{'Social media sentiment': ['- Excited about honeymoon in Europe',
  '- Considering upgrading to iPhone 13 Pro Max',
  '- Unsure about camera options for photography'],
 'Sentiment scores (%)': '60|10|30',
 'Amazon review sentiment': ['- Convenient to connect to iPhone or laptop',
  '- Better for using on the job than an iPhone or iPod touch',
  '- Some issues with fitting in an iPhone 6 lifeproof case port'],
 'Recommendation': 'Upgrade to iPhone 13 Pro Max for better camera options and connectivity'}

##### 2023 Data

In [173]:
# Laptop use case on the 2023 Amazon corpora: short product query plus a detailed sub-query.
query = "Laptop"
sub_query = "What kind of laptop should I buy for someone interested in high processing power and graphics for gaming"
product_pipeline = ProductReview(
    query,
    sub_query,
    amzn_reviews=amzn_review_data_2023,
    amzn_prod_details=amzn_prod_data_2023,
)

In [175]:
# Run the laptop pipeline; the output shows the progress log followed by the combined result dict.
product_pipeline.run_models()

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 1264

Modelling Social Media Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 1128

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

-----------------------------

{'Social media sentiment': ['- Users appreciate the high processing power and RAM of the 16 Pro for gaming',
  "- Some users are frustrated with the laptop's inability to accept certain flash cards",
  '- There are conflicting explanations from representatives regarding optical zoom capabilities'],
 'Sentiment scores (%)': '60|30|10',
 'Amazon review sentiment': ['- The laptop offers high processing power and graphics performance, ideal for gaming enthusiasts',
  '- Users praise the GPU Turbo feature for boosting graphics efficiency and stability during gaming',
  '- Some customers mention concerns about overheating during prolonged gaming sessions'],
 'Recommendation': ['- Surface Laptop Studio is suitable because it offers high processing power and graphics performance, ideal for gaming enthusiasts.',
  '- HP Envy X360 is also a good fit due to its powerful GPU Turbo feature for boosting graphics efficiency and stability during gaming.']}

In [176]:
# Digital-camera use case on the 2023 Amazon corpora: short product query plus a detailed sub-query.
query = "Digital Camera"
sub_query = "Looking for a techy event camera (not mobile) that I can sync nicely with other gadgets"
product_pipeline = ProductReview(
    query,
    sub_query,
    amzn_reviews=amzn_review_data_2023,
    amzn_prod_details=amzn_prod_data_2023,
)

In [177]:
# Run the digital-camera pipeline; the output shows the progress log followed by the combined result dict.
product_pipeline.run_models()

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 465

Modelling Social Media Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 2164

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

------------------------------

{'Social media sentiment': ['- Users appreciate the digital stabilization feature for clear quality travel photos',
  '- Users are looking for a durable camera that can withstand day trips and nights out',
  '- Users desire a camera that can handle sunlight well'],
 'Sentiment scores (%)': '60|20|20',
 'Amazon review sentiment': ['- Customers praise the Panasonic LUMIX DMC-TS25 Digital Camera for its durability and waterproof features',
  "- Some users mention issues with the camera's image quality in low light conditions",
  '- The compact camera case receives positive feedback for its protective design and included hand strap'],
 'Recommendation': ['- The Panasonic LUMIX DMC-TS25 Digital Camera is a great fit for your needs as it offers durability and waterproof features, making it suitable for techy events and syncing with other gadgets.',
  '- Another option to consider is the Samsung Galaxy S23 Ultra, known for its high-quality photographic capabilities and compatibility with othe

In [178]:
# Charger use case on the 2023 Amazon corpora: short product query plus a detailed sub-query.
query = "Charger"
sub_query = "Super fast charger"
product_pipeline = ProductReview(
    query,
    sub_query,
    amzn_reviews=amzn_review_data_2023,
    amzn_prod_details=amzn_prod_data_2023,
)

In [179]:
# Run the charger pipeline; the output shows the progress log followed by the combined result dict.
product_pipeline.run_models()

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 1017

Modelling Social Media Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 4918

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

-----------------------------

{'Social media sentiment': ["- Users praise the fast charging capabilities of Ugreen's Nexode 140W Charger",
  '- Some users express excitement about the innovative features of ESR MagSafe accessories',
  '- One user mentions being new to using non-iPhone devices and questions the speed of their USB 2 reader'],
 'Sentiment scores (%)': '80|10|10',
 'Amazon review sentiment': ['- Customers praise the BoxWave Charger for its rapid-charging capabilities, allowing for quick use of devices',
  '- The charger is commended for its compatibility with both North American and international voltage standards, making it ideal for travelers',
  '- Users appreciate the green LED indicator that ensures proper charging of devices'],
 'Recommendation': ["- The Ugreen's Nexode 140W Charger is a great choice for a super fast charger, as users praise its fast charging capabilities.",
  '- The BoxWave Charger is another excellent option known for its rapid-charging capabilities and compatibility with inter

----------------------------