### E-Commerce Multisource Review Product Analysis & QA

##### Importing key libraries

In [20]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import requests
from dotenv import load_dotenv 
from datetime import datetime
import cohere
import faiss
from tqdm import tqdm
import praw

import spacy
import torch
import asyncio
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio

import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline, AutoModelForSequenceClassification, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration, OpenAI

from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

import openai
import deepseek

from datasets import load_dataset

from googleapiclient.discovery import build
import pandas as pd
import time

##### Getting Started With Amazon

In [31]:
#This project would be niched to tech and electronics product with a more prototyped focus on a small dataset. This confif paramater can be adjusted to larger dataset based on available memory
#Other configs include: 'raw_meta_Electronics' [5+GB], 'raw_meta_Cell_Phones_and_Accessories' [4+GB]

folder_path = "/Users/emmanueladeleye/Downloads/LLM Projects/E-Commerce/Amazon-Phone-Accessories '23/"
# Creating a list of all files that ends with .parquet

parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]

dfs = []
# Looping through each tsv file and reading each one
for file in parquet_files:
    file_path = os.path.join(folder_path, file)
    df_2023 = pd.read_parquet(file_path, engine='pyarrow')

    dfs.append(df_2023)
# Merging all dataframes    
merged_df_2023 = pd.concat(dfs, ignore_index=True)
print(f'Merged {len(parquet_files)} files with {merged_df_2023.shape[0]} rows.')

Merged 6 files with 1104420 rows.


In [25]:
# Dataset directory: https://www.kaggle.com/datasets/cynthiarempel/amazon-us-customer-reviews-dataset/data

folder_path = "/Users/emmanueladeleye/Downloads/LLM Projects/E-Commerce/Amazon-Tech-Electronics '15/"

# Creating a list of all files that ends with .tsv
tsv_files = [f for f in os.listdir(folder_path) if f.endswith('.tsv')]

dfs = []

# Looping through each tsv file and reading each one
for file in tsv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, sep='\t',  encoding='utf-8', on_bad_lines='skip')
    dfs.append(df)

# Merging all dataframes    
merged_df = pd.concat(dfs, ignore_index=True)
print(f'Merged {len(tsv_files)} files with {merged_df.shape[0]} rows.')

Merged 4 files with 5337970 rows.


In [33]:
merged_df.product_category.value_counts()

product_category
Electronics           3091024
Camera                1800845
Software               341249
Mobile_Electronics     104850
2013-02-01                  1
2013-01-06                  1
Name: count, dtype: int64

In [35]:
merged_df.review_date.value_counts()

review_date
2015-01-03    8751
2015-01-05    8526
2014-12-29    8413
2015-01-07    8086
2015-01-04    7865
              ... 
1999-07-11       1
1999-05-02       1
1999-08-14       1
1999-08-11       1
1998-09-21       1
Name: count, Length: 5919, dtype: int64

In [36]:
merged_df['review_date'] = pd.to_datetime(merged_df['review_date'])

In [38]:
merged_df = merged_df[merged_df["review_date"]>="2010-01-01"]

In [40]:
merged_df["star_rating"].value_counts()

star_rating
5.0    2745906
4.0     812670
1.0     527138
3.0     366384
2.0     259014
Name: count, dtype: int64

In [41]:
df = merged_df_2023.copy()

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104420 entries, 0 to 1104419
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   main_category    992332 non-null   object 
 1   title            1104420 non-null  object 
 2   average_rating   1104420 non-null  float64
 3   rating_number    1104420 non-null  int64  
 4   features         1104420 non-null  object 
 5   description      1104420 non-null  object 
 6   price            1104420 non-null  object 
 7   images           1104420 non-null  object 
 8   videos           1104420 non-null  object 
 9   store            1088240 non-null  object 
 10  categories       1104420 non-null  object 
 11  details          1104420 non-null  object 
 12  parent_asin      1104420 non-null  object 
 13  bought_together  0 non-null        object 
 14  subtitle         1772 non-null     object 
 15  author           34 non-null       object 
dtypes: float64(1), int

In [43]:
df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Cell Phones & Accessories,Generic MC0148 Cell Phone Case for iPhone 6 - ...,3.5,3,"[100% Brand new and high quality, Compatible i...","[For iPhone 6 Case, Fashion Design Black High ...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Generic,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""6 x 4 x 4 inches"", ""It...",B00ORO8G7O,,,
1,Cell Phones & Accessories,32nd Designer book wallet PU leather case cove...,2.0,2,[Stratholme],[Stratholme],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",AMZ Original,"[Cell Phones & Accessories, Accessories]","{""Package Dimensions"": ""4.8 x 3.3 x 0.9 inches...",B00L4CJMNY,,,
2,Cell Phones & Accessories,Casemachine i5 Slimline Comp (White / White),2.0,1,"[TightStretch Silicone Polymer Bumper, Injecti...",[The All-New Casemachine i5 Slimline Comp Case...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Casemachine,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Special features"": ""Scratch resistant, Light...",B00IAS44RO,,,
3,Cell Phones & Accessories,"Moto Z2 Play Case, Harryshell Shock Absorption...",4.7,11,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Harryshell,[],"{""Product Dimensions"": ""0.79 x 1.18 x 0.79 inc...",B0747KLTL5,,,
4,Cell Phones & Accessories,"New iPhone 6 Plus 6s Plus (5.5"") Case, Darth V...",5.0,1,[],[],,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",APEX,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Package Dimensions"": ""6.5 x 5 x 0.5 inches"",...",B01BLT1DYK,,,


In [44]:
df["main_category"].value_counts()

main_category
Cell Phones & Accessories       883406
All Electronics                  39799
AMAZON FASHION                   23479
Computers                         6873
Books                             5010
Camera & Photo                    4876
Sports & Outdoors                 4682
Industrial & Scientific           4565
Amazon Home                       4358
Home Audio & Theater              2603
Portable Audio & Accessories      2467
Office Products                   1621
Tools & Home Improvement          1342
Musical Instruments               1247
Automotive                        1217
All Beauty                         935
Health & Personal Care             695
Toys & Games                       587
Arts, Crafts & Sewing              499
Car Electronics                    485
Video Games                        403
Amazon Devices                     281
Baby                               222
GPS & Navigation                   147
Pet Supplies                       147
Grocery    

In [45]:
df[df['main_category']=="Movies & TV"].iloc[0]["categories"]

array(['Cell Phones & Accessories', 'Cases, Holsters & Sleeves',
       'Armbands'], dtype=object)

In [46]:
df_2 = merged_df.copy()

In [47]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4711112 entries, 0 to 5236312
Data columns (total 15 columns):
 #   Column             Dtype         
---  ------             -----         
 0   marketplace        object        
 1   customer_id        int64         
 2   review_id          object        
 3   product_id         object        
 4   product_parent     int64         
 5   product_title      object        
 6   product_category   object        
 7   star_rating        float64       
 8   helpful_votes      float64       
 9   total_votes        float64       
 10  vine               object        
 11  verified_purchase  object        
 12  review_headline    object        
 13  review_body        object        
 14  review_date        datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(2), object(9)
memory usage: 575.1+ MB


In [48]:
df_2.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,2975964,R1NBG94582SJE2,B00I01JQJM,860486164,GoPro Rechargeable Battery 2.0 (HERO3/HERO3+ o...,Camera,5.0,0.0,0.0,N,Y,Five Stars,ok,2015-08-31
1,US,23526356,R273DCA6Y0H9V7,B00TCO0ZAA,292641483,Professional 58mm Center Pinch Lens Cap for CA...,Camera,5.0,0.0,0.0,N,Y,Love it!!!,"Perfect, even sturdier than the original!",2015-08-31
2,US,52764145,RQVOXO7WUOFK6,B00B7733E0,75825744,Spy Tec Z12 Motion Activated Intelligent Secur...,Camera,2.0,1.0,1.0,N,Y,Another Motion Detect Fail,"If the words, &#34;Cheap Chinese Junk&#34; com...",2015-08-31
3,US,47348933,R1KWKSF21PO6HO,B006ZN4U34,789352955,"Celestron UpClose G2 10x25 Monocular, Black (7...",Camera,5.0,0.0,0.0,N,Y,Exactly what I wanted and expected.,Exactly what I wanted and expected. Perfect fo...,2015-08-31
4,US,33680700,R38H3UO1J190GI,B00HUEBGMU,19067902,Vidpro XM-L Wired Lavalier microphone - 20' Au...,Camera,5.0,1.0,1.0,N,Y,Good mic at a Good Price...Not Canon Though.,I will look past the fact that they tricked me...,2015-08-31


In [49]:
df_2["product_category"].value_counts()

product_category
Electronics           2777481
Camera                1596141
Software               239584
Mobile_Electronics      97906
Name: count, dtype: int64

In [None]:
df[df["star_rating"]==3.0].head()

In [19]:
df_2[df_2["star_rating"]==3.0].head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
5,US,30301059,R3NPIFKLR19NQA,B008MW6Y12,597683407,NIX 8 inch Hi-Res Digital Photo Frame with Mot...,Camera,3.0,0.0,0.0,N,Y,"The controls are pretty slow, and I can't get ...","The controls are pretty slow, and I can't get ...",2015-08-31
6,US,28282645,R3MBE6UCH3435E,B00TE8XKIS,35563334,Polaroid ZIP Mobile Printer Parent ASIN,Camera,3.0,8.0,8.0,N,N,"Fun toy for making stickers, but expensive to ...",The printer came in a small fairly plain box w...,2015-08-31
12,US,11438825,R1F4O6WOO2W461,B00X3HIM2U,444991975,Neewer Meike MK-XT1 Battery Grip (Replacement ...,Camera,3.0,3.0,3.0,N,Y,Can't grip the battery.,Same issue as others the battery won't stay la...,2015-08-31
38,US,46588192,R19WH1XJ3P76NV,B00A7AY3WE,183171237,-,Camera,3.0,0.0,0.0,N,Y,Three Stars,"There okay, use them here and there....",2015-08-31
42,US,15899329,R3VBAJTKZG737O,B0046V58Y2,350358427,Wasabi Power Battery and Charger for Canon BP-...,Camera,3.0,0.0,1.0,N,Y,"it works, but no power level indicator.",No power level indicator. not that most 3rd p...,2015-08-31


In [14]:
cat_wgt = df_2["product_category"].value_counts()/len(df_2)

In [21]:
cat_wgt["Electronics"]

0.589559534988767

In [15]:
# Observation shows that ratings between 3-4 usually contain more details about products and its sentiment
df = df_2[(df_2["star_rating"] > 3) & (df_2["star_rating"] <= 4)]

In [50]:
# Observation shows that ratings between 3-4 usually contain more details about products and its sentiment
df_2023 = df[(df["average_rating"] > 3) & (df["average_rating"] <= 4)]

In [53]:
len(df_2023)

395343

In [16]:
# Creating a sample function that helps generate a close distribution of categories to the population
electronics_wgt = cat_wgt["Electronics"]
mobile_elect_wgt = cat_wgt["Mobile_Electronics"]
camera_wgt = cat_wgt["Camera"]
software_wgt = cat_wgt["Software"]

def sampling_data(sample_size):

    print("Extracting electronics sample...")
    electronics_category = df[df["product_category"]=="Electronics"].sample(n=int(electronics_wgt*sample_size), random_state=16)
    print("Completed")

    print("Extracting mobile electronics sample...")
    mobile_elect_category = df[df["product_category"]=="Mobile_Electronics"].sample(n=int(mobile_elect_wgt*sample_size), random_state=16)
    print("Completed")

    print("Extracting camera sample...")
    camera_category = df[df["product_category"]=="Camera"].sample(n=int(camera_wgt*sample_size), random_state=16)
    print("Completed")

    print("Extracting software sample...")
    software_category = df[df["product_category"]=="Software"].sample(n=int(software_wgt*sample_size), random_state=16)
    print("Completed")
    
    sampled_df = pd.concat([electronics_category, mobile_elect_category, camera_category, software_category], axis=0)

    return sampled_df

In [17]:
new_df = sampling_data(500_000)

Extracting electronics sample...
Completed
Extracting mobile electronics sample...
Completed
Extracting camera sample...
Completed
Extracting software sample...
Completed


In [25]:
new_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
3176462,US,38242941,R3H44H40O9GAOP,B008JGR9MO,935496572,OontZ Angle Bluetooth Speaker [ORIGIINAL] Ultr...,Electronics,4.0,0.0,0.0,N,Y,Connected easily with my galaxy note,The oontz angle exceeded my expectations in so...,2014-07-16
4091686,US,50886668,R5P0YT0Z6RKPJ,B002QPQNLC,5700079,iLuv iSP110 Portable Amplified Stereo Speaker ...,Electronics,4.0,0.0,0.0,N,Y,Solid product,I use these speakers to play music while I'm w...,2013-01-06
2802779,US,18214754,R1Q2MNEYCM0PR6,B001DJVQQE,101930743,Atlantic 63712035 Nestable 52 DVD/BluRay Games...,Electronics,4.0,0.0,0.0,N,Y,Four Stars,Nice stand. Good for holding DVDs and blurays.,2014-12-07
2183290,US,51624294,R2ML5F53AZ9E5Z,B004AOHMJY,860656485,AC Adapter Power Cord charger DELL Studio PP31...,Electronics,4.0,0.0,0.0,N,Y,Four Stars,Great replacement power cord. Fit and worked l...,2015-06-05
3032884,US,27228524,R8T8UXKFF8EVT,B00E4OFYKY,90387457,Waterproof Bluetooth Shower Speaker Splash Tun...,Electronics,4.0,0.0,0.0,N,Y,Not loud enough,"I really like having music in my shower, but t...",2014-09-09


In [26]:
new_df["product_category"].value_counts()

product_category
Electronics           294779
Camera                169401
Software               25427
Mobile_Electronics     10390
Name: count, dtype: int64

In [27]:
len(new_df)

499997

In [28]:
#df = df.sort_values(by="average_rating", ascending=True)
#df = df[(df["average_rating"] >= 3) & (df["average_rating"] <= 4)]
#df = df.sample(n=100_000, random_state=42)  # or adjust as needed

In [29]:
#df['main_category'].value_counts()

In [30]:
#len(df)

In [31]:
#merged_df_2023['main_category'].value_counts()

In [59]:
def process_in_chunks(df, chunk_size, process_fn, d_type):
    chunks = []
    n=len(df)
    for start in range(0, n, chunk_size):
        end = start + chunk_size
        chunk = df.iloc[start:end].copy()
        processed = process_fn(chunk, d_type)
        chunks.append(processed)

    return pd.concat(chunks, ignore_index=True)    

def add_review_features(df, dtype):

    if dtype == 'old':
        df["Product_context"] = "Product_Details: " + df["product_title"] + "\nRating: " + df["star_rating"].astype(str)
        df["Full_review"] = "Product Title:" + df["product_title"] + "Headline:" + df["review_headline"] + "\nDetails + Review: " + df["review_body"]
    
        return df[["Product_context", "Full_review"]]
    
    elif dtype == 'new':
        df["Product_context"] = "Product_Details: " + df["details"] + "\nRating: " + df["average_rating"].astype(str)
        df["Full_review"] = "Headline:" + df["title"] + "\nDetails + Review: " + df["description"]
    
        return df[["Product_context", "Full_review"]]

In [153]:
chunk_size=100_000

#amzn_old = process_in_chunks(merged_df, chunk_size, add_review_features, "old")
#amzn_2023 = process_in_chunks(df, chunk_size, add_review_features, "new")

amzn_2015 = process_in_chunks(new_df, chunk_size, add_review_features, "old")

In [60]:
amzn_2023 = process_in_chunks(df_2023, chunk_size, add_review_features, "new")

In [265]:
len(amzn_2023), len(amzn_2015)

(339060, 499997)

In [268]:
amzn_2023.head()

Unnamed: 0,Product_context,Full_review
0,"Product_Details: {""Product Dimensions"": ""6 x 4...",[Headline:Generic MC0148 Cell Phone Case for i...
1,"Product_Details: {""Package Dimensions"": ""7.56 ...",[Headline:Nokia CC-1005 Skin for N8 - Blue\nDe...
2,"Product_Details: {""Product Dimensions"": ""1.6 x...",[]
3,"Product_Details: {""Product Dimensions"": ""5.7 x...",[Headline:Cell Phone Cover - Slim Fit - Hard S...
4,"Product_Details: {""Product Dimensions"": ""4.7 x...",[Headline:J-Plus Tempered Glass Screen Protect...


In [65]:
amzn_2023["Full_review"]=amzn_2023["Full_review"].apply(lambda x: " ".join(x))

In [66]:
amzn_2023.head()

Unnamed: 0,Product_context,Full_review
0,"Product_Details: {""Product Dimensions"": ""6 x 4...",Headline:Generic MC0148 Cell Phone Case for iP...
1,"Product_Details: {""Package Dimensions"": ""7.56 ...",Headline:Nokia CC-1005 Skin for N8 - Blue\nDet...
2,"Product_Details: {""Product Dimensions"": ""1.6 x...",
3,"Product_Details: {""Product Dimensions"": ""5.7 x...",Headline:Cell Phone Cover - Slim Fit - Hard Sh...
4,"Product_Details: {""Product Dimensions"": ""4.7 x...",Headline:J-Plus Tempered Glass Screen Protecto...


##### Youtube & Reddit Modular Scrapping

In [68]:
class BaseScraper():
    def __init__(self, query, max_results=100):
        self.query = query
        self.max_results = max_results

    def fetch_data(self):
        raise NotImplementedError("Subclasses must implement this method")

class RedditScraper(BaseScraper):
    def __init__(self, query, client_id, client_secret, user_agent, subreddit, max_results):
        super().__init__(query, max_results)
        self.client_id = client_id
        self.client_secret = client_secret
        self.user_agent = user_agent
        self.subreddit = subreddit if subreddit else ['productreview']

    def _fetch_from_subreddit(self, reddit, subreddit_name):
        sub = reddit.subreddit(subreddit_name)
        posts = sub.search(self.query, limit=self.max_results)
    
        # Parsing the posts
        parsed_posts = []
        for post in posts:
            parsed_posts.append({
                "title": post.title,
                "review": post.selftext,
                "source": f"Reddit r/ {subreddit_name}",
                "date": datetime.fromtimestamp(post.created_utc),
                "url": f"https://reddit.com{post.permalink}",
                "upvotes": post.score,
                "comments": post.num_comments
            })
            
        df = pd.DataFrame(parsed_posts)
        
        return df
        
    def fetch_data(self):
        """
        Fetches reviews from a subreddit based on a query.
        
        Args:
            query (str): Keyword to search for (e.g., "iPhone").
            subreddit (str): Subreddit to search in (default: "productreview").
            limit (int): Maximum number of posts to fetch (default: 1000).
        
        Returns:
            List of dicts with post details.
        """
        # Initializing Reddit API client
        reddit = praw.Reddit(client_id=self.client_id, client_secret=self.client_secret, user_agent=self.user_agent)
        full_df = []
        
        for subreddit_name in self.subreddit:
            try:
                print(f"Fetching from r/{subreddit_name}")
                subreddit_name_df = self._fetch_from_subreddit(reddit, subreddit_name)
                #sorted_df = df.sort_values(by='upvotes', ascending=False).reset_index(drop=True)
                #slice_df = sorted_df[:int(percent*len(sorted_df))]
                full_df.append(subreddit_name_df)
            except Exception as e:
                print(f"Error fetching r/{subreddit_name}: {e}")
         
        if not full_df:
            return pd.DataFrame()

        all_df = pd.concat(full_df, ignore_index=True).drop_duplicates()
        return all_df

class YoutubeScraper(BaseScraper):
    def __init__(self, query, youtube_api_key, max_comments, sort_by, max_videos=10, top_n=5):
        super().__init__(query, max_videos)
        self.query = query
        self.youtube_api_key = youtube_api_key
        self.max_comments = max_comments
        self.top_n = top_n
        self.sort_by = sort_by
        self.youtube = self._get_youtube_service()

    def _get_youtube_service(self):
        return build('youtube', 'v3', developerKey=self.youtube_api_key)
    
    def _search_youtube_videos(self):
        # Step 2: Search for videos
        search_response = self.youtube.search().list(
            q=self.query,
            part="id",
            type="video",
            maxResults=self.max_results # because the Base attribute is set to max_results for max_videos object
        ).execute()
    
        video_ids = [item['id']['videoId'] for item in search_response['items']]
        
        if not video_ids:
            return pd.DataFrame()  # No results
        
        # Step 3: Get video stats
        videos_response = self.youtube.videos().list(
            part="snippet,statistics",
            id=",".join(video_ids)
        ).execute()
    
        # Step 4: Collect and sort by views
        video_data = []
        for item in videos_response['items']:
            stats = item['statistics']
            snippet = item['snippet']
            metric = int(stats.get(self.sort_by, 0)) if self.sort_by in stats else 0
            
            video_data.append({
                "video_id": item["id"],
                "title": snippet["title"],
                "views": int(stats.get("viewCount", 0)),
                "likes": int(stats.get("likeCount", 0)),
                "comments": int(stats.get("commentsCount", 0)),
                "published_at": snippet["publishedAt"],
                "sort_metric": metric
            })
    
        # Sort by view count and return top_n
        sorted_videos = sorted(video_data, key=lambda x: x["sort_metric"], reverse=True)
        video_details = [{'video_id':video['video_id'],'title':video['title']} for video in sorted_videos[:self.top_n]]
        
        return video_details    
        
    def _get_video_comments(self, vid, title):
        comments = []
        next_page_token = None
    
        while len(comments) < self.max_comments:
            response = self.youtube.commentThreads().list(
                part='snippet',
                videoId=vid,
                maxResults=min(100, self.max_comments - len(comments)),
                textFormat='plainText',
                pageToken=next_page_token
            ).execute()
    
            for item in response['items']:
                comment_data = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'Video_ID': vid,
                    'Video_title': title,
                    'Author': comment_data['authorDisplayName'],
                    'Text': comment_data['textDisplay'],
                    'PublishedAt': comment_data['publishedAt'],
                    'LikeCount': comment_data['likeCount']
                })
    
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
    
            time.sleep(0.2)  # avoid hitting API quota too fast
    
        return comments
    

    def fetch_data(self):
        video_details = self._search_youtube_videos()

        all_comments = []
        for video_detail in video_details:
            vid = video_detail['video_id']
            title = video_detail['title']
            print(f"Fetching comments for video: {vid} Title: {title}")
            all_comments.extend(self._get_video_comments(vid, title))

        return pd.DataFrame(all_comments)

In [69]:
api_key = "*******" # Enter youtube api key
youtube = YoutubeScraper("iphone 16 updates", youtube_api_key=api_key, max_comments=100, sort_by="likeCount")

In [72]:
youtube_df = youtube.fetch_data()

Fetching comments for video: qSCJCRyCTKY Title: Do this when you get an Iphone 16… #c#carterpcst#techt#techtoktechfacts #iphone16 #apple
Fetching comments for video: SLpBl0BdccU Title: Apple Intelligence BEST FEATURES! 🤯
Fetching comments for video: 1Yg5GOospcE Title: iPhone 16 / 16 Plus - TIPS, TRICKS & HIDDEN FEATURES!! + iOS 18
Fetching comments for video: GJDFn68yVT0 Title: The iPhone 16 Has a BIG Problem
Fetching comments for video: fVgSRA2RzBM Title: 4 Crazy New iPhone 16 Pro Features!


In [73]:
client_secret = "******" # Enter reddit developer client secret
client_id = "******" # Enter reddit developer client id
user_agent = "******" # Enter reddit developer user agent

reddit = RedditScraper(query="iphone 16 updates", client_id=client_id, client_secret=client_secret, user_agent=user_agent, subreddit=["productreview","technology","gadgets","photography","cameras"], max_results=500)

In [74]:
reddit_df = reddit.fetch_data()

Fetching from r/productreview
Fetching from r/technology
Fetching from r/gadgets
Fetching from r/photography
Fetching from r/cameras


##### Data Consolidation & Integration

In [76]:
youtube_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Video_ID     423 non-null    object
 1   Video_title  423 non-null    object
 2   Author       423 non-null    object
 3   Text         423 non-null    object
 4   PublishedAt  423 non-null    object
 5   LikeCount    423 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 20.0+ KB


In [77]:
youtube_df['Full_text'] = 'Video Title: ' + youtube_df['Video_title'] + '\n' +' | Comment:' + youtube_df['Text']

In [78]:
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 792 entries, 0 to 791
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   title     792 non-null    object        
 1   review    792 non-null    object        
 2   source    792 non-null    object        
 3   date      792 non-null    datetime64[ns]
 4   url       792 non-null    object        
 5   upvotes   792 non-null    int64         
 6   comments  792 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 43.4+ KB


In [79]:
social_media_corpus = []

text = pd.concat([youtube_df['Text'], reddit_df['review']])
for review in text:
    if len(review.split())>5:
        social_media_corpus.append(review)

In [81]:
social_media_corpus_compiled = ". ".join(social_media_corpus)

In [82]:
social_media_corpus_compiled



##### Cleaning and preprocessing the data

In [84]:
import re

In [85]:
### cleaning the corpus
def remove_emojis(text):
    emoji_pattern = re.compile(
    "[" 
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & Map symbols
        u"\U0001F1E0-\U0001F1FF"  # Flags
        u"\U00002500-\U00002BEF"  # Chinese characters + misc symbols
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # Dingbats
        u"\u3030"
        "]+",
    flags=re.UNICODE)

    return emoji_pattern.sub(r"", text)
    
def clean_corpus(text):

    """
    Comprehensive text cleaning function that is built to handle
    - Newlines and tabs
    - Irrelevant prefix and suffix (e.g skip comments)
    - Javascript snippets
    - URLs
    - Special characters and excessive whitespace
    - Short sentences
    """
    if not isinstance(text,str):
        return ""

    # Remove emojis
    text = remove_emojis(text)

    
    # Remove newlines, tabs, and excessive whitespaces
    text = " ".join(text.split())

    # Removing javascript snippets and HTML tags
    text = re.sub(r'{.*?}', "", text)
    text = re.sub(r'href.*?\)', "", text)
    text = re.sub(r'<.*?>', "", text)

    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove special characters
    text = re.sub(r"""[^\w\s.,;:!?'"-]""", '', text)

    # Remove standalone single/double quotes
    text = re.sub(r'\s[\'"]\s',' ', text)

    # Remove trailing/leading whitespaces
    text = text.strip()

    return text

def refine_corpus(corpus, min_word_length=5):

    """
    Refining corpus by splitting into robust sentences, applying comprehensive text cleaning and filtering the corpus length
    """

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', corpus)

    cleaned_reviews = []
    for sentence in sentences:
        # Cleaning corpus
        clean_sent = clean_corpus(sentence)

        if len(clean_sent.split()) >= min_word_length:
            # Ensuring sentences end with a proper punctuation
            if not clean_sent.endswith(('.','?','!')):
                clean_sent +='.'
            cleaned_reviews.append(clean_sent)    
    
    return  cleaned_reviews

##### Social Media Data Cleaning

In [87]:
sm_processed_corpus = refine_corpus(social_media_corpus_compiled)

In [88]:
sm_processed_corpus[:5]

['Because people want it I can sell it.',
 'Put the phone in the trash as well.',
 "The only problem is that the trash can isn't going to just spit up 1200..",
 'I hold on to it for either storage for when I upgrade, or if I want to sell it..',
 'I have 7 iPhone boxes Carter computers.']

##### Amazon Data Processing

In [93]:
amzn_2023.head()
#amzn_2015.head()

Unnamed: 0,Product_context,Full_review
0,"Product_Details: {""Product Dimensions"": ""6 x 4...",Headline:Generic MC0148 Cell Phone Case for iP...
1,"Product_Details: {""Package Dimensions"": ""7.56 ...",Headline:Nokia CC-1005 Skin for N8 - Blue\nDet...
2,"Product_Details: {""Product Dimensions"": ""1.6 x...",
3,"Product_Details: {""Product Dimensions"": ""5.7 x...",Headline:Cell Phone Cover - Slim Fit - Hard Sh...
4,"Product_Details: {""Product Dimensions"": ""4.7 x...",Headline:J-Plus Tempered Glass Screen Protecto...


In [95]:
def process_data(data):
    
    processed_review_amzn = []
    processed_prod_amzn = []
    
    for review in data["Full_review"].dropna():
    
        #if isinstance(review, np.ndarray) and len(review)>0:
            #review = review[0]
            
        if isinstance(review, str) and len(review.split())>10:
            processed_review_amzn.append(review)  
    
    for review in data["Product_context"].dropna():
    
        #if isinstance(review, np.ndarray) and len(review)>0:
            #review = review[0]
            
        if isinstance(review, str) and len(review.split())>10:
            processed_prod_amzn.append(review)  
    
    amzn_review_compiled = ". ".join(processed_review_amzn)
    amzn_prod_compiled = ". ".join(processed_prod_amzn)
    
    cleaned_amzn_review = refine_corpus(amzn_review_compiled)
    cleaned_amzn_prod = refine_corpus(amzn_prod_compiled)

    return cleaned_amzn_review, cleaned_amzn_prod

In [97]:
amzn_review_data_2023, amzn_prod_data_2023 = process_data(amzn_2023)

In [98]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openai

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
#from langchain.output_parsers import OutputParserException

##### Modelling

In [100]:
# Response schema for social media sentiment
response_schemas_1 = [
    ResponseSchema(name="Social media sentiment", description='3 bullet points summarizing major highlights about this product'),
    ResponseSchema(name="Sentiment scores (%)", description='Positive-score|Negative-score|Neutral-score')   
]
output_parser_1 = StructuredOutputParser.from_response_schemas(response_schemas_1)
format_instructions_1 = output_parser_1.get_format_instructions()

# Response schema for amazon review sentiment
response_schemas_2 = [
    ResponseSchema(name='Amazon review sentiment', description='3 bullet points summarizing major highlights about this product'),
    ResponseSchema(name="Sentiment scores (%)", description='Positive-score|Negative-score|Neutral-score'),    
]
output_parser_2 = StructuredOutputParser.from_response_schemas(response_schemas_2)
format_instructions_2 = output_parser_2.get_format_instructions()

# Final recommendation schema
response_schemas_3 = [
    ResponseSchema(name="Recommendation", description='3 detailed recommendations for minimalist, balanced and maximalist preferences')]

output_parser_3 = StructuredOutputParser.from_response_schemas(response_schemas_3)
format_instructions_3 = output_parser_3.get_format_instructions()

In [186]:
openai_key = "*******" # Enter your openai key

class ProductReview:
    def __init__(self, query, sub_query,
                 amzn_reviews, amzn_prod_details,
                 state = ["social_media", "amazon", "final"], 
                 format_instructions_1=format_instructions_1, format_instructions_2=format_instructions_2, format_instructions_3=format_instructions_3, 
                 openai_key=openai_key, 
                 output_parser_1=output_parser_1, output_parser_2=output_parser_2, output_parser_3=output_parser_3,
                sm_data=sm_processed_corpus):
        
        self.query = query
        self.sub_query = sub_query
        self.state = state
        self.format_instructions_1 = format_instructions_1
        self.format_instructions_2 = format_instructions_2
        self.format_instructions_3 = format_instructions_3
        self.openai_key = openai_key
        self.output_parser_1 = output_parser_1
        self.output_parser_2 = output_parser_2
        self.output_parser_3 = output_parser_3
        self.sm_data = sm_data
        self.amzn_reviews = amzn_reviews
        self.amzn_prod_details = amzn_prod_details

    def _create_search_index(self, full_text):
        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        chunks = text_splitter.split_text(full_text)
    
        # creating searchable index
        embeddings = OpenAIEmbeddings(openai_api_key=self.openai_key)
        print("Embeddings vector created..")
        return FAISS.from_texts(chunks, embeddings)   
    
    def _analyze_text(self, relevant_text, state, ground_query):
        #client = openai.OpenAI(api_key=openai_key)
        llm = ChatOpenAI(
            model="gpt-3.5-turbo",
            temperature=0.3,
            openai_api_key=self.openai_key  # Pass key directly or use environment variable
        )
        if state == "social_media":
            # initiate llm model
            prompt = ChatPromptTemplate.from_template(
            """
            You are a product sentiment analysis assistant. A user is researching products related to this enquiry "{query}" .
            
            Your task:
            1. Analyze the social media corpus below.
            2. Summarize **3 grounded, specific observations** expressed by users related to product performance, experience, or satisfaction.
            3. Provide sentiment scores in the format: "Positive|Negative|Neutral" (no % symbol).
            
            Return the output strictly as valid JSON, with keys:
            - "Social media sentiment": list of 3 bullet points
            - "Sentiment scores (%)": string in format "##|##|##"
            
            Example:
            {{
              "Social media sentiment": [
                "- Gamers appreciate the high frame rates and thermal management",
                "- Several users report inconsistent GPU driver updates",
                "- Users say build quality feels premium and durable"
              ],
              "Sentiment scores (%)": "65(+)|20(-)|15"
            }}
            
            Output only valid JSON. Do not include commentary or formatting hints.
            
            Corpus:
            \"\"\"
            {text}
            \"\"\"
            """)
            
            print("Modelling Social Media Sentiment...")
            fmt = self.format_instructions_1
            parser = self.output_parser_1
        
        elif state == "amazon":
           prompt = ChatPromptTemplate.from_template(
            """
            You are a product sentiment analysis assistant. A user is researching products related to this enquiry "{query}".
            
            Your task:
            1. Analyze verified **Amazon product reviews**.
            2. Summarize 3 **key insights**, focused on product **performance, quality, satisfaction, or drawbacks** relevant to the query.
            3. Provide sentiment scores in the format: "Positive|Negative|Neutral" (no % symbol).
            
            Return the output strictly as valid JSON with:
            - "Amazon review sentiment": list of 3 grounded bullet points
            - "Sentiment scores (%)": string like "##|##|##"
            
            Example:
            {{
              "Amazon review sentiment": [
                "- Customers report the laptop handles AAA games smoothly at high settings",
                "- Complaints about overheating after prolonged gaming sessions",
                "- Battery life praised, lasting 6–8 hours during mixed usage"
              ],
              "Sentiment scores (%)": "70(+)|20(-)|10"
            }}
            
            Only output a valid JSON object.
            
            Corpus:
            \"\"\"
            {text}
            \"\"\"
            """)

           print("Modelling Amazon Review Sentiment...")
    
           fmt = self.format_instructions_2
           parser = self.output_parser_2
            
        
        elif state == "final":
           prompt = ChatPromptTemplate.from_template(
           """
            You are a product recommendation assistant. Return ONLY a valid JSON object.
        
            Task:
            Analyze the product corpus below (regarding "{query}") and provide a recommendation based on the user’s need: "{initial_query}".
        
            Requirements:
            - Extract the most relevant products from the corpus.
            - Justify each product choice using specific advantages or drawbacks mentioned in the corpus.
            - Tailor each recommendation to suit the user's intent (e.g., gaming, travel, durability, etc.).
            - Avoid repeating exact review lines — synthesize meaningful insights.
            - Limit recommendations to 2–3 options max, with 1–2 concise, informative sentences each.
        
            Output Format:
            {{
                "Recommendation": [
                    "- [Product Name] is suitable because [brief explanation based on corpus].",
                    "- [Optional Alternative Product] is also a good fit due to [reason]."
                ]
            }}
        
            Output MUST match the format exactly and be parseable by `json.loads()`.
        
            Corpus:
            \"\"\"
            {text}
            \"\"\"
            """
           )
            
           print("Modelling Final Recommendation...")
           fmt = self.format_instructions_3
           parser = self.output_parser_3
        
        else:
            print("Error: Enter valid stage staus")
        
        messages = prompt.format_messages(
                query = self.sub_query,
                initial_query = ground_query,
                text = relevant_text,
                format_instructions=fmt
            )
        
        response = llm(messages)
        raw_output = response.content
        cleaned_output = re.sub(r'```json|```', '', raw_output).strip()
        print("Result Generation Completed \nResult:\nNow Parsing...")

        try:
            parsed_result = parser.parse(raw_output)
            print("Parsing completed")
            print("--------------------------------------------------------------------------------------------------------------------\n")
            return parsed_result
        except Exception as e:
            print("Failed to parse:", cleaned_output)
            raise
            
    def _analyze_with_semantic_search(self, full_texts, state, ground_query=None):
        
        #full_texts = " ".join(full_texts) if isinstance(full_texts, list) else full_texts
    
        # retreiving relevant chunks
        #using lightweight BM25 pre-filter
        bm25_retriever = BM25Retriever.from_texts(full_texts)
        top_k_docs = bm25_retriever.get_relevant_documents(self.query, k=1000)
        reduced_corpus = [doc.page_content for doc in top_k_docs]
        print("--------------------------------------------------------------------------------------------------------------------")
        print("First layer BM25 Retriever Corpus Reduction Complete")
        # creating vector index on full corpus
        reduced_text = " ".join(reduced_corpus)
        index = self._create_search_index(reduced_text)
        
        faiss_retriever = index.as_retriever()
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, faiss_retriever], 
            weights=[0.3,0.7]
        )
        
        relevant_docs = ensemble_retriever.get_relevant_documents(self.sub_query)
        relevant_text = " ".join([doc.page_content for doc in relevant_docs])
        print("Semantic Search Completed")
        print(f"Total number of relevant text: {len(relevant_text)}\n")
        
        return self._analyze_text(relevant_text, state, ground_query)

    def run_models(self):
        result_1 = self._analyze_with_semantic_search(self.sm_data, state=self.state[0])
        result_2 = self._analyze_with_semantic_search(self.amzn_reviews, state=self.state[1])
    
        sentiment_summary = {**result_1, **result_2}

        # Transforming into a textual query
        sentiment_query = " ".join([f"{k}: {v}" for k, v in sentiment_summary.items()])

        # Temporarily override self.query to use summary as query
        old_query = self.sub_query
        self.sub_query = sentiment_query
        result_3 = self._analyze_with_semantic_search(self.amzn_prod_details, state=self.state[2], ground_query=old_query)
        self.sub_query = old_query
        
        final_result = {**sentiment_summary, **result_3}
    
        return final_result
        print("--------------------------------------------------------------------------------------------------------------------")

    def evaluate_model(self, ground_truth=None):
        results = self.run_models()

        assert "Social media sentiment" in results
        assert "Amazon review sentiment" in results
        assert "Recommendation" in results

        if ground_truth:
            from sklearn.model_selection import classification_report

            pred_labels = self._convert_sentiment_to_labels(results)
            true_labels = ground_truth

            return classification_report(pred_labels, true_labels)
        
        return "Evaluation passed basic checks"    

##### Use cases

##### 2015 Data

In [179]:
product_pipeline = ProductReview("Best camera option for a big event", amzn_reviews=amzn_review_data, amzn_prod_details=amzn_prod_data)

In [180]:
product_pipeline.run_models()

First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 809

Modelling Social Media Sentiment...
Result Generation Completed 
Result: {
    "Social media sentiment": [
        "- The user is looking for a camera that can take quality photos and edit raw files.",
        "- They are considering between buying a camera or an iPhone 16 pro max.",
        "- The user wants a camera that is better than what iPhone offers."
    ],
    "Sentiment scores (%)": "0|0|100"
}
Now Parsing...
Parsing completed
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 693

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result: {
    "Amazon review sentiment": [
        "- Outstanding IQ on the camera",
        "- Great for event shooting",
        "- Fantastic gift for special occasions"
    ],
    "Sentiment scores (%)": 

{'Social media sentiment': ['- The user is looking for a camera that can take quality photos and edit raw files.',
  '- They are considering between buying a camera or an iPhone 16 pro max.',
  '- The user wants a camera that is better than what iPhone offers.'],
 'Sentiment scores (%)': '70|0|30',
 'Amazon review sentiment': ['- Outstanding IQ on the camera',
  '- Great for event shooting',
  '- Fantastic gift for special occasions'],
 'Recommendation': 'Consider purchasing the Sony SRS-33 speaker for its outstanding sound quality and subtle design. It is a great compromise between clunky headphones and earbuds, offering superior sound under $200. The only downside is the average battery life, but overall highly recommended for its audio performance.'}

In [184]:
product_pipeline = ProductReview("Looking for a good phone with quality camera, storage and processor", amzn_reviews=amzn_review_data, amzn_prod_details=amzn_prod_data)

In [186]:
product_pipeline.run_models()

First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 897

Modelling Social Media Sentiment...
Result Generation Completed 
Result: {
    "Social media sentiment": [
        "- Looking for a relatively cheap camera with clear quality for travel photos and posts.",
        "- Sometimes unable to connect to the phone via wi-fi.",
        "- Looking for suggestions for a new camera and how to take good sky pics."
    ],
    "Sentiment scores (%)": "0|0|100"
}
Now Parsing...
Parsing completed
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 815

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result: {
    "Amazon review sentiment": [
        "- Good quality sound on inexpensive headphone",
        "- Very fast processor with LGA2011 socket",
        "- Comes with good quality storage bag and instructi

{'Social media sentiment': ['- Looking for a relatively cheap camera with clear quality for travel photos and posts.',
  '- Sometimes unable to connect to the phone via wi-fi.',
  '- Looking for suggestions for a new camera and how to take good sky pics.'],
 'Sentiment scores (%)': '70|0|30',
 'Amazon review sentiment': ['- Good quality sound on inexpensive headphone',
  '- Very fast processor with LGA2011 socket',
  '- Comes with good quality storage bag and instruction sheet'],
 'Recommendation': 'Consider purchasing the Sony DSXS300BTX digital media receiver for good quality sound and Bluetooth connectivity'}

In [188]:
product_pipeline = ProductReview("Should I get an iphone 6 or something better",amzn_reviews=amzn_review_data, amzn_prod_details=amzn_prod_data)

In [190]:
product_pipeline.run_models()

First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 615

Modelling Social Media Sentiment...
Result Generation Completed 
Result: {
    "Social media sentiment": [
        "- Excited about honeymoon in Europe",
        "- Considering upgrading to iPhone 13 Pro Max",
        "- Unsure about camera options for photography"
    ],
    "Sentiment scores (%)": "50|10|40"
}
Now Parsing...
Parsing completed
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 587

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result: {
    "Amazon review sentiment": [
        "- Convenient to connect to iPhone or laptop",
        "- Better for using on the job than an iPhone or iPod touch",
        "- Some issues with fitting in an iPhone 6 lifeproof case port"
    ],
    "Sentiment scores (%)": "60|10|30"
}
Now Parsing..

{'Social media sentiment': ['- Excited about honeymoon in Europe',
  '- Considering upgrading to iPhone 13 Pro Max',
  '- Unsure about camera options for photography'],
 'Sentiment scores (%)': '60|10|30',
 'Amazon review sentiment': ['- Convenient to connect to iPhone or laptop',
  '- Better for using on the job than an iPhone or iPod touch',
  '- Some issues with fitting in an iPhone 6 lifeproof case port'],
 'Recommendation': 'Upgrade to iPhone 13 Pro Max for better camera options and connectivity'}

##### 2023 Data

In [173]:
query = "Laptop"
sub_query = "What kind of laptop should I buy for someone interested in high processing power and graphics for gaming"
product_pipeline = ProductReview(query, sub_query, amzn_reviews=amzn_review_data_2023, amzn_prod_details=amzn_prod_data_2023)

In [175]:
product_pipeline.run_models()

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 1264

Modelling Social Media Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 1128

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

-----------------------------

{'Social media sentiment': ['- Users appreciate the high processing power and RAM of the 16 Pro for gaming',
  "- Some users are frustrated with the laptop's inability to accept certain flash cards",
  '- There are conflicting explanations from representatives regarding optical zoom capabilities'],
 'Sentiment scores (%)': '60|30|10',
 'Amazon review sentiment': ['- The laptop offers high processing power and graphics performance, ideal for gaming enthusiasts',
  '- Users praise the GPU Turbo feature for boosting graphics efficiency and stability during gaming',
  '- Some customers mention concerns about overheating during prolonged gaming sessions'],
 'Recommendation': ['- Surface Laptop Studio is suitable because it offers high processing power and graphics performance, ideal for gaming enthusiasts.',
  '- HP Envy X360 is also a good fit due to its powerful GPU Turbo feature for boosting graphics efficiency and stability during gaming.']}

In [176]:
query = "Digital Camera"
sub_query = "Looking for a techy event camera (not mobile) that I can sync nicely with other gadgets"
product_pipeline = ProductReview(query, sub_query, amzn_reviews=amzn_review_data_2023, amzn_prod_details=amzn_prod_data_2023)

In [177]:
product_pipeline.run_models()

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 465

Modelling Social Media Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 2164

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

------------------------------

{'Social media sentiment': ['- Users appreciate the digital stabilization feature for clear quality travel photos',
  '- Users are looking for a durable camera that can withstand day trips and nights out',
  '- Users desire a camera that can handle sunlight well'],
 'Sentiment scores (%)': '60|20|20',
 'Amazon review sentiment': ['- Customers praise the Panasonic LUMIX DMC-TS25 Digital Camera for its durability and waterproof features',
  "- Some users mention issues with the camera's image quality in low light conditions",
  '- The compact camera case receives positive feedback for its protective design and included hand strap'],
 'Recommendation': ['- The Panasonic LUMIX DMC-TS25 Digital Camera is a great fit for your needs as it offers durability and waterproof features, making it suitable for techy events and syncing with other gadgets.',
  '- Another option to consider is the Samsung Galaxy S23 Ultra, known for its high-quality photographic capabilities and compatibility with othe

In [178]:
query = "Charger"
sub_query = "Super fast charger"
product_pipeline = ProductReview(query, sub_query, amzn_reviews=amzn_review_data_2023, amzn_prod_details=amzn_prod_data_2023)

In [179]:
product_pipeline.run_models()

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 1017

Modelling Social Media Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------
First layer BM25 Retriever Corpus Reduction Complete
Embeddings vector created..
Semantic Search Completed
Total number of relevant text: 4918

Modelling Amazon Review Sentiment...
Result Generation Completed 
Result:
Now Parsing...
Parsing completed
--------------------------------------------------------------------------------------------------------------------

-----------------------------

{'Social media sentiment': ["- Users praise the fast charging capabilities of Ugreen's Nexode 140W Charger",
  '- Some users express excitement about the innovative features of ESR MagSafe accessories',
  '- One user mentions being new to using non-iPhone devices and questions the speed of their USB 2 reader'],
 'Sentiment scores (%)': '80|10|10',
 'Amazon review sentiment': ['- Customers praise the BoxWave Charger for its rapid-charging capabilities, allowing for quick use of devices',
  '- The charger is commended for its compatibility with both North American and international voltage standards, making it ideal for travelers',
  '- Users appreciate the green LED indicator that ensures proper charging of devices'],
 'Recommendation': ["- The Ugreen's Nexode 140W Charger is a great choice for a super fast charger, as users praise its fast charging capabilities.",
  '- The BoxWave Charger is another excellent option known for its rapid-charging capabilities and compatibility with inter

----------------------------